# Reading Montserrat CSV catalog into DataFrame
---

**Last update:** 07/2021 - Glenn THOMPSON  
**Contact:** thompsong@usf.edu   
---

In [1]:
import os
import pandas as pd
import pickle
import numpy as np


# Change if you want your screen to keep quiet
# 0 = quiet
# 1 = in between
# 2 = detailed information
verbatim = 2

### PREPARE THE CATALOG DataFrame ###

PROJECTDIR = os.path.join(os.getenv('HOME'),'src', 'kitchensinkGT', 'PROJECTS', 'MontserratML')
csvfile_external = os.path.join(PROJECTDIR, 'MVO_labelled_events.csv')
csvfile_internal = './catalog/MVO_labelled_events_filtered.csv'

# copy the catalog CSV file from external to internal directory
os.system('cp %s %s' % (csvfile_external, csvfile_internal))

# read the catalog
cat = pd.read_csv(csvfile_external)

# count by (sub)class
print('%d events before filtering' % len(cat.index))
cat['class'].value_counts()

# fix f0 and f1 columns - not really sure what this does
#cat['f0'] = cat.apply(lambda x:eval(x['f0']), axis=1)
#cat['f1'] = cat.apply(lambda x:eval(x['f1']), axis=1)
cat['f0']=None
cat['f1']=None

# List of traceID occurrences
frames = []
for i,row in cat.iterrows():
    
    # change WAV path to picklepath
    picklepath = row['path'].replace('./WAV','eventFiles')+'.pickle'
    if not os.path.exists(picklepath):
        print('file not found :', picklefile)
        break
    cat.loc[i, 'path'] = picklepath
    
    # load trace CSV file
    tracecsv = picklepath.replace('.pickle','.csv')
    tracedf = pd.read_csv(tracecsv)
    tracedf['filetime'] = row['filetime']
    frames.append(tracedf)

# stitch all the trace CSV files together    
alltraces = pd.concat(frames, sort=True)
alltraces.to_csv('alltraceDFs.csv')

#alltraces.set_index('filetime', inplace=True) # we will need this later to remerge
#alltraces.sort_index(inplace=True)
print(alltraces['id'].value_counts())

# save the catalog to CSV and pickle file
cat.to_csv(csvfile_internal)
output_path_cat = csvfile_internal.replace('.csv', '.pd')
pickle.dump(cat, open(output_path_cat,'wb'))


522 events before filtering
MV.MBWH..SHZ     517
MV.MBLG..SHZ     474
MV.MBRY..SHZ     452
MV.MBGB..SHN     441
MV.MBGB..SHZ     436
MV.MBGB..SHE     432
MV.MBGH..SHZ     431
MV.MBGH..SHE     429
MV.MBGH..SHN     423
MV.MBGA..SHE     338
MV.MBGA..SHZ     337
MV.MBGA..SHN     335
MV.MBGE..SHN     334
MV.MBGE..SHE     333
MV.MBGE..SHZ     333
MV.MBBE..SHE     328
MV.MBBE..SHZ     326
MV.MBBE..SHN     325
MV.MBSS..SHZ     143
MV.MBRY..BHN     142
MV.MBRY..SHN     118
MV.MBRY..SHE     116
MV.MBMH..SHZ     114
MV.MBWH..BHN     103
MV.MBLG..BHN      97
MV.MBBY..SHZ      60
MV.MBBY..SHE      57
MV.MBBY..SHN      56
MV.MBMH..SHN      48
MV.MBRY..BHZ      44
MV.MBRY..BHE      44
MV.MBGB..BHN      43
MV.MBGB..BHZ      43
MV.MBGB..BHE      43
MV.MBLG.E.BDF     38
MV.MBMH..SHE      34
MV.MBGH..BHE      26
MV.MBGH..BHN      26
MV.MBGH..BHZ      26
MV.MBLG.S.BDF     18
MV.MBUN..SHZ       5
Name: id, dtype: int64


In [2]:
################################
# Machine learning and testing #
################################
import sys
sys.path.insert(0, '../automatic_processing')
#import tools
from config import Config
from analyzer import Analyzer

# Change if you want your screen to keep quiet
# 0 = quiet
# 1 = in between
# 2 = detailed information
verbatim = 2

# Init project with configuration file
config = Config('../config/general/newsettings_10.json', verbatim=verbatim)
config.readAndCheck()  

##########################
# Variables to loop over #
##########################

traceIDs = ['MV.MBWH..SHZ', 'MV.MBLG..SHZ', 'MV.MBRY..SHZ']
minWeights = range(4)
classes_to_include = [ ['l', 't'], ['e', 'r'], ['h', 'l', 't'], ['h', 'l', 't', 'r'], ['e', 'h', 'l', 't', 'r'] ]
"""
traceIDs = ['MV.MBWH..SHZ']
minWeights = range(1)
classes_to_include = [ ['e', 'h', 'l', 't', 'r'] ]
"""

#############
# Functions #
#############

def cat_filter_traceID(cat, alltraces, traceID):
    # subset catalog based on traceID
    matchingEvents = alltraces[alltraces['id']==traceID]
    for i, row in cat.iterrows():
        matching_indices = matchingEvents.index[matchingEvents['filetime']==row['filetime']].tolist()
        if len(matching_indices)==1:
            pass
        else:
            cat.drop(i, inplace=True)
    N = len(cat.index)
    #print('%d events after matching against traceID' % N)
    return N

def cat_filter_classes(cat, remove_classes):
    """
    for rmclass in remove_classes:
        print('Removing %s' % rmclass)
        cat = cat[cat['class']!=rmclass]
    """
    cat=cat[cat["class"].isin(remove_classes)]
    N = len(cat.index)
    #print('%d events after removing classes' % N)
    return cat, N

def cat_filter_weight(cat, minWeight):
    #if minWeight>0:
    #    cat = cat[cat['weight']>=minWeight]
    cat=cat[cat["weight"].isin(range(minWeight,13))]
    N = len(cat.index)
    #print('%d events after filtering above %d' % (N, minWeight))
    return cat, N

def cat_check_numbers(cat, minthresh = 20):
    df = cat.copy()
    tooSmall = False
    lengths = []
    for subclass in df['class'].unique():
        dfs = df[df['class']==subclass]
        N = len(dfs.index)
        if N<minthresh:
            tooSmall=True
        lengths.append(N)
    return tooSmall, lengths


#######################
# Looping starts here #
#######################
minPerClass = 30
counter = 0
results_list = []
for traceID in traceIDs:
    
    # what traceID are we looking for - read_montserrat needs this - should figure out how to write this into the config
    fptr = open('current_traceID.txt','w')
    fptr.write(traceID)
    fptr.close()
    
    for minWeight in minWeights:
        for include_classes in classes_to_include:
            # reload cat because we filter it down each time
            cat = pickle.load(open(output_path_cat,'rb'))
            #print(cat['class'].value_counts())
            print(traceID, include_classes, minWeight)
            
            results_dict = {}
            results_dict['traceID'] = traceID
            results_dict['classes'] = ','.join(include_classes)
            results_dict['minWeight'] = minWeight
            results_dict['NtraceID'] = cat_filter_traceID(cat, alltraces, traceID)
            cat, N = cat_filter_classes(cat, include_classes) 
            results_dict['Nclasses'] = N
            cat, N = cat_filter_weight(cat, minWeight)
            results_dict['Nweight'] = N
            tooSmall, lengths = cat_check_numbers(cat, minPerClass)
            results_dict['counts'] = str(lengths)
            
            results_dict['acc_mean'] = None
            results_dict['acc_std'] = None
            
            if N>=minPerClass*len(include_classes) and not tooSmall:
                #try:
                    print(cat.groupby('class').size())
                    analyzer = Analyzer(config, verbatim=verbatim, catalog=cat)
                    allData, allLabels, acc = analyzer.learn(config, returnData=True) # If you want the data
                    results_dict['acc_mean'] = np.round(np.mean(acc)*100, 1)
                    results_dict['acc_std'] = np.round(np.std(acc)*100, 1)                        
                    cat['predicted_class'] = allLabels
                    cat.to_csv(csvfile_internal.replace('.csv','_predicted_%d.csv' % counter)) 
            results_list.append(results_dict)
            counter += 1
            
resultsDF = pd.DataFrame(results_list)
resultsDF.to_csv('results.csv')
print('Done')





/Users/thompsong/src/AAA-master/config/specific/usecase1_continuous_classification/usecase1_EXAMPLE.json
Welcome to this automatic analysis architecture
Copyright: Marielle MALFANTE - GIPSA-Lab
Univ. Grenoble Alpes, CNRS, Grenoble INP, GIPSA-lab, 38000 Grenoble, France

 *** PROJECT CONFIGURATION 10 ***  
Configuration object from <path> ../config/general/newsettings_10.json, <configuration_number> 10,
 <general> {'project_root': '/Users/thompsong/src/AAA-master/', 'analysis_type': 'continuous', 'path_to_specific_settings_file': 'config/specific/usecase1_continuous_classification/usecase1_EXAMPLE.json', 'path_to_res': 'res/', 'path_to_visuals': 'fig/', 'path_to_res_to_review': 'res_to_review/'},
 <application> {'name': 'montserrat'},
 <preprocessing> {'energy_norm': True},
 <learning> {'algo': RandomForestClassifier(criterion='entropy'), 'cv': StratifiedShuffleSplit(n_splits=50, random_state=None, test_size=0.5,
            train_size=0.5), 'path_to_catalogue': 'catalog/MVO_labelled_ev

Cross-validation results:  77.0588235294  +/-  3.50755761176  %
          Predicted class
              e     r 
        e    36    11 
        r    12    43 
MV.MBWH..SHZ ['h', 'l', 't'] 1
class
h    106
l    106
t    102
dtype: int64
/Users/thompsong/src/AAA-master/MONTSERRAT/catalog/MVO_labelled_events_filtered.pd


 *** ANALYZER ***
Training data have been read and features have been extracted  (314, 120)
Computation time:  21.31330108642578
Model will be trained on 3 classes [0 1 2] ['h' 'l' 't']
Features have been scaled
Model has been trained:  RandomForestClassifier(criterion='entropy')
Computation time:  0.15529298782348633
Model score is:  1.0
and associated confusion matrix is:
          Predicted class
              h     l     t 
        h   106             
        l         106       
        t               102 
StratifiedShuffleSplit(n_splits=50, random_state=None, test_size=0.5,
            train_size=0.5)
Cross-validation results:  84.3694267516  +/-  2.35655017673  

Cross-validation results:  76.1069958848  +/-  1.8263632265  %
          Predicted class
              e     h     l     r     t 
        e    33     0     3     7       
        h     1    37     5     2     7 
        l     2     2    44     2       
        r    11     5     3    26     1 
        t     0     5           1    44 
MV.MBWH..SHZ ['l', 't'] 3
class
l    92
t    99
dtype: int64
/Users/thompsong/src/AAA-master/MONTSERRAT/catalog/MVO_labelled_events_filtered.pd


 *** ANALYZER ***
Training data have been read and features have been extracted  (191, 120)
Computation time:  11.84165620803833
Model will be trained on 2 classes [0 1] ['l' 't']
Features have been scaled
Model has been trained:  RandomForestClassifier(criterion='entropy')
Computation time:  0.08786940574645996
Model score is:  1.0
and associated confusion matrix is:
          Predicted class
              l     t 
        l    92       
        t          99 
StratifiedShuffleSplit(n_splits=50, random_state=None

Training data have been read and features have been extracted  (390, 120)
Computation time:  25.562803983688354
Model will be trained on 4 classes [0 1 2 3] ['h' 'l' 'r' 't']
Features have been scaled
Model has been trained:  RandomForestClassifier(criterion='entropy')
Computation time:  0.19550204277038574
Model score is:  1.0
and associated confusion matrix is:
          Predicted class
              h     l     r     t 
        h   100                   
        l         105             
        r               102       
        t                      83 
StratifiedShuffleSplit(n_splits=50, random_state=None, test_size=0.5,
            train_size=0.5)
Cross-validation results:  85.2615384615  +/-  2.42018685332  %
          Predicted class
              h     l     r     t 
        h    39     4     2     6 
        l     1    47     5       
        r     3     2    44     1 
        t     4     0     1    37 
MV.MBLG..SHZ ['e', 'h', 'l', 't', 'r'] 0
class
e     84
h    100
l    

Training data have been read and features have been extracted  (164, 120)
Computation time:  12.389270782470703
Model will be trained on 2 classes [0 1] ['e' 'r']
Features have been scaled
Model has been trained:  RandomForestClassifier(criterion='entropy')
Computation time:  0.12084102630615234
Model score is:  1.0
and associated confusion matrix is:
          Predicted class
              e     r 
        e    78       
        r          86 
StratifiedShuffleSplit(n_splits=50, random_state=None, test_size=0.5,
            train_size=0.5)
Cross-validation results:  77.4390243902  +/-  4.07579877246  %
          Predicted class
              e     r 
        e    32     8 
        r    11    32 
MV.MBLG..SHZ ['h', 'l', 't'] 2
class
h    98
l    98
t    82
dtype: int64
/Users/thompsong/src/AAA-master/MONTSERRAT/catalog/MVO_labelled_events_filtered.pd


 *** ANALYZER ***
Training data have been read and features have been extracted  (278, 120)
Computation time:  19.31305193901062
Model 

Model has been trained:  RandomForestClassifier(criterion='entropy')
Computation time:  0.23320913314819336
Model score is:  1.0
and associated confusion matrix is:
          Predicted class
              e     h     l     r     t 
        e    74                         
        h          98                   
        l                91             
        r                      75       
        t                            80 
StratifiedShuffleSplit(n_splits=50, random_state=None, test_size=0.5,
            train_size=0.5)
Cross-validation results:  79.7894736842  +/-  2.3215555563  %
          Predicted class
              e     h     l     r     t 
        e    31           1     5       
        h     0    39     4     1     5 
        l     1     1    41     2       
        r    10     3     3    20     1 
        t     0     4     0     0    36 
MV.MBRY..SHZ ['l', 't'] 0
class
l    89
t    86
dtype: int64
/Users/thompsong/src/AAA-master/MONTSERRAT/catalog/MVO_labelled_event

Cross-validation results:  87.4637681159  +/-  2.08640290564  %
          Predicted class
              h     l     t 
        h    41     5     4 
        l     4    41     0 
        t     4          39 
MV.MBRY..SHZ ['h', 'l', 't', 'r'] 1
class
h    100
l     89
r     94
t     86
dtype: int64
/Users/thompsong/src/AAA-master/MONTSERRAT/catalog/MVO_labelled_events_filtered.pd


 *** ANALYZER ***
Training data have been read and features have been extracted  (369, 120)
Computation time:  25.464305877685547
Model will be trained on 4 classes [0 1 2 3] ['h' 'l' 'r' 't']
Features have been scaled
Model has been trained:  RandomForestClassifier(criterion='entropy')
Computation time:  0.20598602294921875
Model score is:  1.0
and associated confusion matrix is:
          Predicted class
              h     l     r     t 
        h   100                   
        l          89             
        r                94       
        t                      86 
StratifiedShuffleSplit(n_splits=5

Cross-validation results:  97.675  +/-  1.56144964696  %
          Predicted class
              l     t 
        l    38     0 
        t     2    40 
MV.MBRY..SHZ ['e', 'r'] 3
class
e    76
r    71
dtype: int64
/Users/thompsong/src/AAA-master/MONTSERRAT/catalog/MVO_labelled_events_filtered.pd


 *** ANALYZER ***
Training data have been read and features have been extracted  (147, 120)
Computation time:  11.062157154083252
Model will be trained on 2 classes [0 1] ['e' 'r']
Features have been scaled
Model has been trained:  RandomForestClassifier(criterion='entropy')
Computation time:  0.13945698738098145
Model score is:  1.0
and associated confusion matrix is:
          Predicted class
              e     r 
        e    76       
        r          71 
StratifiedShuffleSplit(n_splits=50, random_state=None, test_size=0.5,
            train_size=0.5)
Cross-validation results:  76.1891891892  +/-  3.37448424217  %
          Predicted class
              e     r 
        e    34     4 
 