# Montserrat event selector for Machine Learning
The aim of this code is to find the best N events of each type, and create a corresponding CSV file and data structure for entry into Alexis' and Marielle's AAA codes.

In [None]:
#!/usr/bin/env python

## SCAFFOLD. It keeps giving me the same Regional over and over.
# Is the CSV file corrupt? Or is there a bug in the code?

import os
from glob import glob
import pandas as pd
import numpy as np
import sys
LIBpath = os.path.join( os.getenv('HOME'),'src','kitchensinkGT', 'LIB')
sys.path.append(LIBpath)
from libseisGT import add_to_trace_history #, mulplt
from modutils import yn_choice

from obspy import read_inventory #, remove_response
from libMVO import fix_trace_id, inventory_fix_id_mvo, load_mvo_inventory
cwd = os.getcwd()
sys.path.append(cwd)
from libMontyML import read_volcano_def, build_master_event_catalog, parse_STATION0HYP, qc_event, \
     get_weighted_fingerprints, save_fingerprints, remove_marked_events, to_AAA, report_checked_events

# Location of the MVO SEISAN data under $HOME, and the database directory name.
SEISAN_DATA = os.path.join(os.getenv('HOME'), 'DATA', 'MVO')
DB = 'MVOE_'

# Subclasses allowed for classification, taken from the volcano definition.
subclass_mapping = read_volcano_def()
# append('g') as needed, it is not an allowed subclass
seisan_subclasses = subclass_mapping['subclass'].values.tolist()
#seisan_etypes = subclass_mapping['etype'].values.tolist()

# Subclasses allowed for Machine Learning.
subclasses_for_ML = ['D', 'R', 'r', 'e', 'l', 'h', 't']
outfile = 'MVOE_catalog.csv'

# Columns kept (in this order) when an existing working catalog is reloaded.
# Selecting them explicitly discards any stray index column that would
# otherwise accumulate on each load. 'twin' was removed as it is missing.
catalog_columns = [
    'filetime', 'Fs',
    'bandratio_[0.8_4.0_16.0]', 'bandratio_[1.0_6.0_11.0]',
    'bw_max', 'bw_min', 'calib',
    'cft_peak_wmean', 'cft_std_wmean', 'coincidence_sum',
    'day', 'detection_quality', 'energy', 'hour', 'kurtosis',
    'medianF', 'minute', 'month',
    'noise_level', 'num_gaps', 'num_traces', 'offtime', 'ontime',
    'path', 'peakA', 'peakF', 'peakamp', 'peaktime',
    'percent_availability', 'quality',
    'sample_lower_quartile', 'sample_max', 'sample_mean',
    'sample_median', 'sample_min', 'sample_rms', 'sample_stdev',
    'sample_upper_quartile',
    'second', 'sfile', 'signal_level', 'skewness', 'snr',
    'starttime', 'subclass', 'trigger_duration', 'year',
    'D', 'R', 'r', 'e', 'l', 'h', 't',
    'new_subclass', 'weight', 'checked', 'split', 'delete', 'ignore',
]

if not os.path.exists(outfile):
    # No working catalog yet: build the master catalog from scratch.
    # SCAFFOLD - the twin column no longer seems to exist
    master_event_catalog = 'MVOE_catalog_original.csv'
    dfall = build_master_event_catalog(
        SEISAN_DATA, DB, master_event_catalog, subclasses_for_ML)
else:
    # Resume from the working catalog, keeping only the known columns.
    dfall = pd.read_csv(outfile)[catalog_columns]

# Station locations parsed from the STATION0 hypocentre file.
station0hypfile = os.path.join(SEISAN_DATA, 'DAT', 'STATION0_MVO.HYP')
station_locationsDF = parse_STATION0HYP(station0hypfile)

###
# Single-pass variant (no loop), kept for reference:
#fingerprints = get_weighted_fingerprints(dfall, subclasses_for_ML, N=300, exclude_checked=False)
#one_event_df, quit_requested = qc_event(dfall, subclasses_for_ML, seisan_subclasses, fingerprints, SEISAN_DATA, station_locationsDF)

# Interactive QC loop. Set iterate_again = False to skip the loop entirely and
# go straight to the export steps below.
iterate_again = True # changed this back to do the loop
while iterate_again:

    # get/update the fingerprints of each event class
    fingerprints = get_weighted_fingerprints(dfall, subclasses_for_ML, N=100, exclude_checked=False)
    save_fingerprints(fingerprints, subclasses_for_ML)

    # manually QC the next event. each time we choose the class with least checked examples
    # The second return value is named quit_requested rather than `quit` so the
    # builtin / IPython `quit` is not shadowed for the rest of the session.
    one_event_df, quit_requested = qc_event(dfall, subclasses_for_ML, seisan_subclasses, fingerprints, SEISAN_DATA, station_locationsDF)

    if isinstance(one_event_df, pd.DataFrame):
        # Merge the reviewed event back into dfall (update aligns on the index)
        dfall.sort_index(inplace=True)
        dfall.update(one_event_df)

        # Save after every event so no review work is lost
        dfall.to_csv(outfile, index=False)

        # Keep going unless the user asked to stop
        iterate_again = not quit_requested
    else:
        # qc_event did not hand back a DataFrame: nothing to merge, stop looping
        iterate_again = False

# remove events we marked for deletion, splitting or to ignore
dfsubset = remove_marked_events(dfall)

aaa_infile = 'MVOE_catalog_reclassified.csv'
to_AAA(dfsubset, subclasses_for_ML, aaa_infile, SEISAN_DATA, ignore_extra_columns=False)
report_checked_events(dfall, subclasses_for_ML)

In [None]:
# When I work on hal, I will want to build a full catalog
# from all the event CSV files. But I would lose my checked events.
# So this is how I took care of that before.
# 1. Move the catalog file to a new name, e.g. MVOE_catalog_previous.csv
# 2. Move the original catalog file also, e.g. MVOE_catalog_original_previous.csv
# 3. Run the code above but with iterate=False
# 4. Now we should have a new MVOE_catalog_original.csv.
#    Copy that to MVOE_catalog.csv
# 5. Now we need to update the newest catalog using the oldest.
#    Any rows with matching file times should be replaced
print(dfall.path)

oldcatfile = 'MVOE_catalog_previous.csv'
newcatfile = 'MVOE_catalog.csv'
merge_key = 'filetime'

# DataFrame.update aligns rows on the index. With the default RangeIndex from
# read_csv that means "same row number", which is wrong as soon as the rebuilt
# catalog has a different number or order of events. Index both catalogs by
# filetime so that rows are matched on file time, as step 5 requires.
# The old catalog is de-duplicated on the key first because update cannot
# align against repeated labels in the frame it copies from.
# NOTE(review): if the *new* catalog itself contains repeated filetimes,
# confirm the result for those rows with the installed pandas version.
oldcat = (
    pd.read_csv(oldcatfile)
    .drop_duplicates(subset=merge_key, keep='first')
    .set_index(merge_key)
)
newcat = pd.read_csv(newcatfile).set_index(merge_key)
newcat.update(oldcat)

# Restore filetime as an ordinary column and write without the index;
# otherwise every save/load cycle adds another unnamed column.
newcat = newcat.reset_index()
newcat.to_csv(newcatfile, index=False)

In [None]:
# Reload the working catalog and show the row(s) for one particular filetime.
outfile = 'MVOE_catalog.csv'
dfall = pd.read_csv(outfile)

# Other filetimes previously inspected:
#   '2003-05-14T06:04:14.000000Z'
#   '1901-03-06T21:13:28.040000Z'
# (or take the first row with dfall.iloc[0])
target_filetime = '2001-11-06T21:13:28.040000Z'
matches_target = dfall['filetime'] == target_filetime
df = dfall[matches_target]

print(len(dfall))
print(df)

In [None]:
# Number of distinct filetime values in the catalog.
unique_filetimes = dfall['filetime'].unique()
print(len(unique_filetimes))

In [None]:
# Total number of rows in the catalog, for comparison with the unique count.
print(dfall.shape[0])

In [17]:
# it turns out that multiple Sfiles sometimes point to the same WAVfile
# this leads to non-unique filetime entries

# Indeed we now see there is a many-to-many relationship between (WAVfile) path and Sfile.

# reawav_MVOE_YYYYDD.csv: fields include sfile, (WAV) path and (WAV) filetime
# ^ this already establishes a link between every Sfile and DSN WavFile

# Better might be...
# Mapping.CSV: Sfile, SfileTime, DSNWavfile, DSNWavFileTime, ASNWavfile, ASNWavFileTime


import os, sys
import pandas as pd
LIBpath = os.path.join( os.getenv('HOME'),'src','kitchensinkGT', 'LIB')
sys.path.append(LIBpath)
import seisan_classes
# Location of the MVO SEISAN data under $HOME, and the database directory name.
SEISAN_DATA = os.path.join(os.getenv('HOME'), 'DATA', 'MVO')
DB = 'MVOE_'


def sfile2spath(sfile):
    """Return the full path of an S-file under SEISAN_DATA/REA/DB/YYYY/MM.

    The year and month are read from the characters that follow '.S' in the
    S-file name (first four characters = year, next two = month).

    Raises IndexError if the name contains no '.S'.
    """
    yyyymm = sfile.split('.S')[1]
    year, month = yyyymm[0:4], yyyymm[4:6]
    return os.path.join(SEISAN_DATA, 'REA', DB, year, month, sfile)

# Load the combined catalog and report how many rows, distinct filetimes and
# distinct WAV paths it holds (unequal counts indicate duplicated entries).
catfile = 'catalog_all.csv'
dfall = pd.read_csv(catfile)

print('Number of rows %d' % len(dfall))
for template, column in (
    ('Number of unique filetimes %d', 'filetime'),
    ('Number of unique paths %d', 'path'),
):
    print(template % len(dfall[column].unique()))

# Walk the catalog row by row, comparing each row with the previously kept row.
# When both share the same filetime they are treated as duplicates of one event:
#   1. the S-file name of the earlier row is recomputed from the earliest WAV file,
#   2. mainclass/subclass are re-read from the S-file if the two rows disagree,
#   3. any manual classification on the later row is copied to the earlier row,
#   4. the later row is marked with delete = 2 so it can be dropped afterwards.
# NOTE(review): only consecutive rows are compared, so this presumably relies on
# the catalog being ordered by filetime - confirm.
for i,row in dfall.iterrows():
    # lastrow/lasti do not exist yet on the first row (index 0 of the default
    # RangeIndex produced by read_csv), so there is nothing to compare against.
    if i>0:
        # Rows already flagged for deletion are skipped; because of the
        # `continue`, they also never become lastrow/lasti.
        if row['delete']==True:
            continue
        if row['filetime']==lastrow['filetime']:
            rows=[lastrow, row]
            
            # Fix the S-file path. Sometimes we have two S-files that point to same DSN MVO WAVfile.
            # We have to examine both S-files, and determine what name they should have based on the WAVfiles
            # they point to. The key is the first WAVfile in time. Then we ultimately drop the later S-file.
            predicted_sfilepath = ""
            first_wavfile = None
            for r in rows:
                sfilepath = sfile2spath(r['sfile'])
                sfileobj = seisan_classes.Sfile(sfilepath, use_mvo_parser=True)
                # Track the WAV file with the smallest filetime across both S-files.
                for wavfile in sfileobj.wavfiles:
                    if not first_wavfile:
                        first_wavfile = wavfile
                    if first_wavfile.filetime > wavfile.filetime:
                        first_wavfile = wavfile
            # After the loop, sfilepath/sfileobj refer to the S-file of the
            # current `row` (the last element of `rows`).
            # NOTE(review): if neither S-file lists any WAV file, first_wavfile
            # is still None here and the attribute access below will fail.
            predicted_sfilepath, was_found = first_wavfile.find_sfile(mainclass=sfileobj.mainclass[0])  
            dfall.loc[lasti, 'sfile'] = os.path.basename(predicted_sfilepath)
            
            # Fix the subclass
            if row['subclass']!=lastrow['subclass']:
                # NOTE(review): sfilepath is unchanged since the loop above, so
                # this re-reads the same S-file that sfileobj already holds.
                sfileobj = seisan_classes.Sfile(sfilepath, use_mvo_parser=True)
                # NOTE(review): the whole mainclass is stored here, whereas only
                # mainclass[0] is passed to find_sfile above - confirm intended.
                dfall.loc[lasti, 'mainclass'] = sfileobj.mainclass
                dfall.loc[lasti, 'subclass'] = sfileobj.subclass
                
            # If either event was checked, assume the new_subclass is correct
            # NOTE(review): `keep` is not read anywhere in the visible code.
            keep = lasti
            # Earlier row never reviewed: default its new_subclass to the S-file subclass.
            if dfall.loc[lasti, 'checked'] == False:
                dfall.loc[lasti, 'new_subclass'] = sfileobj.subclass
            # Later row was reviewed: its manual classification overrides the earlier row.
            if dfall.loc[i, 'checked'] == True:
                dfall.loc[lasti, 'new_subclass'] = dfall.loc[i, 'new_subclass']
                # NOTE(review): 'D' is in subclasses_for_ML earlier in this
                # notebook but is not copied here - confirm whether intended.
                for subclass in ['R', 'r', 'e', 'l', 'h', 't']:
                    dfall.loc[lasti, subclass] = dfall.loc[i, subclass]
                dfall.loc[lasti, 'checked'] = True
                dfall.loc[lasti, 'ignore'] = dfall.loc[i, 'ignore']             

            # Mark the duplicate for removal    
            # The value 2 (not True) distinguishes duplicates found here from
            # rows whose delete flag was already True.
            dfall.loc[i, 'delete'] = 2
            
            # Print the modified rows of the dataframe  
            print(dfall.loc[[lasti, i], ['sfile','subclass','new_subclass', 'checked','delete']])
            
    # NOTE(review): `row` is the Series yielded by iterrows, so lastrow does not
    # reflect the dfall.loc edits made above, and a row just marked delete = 2
    # still becomes the comparison base for the next row. With three or more
    # rows sharing a filetime, the third is merged into the second (which is
    # then dropped) - confirm this is acceptable.
    lastrow=row
    lasti=i
    
# Keep only the rows that were not flagged as duplicates (delete == 2).
not_duplicate = dfall['delete'] != 2
dfall = dfall[not_duplicate]
print('Number of rows %d' % len(dfall))

# Name of the de-duplicated catalog; writing it out is currently disabled.
outfile = catfile.replace('all', 'unique')
#dfall.to_csv(outfile, index=False)

Number of rows 17489
Number of unique filetimes 17489
Number of unique paths 17489
Number of rows 17489


In [None]:
# Next we want to figure out how to do this for the new, bigger catalog. And then merge the two catalogs.