# Montserrat event selector for Machine Learning
This notebook selects the best N events of each type and writes a corresponding CSV file and data structure for input to Alexis' and Marielle's AAA codes.

In [2]:
#!/usr/bin/env python
import os
from glob import glob
import pandas as pd
import numpy as np
import sys
LIBpath = os.path.join( os.getenv('HOME'),'src','kitchensinkGT', 'LIB')
sys.path.append(LIBpath)
from libseisGT import add_to_trace_history #, mulplt
from modutils import yn_choice

from obspy import read_inventory #, remove_response
from libMVO import fix_trace_id, inventory_fix_id_mvo, load_mvo_inventory
cwd = os.getcwd()
sys.path.append(cwd)
from libMontyML import read_volcano_def, build_master_event_catalog, parse_STATION0HYP, qc_event, \
     get_weighted_fingerprints, save_fingerprints, remove_marked_events, to_AAA, report_checked_events

# --- Configuration ----------------------------------------------------------
# Working copy of the event catalog; it is reloaded on later runs if present.
outfile = 'catalog_all.csv'

# Data directory and database name handed to the catalog-building, QC and
# export helpers further down.
DB = 'MVOE_'
SEISAN_DATA = os.path.join(os.getenv('HOME'), 'DATA', 'MVO')

# Subclasses allowed for classification, as returned by read_volcano_def().
# 'g' is not an allowed subclass; append it to seisan_subclasses as needed.
subclass_mapping = read_volcano_def()
seisan_subclasses = subclass_mapping['subclass'].values.tolist()
#seisan_etypes = subclass_mapping['etype'].values.tolist()

# Subclasses allowed for Machine Learning: one single-letter code per class.
subclasses_for_ML = list('DRrelht')

# Columns (and column order) retained when a previously saved catalog is
# reloaded. Selecting them explicitly discards anything else found in the
# CSV, which the original author used to stop an extra index column from
# being added on every load.
# NOTE(review): the QC loop writes with index=False, and pandas.read_csv has
# an index_col option, so this list may be unnecessary -- confirm before
# removing it.
catalog_columns = [
    'filetime', 'Fs', 'RSAM_high', 'RSAM_low', 'band_ratio', 'bw_max',
    'bw_min', 'calib', 'cft_peak_wmean', 'cft_std_wmean', 'coincidence_sum',
    'day', 'detection_quality', 'energy', 'hour', 'kurtosis', 'medianF',
    'minute', 'month', 'num_gaps', 'num_traces', 'offtime', 'ontime', 'path',
    'peakA', 'peakF', 'peakamp', 'peaktime', 'percent_availability',
    'quality', 'sample_lower_quartile', 'sample_max', 'sample_mean',
    'sample_median', 'sample_min', 'sample_rms', 'sample_stdev',
    'sample_upper_quartile', 'second', 'sfile', 'skewness', 'starttime',
    'subclass', 'trigger_duration', 'twin', 'year',
    'D', 'R', 'r', 'e', 'l', 'h', 't',
    'new_subclass', 'weight', 'checked', 'split', 'delete', 'ignore',
]

if not os.path.exists(outfile):
    # No working catalog yet: build the master catalog from the data directory.
    master_event_catalog = 'catalog_all_original.csv'
    dfall = build_master_event_catalog(
        SEISAN_DATA, DB, master_event_catalog, subclasses_for_ML)
else:
    # Resume from the working catalog, keeping only the known columns.
    dfall = pd.read_csv(outfile)[catalog_columns]

# Station table parsed from the STATION0 file; it is passed to qc_event.
station0hypfile = os.path.join(SEISAN_DATA, 'DAT', 'STATION0_MVO.HYP')
station_locationsDF = parse_STATION0HYP(station0hypfile)

###
#fingerprints = get_weighted_fingerprints(dfall, subclasses_for_ML, N=300, exclude_checked=False)
#one_event_df, quit = qc_event(dfall, subclasses_for_ML, seisan_subclasses, fingerprints, SEISAN_DATA, station_locationsDF)

# Interactive QC loop. It only runs when iterate_again starts out True; with
# the current value of False the loop body is skipped entirely and the script
# goes straight to the export/report step.
iterate_again = False
while iterate_again:

    # Get/update the fingerprints of each event class and save them.
    fingerprints = get_weighted_fingerprints(
        dfall, subclasses_for_ML, N=300, exclude_checked=False)
    save_fingerprints(fingerprints, subclasses_for_ML)

    # Manually QC the next event (per the original note, the class with the
    # fewest checked examples is chosen each time).
    # NOTE: `quit` shadows the builtin of the same name; kept as-is because it
    # is a module-level name.
    one_event_df, quit = qc_event(
        dfall, subclasses_for_ML, seisan_subclasses, fingerprints,
        SEISAN_DATA, station_locationsDF)

    got_event = isinstance(one_event_df, pd.DataFrame)
    if got_event:
        # Merge the reviewed event back into the catalog, then persist it.
        dfall.sort_index(inplace=True)
        dfall.update(one_event_df)
        dfall.to_csv(outfile, index=False)

    # Stop when no event was returned or when the user asked to quit.
    iterate_again = got_event and not quit
# Output file name for the labelled-event catalog written by to_AAA.
aaa_infile = 'MVO_labelled_events.csv'
# NOTE(review): COPYDIR is defined here but not referenced in the visible code.
COPYDIR = os.path.join(os.getenv('HOME'), 'Dropbox', 'MVO_labelled_events')

# Remove events we marked for deletion, splitting or to ignore.
dfsubset = remove_marked_events(dfall)

# Export the filtered catalog, then report on the full (unfiltered) catalog.
to_AAA(
    dfsubset,
    subclasses_for_ML,
    aaa_infile,
    SEISAN_DATA,
    ignore_extra_columns=False,
)
report_checked_events(dfall, subclasses_for_ML)

Removed events: Marked to:
- split  13
- delete  2
- ignore  27
Catalog down from 17496 to 17454 events
 
 
Now we have the following number of events by subclass:
- D: 0
- R: 5
- r: 114
- e: 93
- l: 106
- h: 107
- t: 102
Eliminating subclass R
The subclasses for machine learning are Drelht.
Removed subclasses R
Here is the FINAL list of events by subclass and whether they have been checked:
Event counts:
Checked events: 522
new_subclass
e     93
h    107
l    106
r    114
t    102
Name: path, dtype: int64
Events by weight / quality threshold
weight
1.0     33
2.0     25
3.0     45
4.0     22
5.0     47
6.0     76
7.0     93
8.0     77
9.0     90
10.0    12
11.0     1
12.0     1
Name: path, dtype: int64
Catalog CSV saved to  MVO_labelled_events.csv
total checked events = 584
total classified events = 542
D 0
R 5
r 114
e 93
l 106
h 107
t 102
total events matching ML subclasses = 527
total reclassified events = 115
total already correctly classified events = 412
Error rate = 21.8%
527 52