# Some tools for tooling around with the dfall dataframe
The aim of this code is to find the best N events of each type, and create a corresponding CSV file and data structure for entry into Alexis' and Marielle's AAA codes.

In [None]:
# Here we plot the first 1-minute of each selected event file, padding if needed
# We do this mainly to see the variable onset time of signals within event files
import pandas as pd
# `read` was previously only imported in a later cell, so this cell raised a
# NameError on a fresh kernel (Restart & Run All).
from obspy.core import read

aaa_infile = 'aaa_labelled_events.csv'
dfAAA = pd.read_csv(aaa_infile)
for i, row in dfAAA.iterrows():
    st = read(row['path'])
    # Only the vertical component of station MBLG is plotted; events without
    # that channel are skipped silently.
    st = st.select(station='MBLG', component='Z')
    if len(st):
        stime = st[0].stats.starttime
        # Trim to exactly 60 s from the start of the first selected trace,
        # zero-padding files that are shorter than one minute.
        st.trim(starttime=stime,endtime=stime+60,pad=True,fill_value=0)
        st.plot();
        # NOTE(review): the notebook's saved output contains "KeyError: 'class'";
        # confirm that aaa_labelled_events.csv really has a 'class' column.
        print('subclass = %s' % row['class'])
        #dummy = input('ENTER to see next signal')

In [None]:
# See if Pickle files are corrected
import matplotlib.pyplot as plt
import pandas as pd
from obspy.core import read, UTCDateTime

# Root directory that replaces the './WAV' prefix of each catalog path.
# This is a machine-specific absolute path; change it here if the data moves.
PICKLE_DIR = '/Users/thompsong/DATA/MVO/PICKLE'

outfile = 'catalog_all.csv'
dfall = pd.read_csv(outfile)
# sort_values returns a NEW DataFrame; the original code discarded the result,
# so events were never actually visited longest-trigger-first.
df = dfall.sort_values(by=['trigger_duration'], ascending=False)
for i,row in df.iterrows():
    abpath = row['path'].replace('./WAV', PICKLE_DIR) + '.pickle'
    # NOTE(review): these are pickle files; unpickling can execute arbitrary
    # code, so only run this against files you created yourself.
    st = read(abpath)
    # Print the full header of every trace so corrections can be inspected.
    for tr in st:
        print(tr.stats)
    st.plot()
    dummy = input('ENTER to see next event, or q to quit')
    if dummy=='q':
        break

In [None]:
# See if there is a relationship between detection window length and length of file for different subclasses
import matplotlib.pyplot as plt
import pandas as pd
# Stream is needed for the "first trace" fallback below; it was not imported before.
from obspy.core import read, Stream, UTCDateTime

outfile = 'catalog_all.csv'
dfall = pd.read_csv(outfile)
df = dfall.copy()
# Timestamp layout shared by the ontime / offtime / filetime catalog columns.
time_format = '%Y-%m-%dT%H:%M:%S.%f%z'
for subclass in ['r','e','l','h','t']:
    df0 = df[df['new_subclass']==subclass]
    # Only events that actually triggered have an on/off window to draw.
    df0 = df0[df0['trigger_duration']>0]
    print(subclass)
    print(df0[['twin','trigger_duration']].describe())
    for i,row in df0.iterrows():
        abpath =row['path'].replace('./', '/Users/thompsong/DATA/MVO/')
        st_all = read(abpath)

        # Channel preference: MBLG vertical, then MBWH vertical, then whatever
        # trace comes first in the file. Each fallback must select from the
        # full stream; the original code selected from the already-empty MBLG
        # result (so MBWH could never match) and then indexed st[0] on an empty
        # stream, which raised IndexError.
        st = st_all.select(station='MBLG', component='Z')
        if len(st)==0:
            st = st_all.select(station='MBWH', component='Z')
        if len(st)==0:
            if len(st_all)==0:
                # Nothing to plot for this event.
                continue
            st = Stream(traces=[st_all[0]])
        #st.plot(equal_scale=False)
        st.normalize()
        plt.plot(st[0].times(), st[0].data)
        ontime = UTCDateTime.strptime(row['ontime'], format=time_format)
        offtime = UTCDateTime.strptime(row['offtime'], format=time_format)
        filetime = UTCDateTime.strptime(row['filetime'], format=time_format)
        # Mark trigger on/off as seconds relative to filetime; the data were
        # normalized above, so the lines span the -1..1 amplitude range.
        plt.vlines([ontime-filetime, offtime-filetime],-1,1,'r')
        plt.ylabel(st[0].id)
        plt.title('subclass = %s ' % subclass)
        plt.xlabel('Time (s)')

        plt.show()
        dummy = input('ENTER to see next event, or q to quit')
        # NOTE: 'q' only leaves the inner loop, i.e. it skips to the next subclass.
        if dummy=='q':
            break

In [None]:
# Reload the catalog and keep only the events whose 'split' flag is True
outfile = 'catalog_all.csv'
dfall = pd.read_csv(outfile)
# Filter a copy so dfall itself stays the untouched, full catalog.
df = dfall.copy().loc[lambda frame: frame['split']==True]

In [None]:
import plotly.express as px

# Plotly demo: box plot of 'total_bill' grouped by the 'time' column of the
# example dataset returned by px.data.tips().
# Note that this rebinds the notebook-level name `df`.
df = px.data.tips()
fig = px.box(data_frame=df, x='time', y='total_bill')
fig.show()

In [None]:
# Select the checked events that are not flagged ignore / delete / split,
# report how many there are per subclass, then print the classification
# columns of every selected event (grouped in subclass order).
outfile = 'catalog_all.csv'
dfall = pd.read_csv(outfile)
df = dfall.copy()

# One combined boolean mask instead of four successive filters.
usable = (
    (df['checked']==True)
    & (df['ignore']==False)
    & (df['delete']==False)
    & (df['split']==False)
)
df = df[usable]

subclass_order = ['R', 'r', 'e', 'l', 'h', 't']
frames = [df[df['new_subclass']==name] for name in subclass_order]
for subclass, dfs in zip(subclass_order, frames):
    print(subclass, len(dfs.index))

newdf = pd.concat(frames)
report_columns = ['filetime', 'subclass', 'R', 'r', 'e', 'l', 'h', 't', 'new_subclass']
for index, row in newdf.iterrows():
    print(row[report_columns])

In [None]:
# testing that the input parsing works
def parse_subclass_probabilities(new_subclass):
    """Parse a 'subclass, value, subclass, value[, weight]' string.

    Parameters
    ----------
    new_subclass : str
        Comma-separated list of alternating subclass codes and integer
        probability values, optionally followed by one trailing integer weight
        (i.e. an odd number of fields), e.g. 't, 50, h, 49, 6'.

    Returns
    -------
    (dict, int or None)
        Mapping of subclass code -> integer value, and the trailing weight
        (None when the string has an even number of fields).

    Raises
    ------
    ValueError
        If a value or the weight is not an integer.
    """
    tokens = [token.strip() for token in new_subclass.split(',')]
    weight = None
    if len(tokens) % 2 == 1:
        weight = int(tokens.pop())
    # Values are converted to int here. Keeping them as strings (as before)
    # made max(..., key=...) compare lexicographically, so ' 9' beat ' 50'.
    probabilities = {tokens[a]: int(tokens[a + 1]) for a in range(0, len(tokens), 2)}
    return probabilities, weight


# subclasses_for_ML is not defined anywhere in this notebook. Keep an existing
# definition if the kernel has one, otherwise fall back to the subclass list
# used by the event-selection cell.
# NOTE(review): confirm this is the intended list.
try:
    subclasses_for_ML
except NameError:
    subclasses_for_ML = ['R', 'r', 'e', 'l', 'h', 't']

new_subclass = 't, 50, h, 49, 6'
spd, weight = parse_subclass_probabilities(new_subclass) # subclass probability dict
if weight is not None:
    print(weight)
print(spd)
print(spd.keys())
for key in subclasses_for_ML:
    # Subclasses that were not mentioned in the string get a value of 0.
    val = spd.get(key, 0)
    print(key, val)
keymax = max(spd, key=spd.get)
print('new_subclass = ',keymax)

In [None]:
# I am guessing a lot of the e and R events are being ignored because of the 60-s length limit.
# Remove that limit.
# So just look for checked events that are ignored and have a length of twin>=60. And turn ignore to False.

# Rows to flip: checked, currently ignored, not marked delete/split, twin >= 60 s.
flip_mask = (
    (dfall['checked']==True)
    & (dfall['ignore']==True)
    & (dfall['delete']==False)
    & (dfall['split']==False)
    & (dfall['twin']>=60.0)
)
# Explicit copy: assigning into a filtered frame can raise SettingWithCopyWarning.
df = dfall[flip_mask].copy()
print(df.groupby('new_subclass').size())
df['ignore']=False

# Flip only the 'ignore' flag of the selected rows instead of pushing every
# column of those rows back through DataFrame.update.
dfall2 = dfall.copy()
dfall2.loc[flip_mask, 'ignore'] = False
print(dfall2[dfall2['checked']==True].groupby('new_subclass').size())
print(dfall2.iloc[0])

# WARNING: this overwrites the catalog CSV in place.
dfall2.to_csv(outfile, index=False)
# Keep the in-memory catalog in sync with what was just written, so that later
# cells starting from `dfall` do not undo this change when they save.
dfall = dfall2.copy()

In [None]:
# Flipping unchecked events of length>=60 from ignored to not ignored
import pandas as pd

# Start from the catalog as it currently is on disk. Starting from a stale
# in-memory `dfall` and then saving to the same file would silently revert
# any changes that an earlier cell has already written to it.
dfall = pd.read_csv(outfile)

# Rows to flip: unchecked, currently ignored, twin >= 60 s.
flip_mask = (
    (dfall['checked']==False)
    & (dfall['ignore']==True)
    & (dfall['twin']>=60.0)
)
# Explicit copy: assigning into a filtered frame can raise SettingWithCopyWarning.
df = dfall[flip_mask].copy()
print(df.groupby('new_subclass').size())
df['ignore']=False

# Flip only the 'ignore' flag of the selected rows.
dfall2 = dfall.copy()
dfall2.loc[flip_mask, 'ignore'] = False
print(dfall2[dfall2['checked']==False].groupby('new_subclass').size())
#print(dfall2.iloc[0])

# WARNING: this overwrites the catalog CSV in place.
dfall2.to_csv(outfile, index=False)

In [None]:
# TODO: I also need to apply a 100-year correction to any data dated 1901! Check with: print(dfall.iloc[0])
# NOTE: map is not working.

In [4]:
import pandas as pd

# Reload the catalog and summarise it: total number of rows, followed by the
# number of rows in each original 'subclass'.
outfile = 'catalog_all.csv'
dfall = pd.read_csv(outfile)
print(len(dfall))
subclass_counts = dfall.groupby('subclass').size()
print(subclass_counts)

17496


KeyError: 'class'