# Merge catalogs
Reconcile full catalog on hal/newton in Fall 2021 with the incomplete but partially reclassified catalog from my laptop whilst in Paris in July 2021
Need to reconcile reawav_MVOE_all.csv (concatenated reawav_MVOE_YYYYMM.csv) on hal9000 against catalog_unique.csv which came from processing in Paris 
on my laptop.

Need to merge the new_subclass, weight, checked, split, delete and ignore columns.
Should probably match on the WAVfile path.
Do I need to merge D, R, r, e, l, h and t columns?

# 1. Concatenate the catalog YYYYMM CSV files into original master catalog

In [10]:
import os
import sys
import pandas as pd
from glob import glob

def subset_columns(df):
    good_columns = []
    for thiscol in df.columns:
        if not 'ntitle' in thiscol:
            if not 'Unname' in thiscol:
                if not thiscol=='index':
                    if not 'level_0' in thiscol:
                        good_columns.append(thiscol)
    df2 = df[good_columns] # subset to correct columns   
    #df2.set_index('path', inplace=True)
    #df2.sort_index(inplace=True)     
    return df2

def df2csv_without_index(df, csvfile):
    #df = df.reset_index()  
    #print(df.head())
    print(df.columns)
    df.drop(df.filter(regex="Unname"),axis=1, inplace=True)
    df.to_csv(csvfile, index=False)

def _df2file_without_index(df, catfile, indexcol=None):
    if not 'path' in df.columns:
        df = df.reset_index()  
    if indexcol:
        if not indexcol in df.columns:
            df.rename(columns = {'index':indexcol})
    df.drop(df.filter(regex="Unname"),axis=1, inplace=True)
    if catfile[-3:]=='csv':
        df.to_csv(catfile, index=False)
    if catfile[-3:]=='pkl':
        df.to_pickle(catfile)  
    
def number_of_checked_events(df):
    df2 = df[df['checked']==True]
    return len(df2.index)

def build_master_event_catalog(pandaSeisDBDir, SEISAN_DB, catalogfile, subclasses_for_ML, max_duration = 300):

    concatfile = 'catalog_all.csv'
    if os.path.exists(concatfile):
        dfall = pd.read_csv(concatfile)
    else:
    
        # load all the year/month CSV files
        csvfiles = glob(os.path.join(pandaSeisDBDir, 'catalog_%s[12][0-9][0-9][0-9][0-1][0-9].csv' % SEISAN_DB))
        frames = []
        if len(csvfiles)==0:
            print('No *.csv files found. Cannot proceed')
            exit()
        for csvfile in csvfiles:
            df = pd.read_csv(csvfile)
            frames.append(df) 
        dfall = pd.concat(frames, sort=True)
        _df2file_without_index(dfall.copy(), concatfile)
        
    print(dfall.columns)

    # replace loop above
    for mainclass in ['R', 'D']:
        dfall.loc[dfall['mainclass'] == mainclass, 'subclass'] = mainclass
    
    # Drop the mainclass column, as it is now superfluous.
    dfall.drop(columns=['mainclass'], inplace=True)
    
    # Add columns to assign a percentage for each subclass
    for subclass in subclasses_for_ML:
        dfall[subclass] = 0
    
    # But set column for actual subclass to 100%  
    for subclass in subclasses_for_ML:
        dfall.loc[dfall['subclass'] == subclass, subclass] = 100
        
    # Add a new_subclass column
    dfall['new_subclass'] = dfall['subclass']

    # Add weight column. I will give really clear events higher weight when I process them
    dfall['weight']=3 # weight for events I have not visually checked
    
    # Add column that records if event is checked
    dfall['checked']=False
    
    # Add column that records if event is marked for splitting
    dfall['split']=False    
    
    # Add column that records if event is marked for deletion
    dfall['delete']=False
    
    # Add column that records if event should be ignored
    # Ignore any events longer than 1-minute, as they are likely to contain multiple events 
    # or just be unhelpful for classifying short signals which are more common
    # SCAFFOLD - the twin column no longer seems to exist
    #dfall['ignore'] = dfall['twin']>max_duration
    dfall['ignore'] = False
    
    # Now we have a catalog dataframe we can work with. Let's save this.
    #dfall2 = dfall.reset_index()    
    #dfall2.to_csv(catalogfile)
    _df2file_without_index(dfall.copy(), catalogfile)
    
    return dfall

def summarize_df(df):
    print('Events: %d. Checked: %d ' % (len(df.index), number_of_checked_events(df)) )
    print(df.columns)
    dfhead = df.head()
    print(dfhead[['path','DSN_wavfile']])

In [2]:
SEISAN_DATA = os.path.join( os.getenv('HOME'),'DATA','MVO') # e.g. /home/user/seismo
pandaSeisDir = os.path.join(SEISAN_DATA, 'miniseed_c') # e.g. /home/user/seismo/pandaSeis
SEISAN_DB = 'MVOE_' # e.g. the seisan database name (e.g. MVOE_) under /home/user/seismo/WAV and /home/user/seismo/REA
pandaSeisDBDir = os.path.join(pandaSeisDir, SEISAN_DB) # e.g. /home/user/seismo/pandaSeis/MVOE_
AAA_DATA_DIR = os.path.join(SEISAN_DATA, 'MachineLearning', SEISAN_DB) # e.g. /home/user/seismo/MachineLearning/MVOE_
master_event_catalog_original = os.path.join(AAA_DATA_DIR, 'original', '%s11_master_catalog_rebuilt.csv' % SEISAN_DB)
subclasses_for_ML = ['D', 'R', 'r', 'e', 'l', 'h', 't'] # subclasses allowed for Machine Learning # add g here?

if os.path.exists(master_event_catalog_original):
    dfall = pd.read_csv(master_event_catalog_original)
else:
    dfall = build_master_event_catalog(pandaSeisDBDir, SEISAN_DB, master_event_catalog_original, subclasses_for_ML)

summarize_df(dfall)

Events: 231951. Checked: 0 
Index(['index', 'ASN_exists', 'ASN_wavfile', 'DSN_exists', 'DSN_wavfile',
       'Eseismic', 'Fs', 'R', 'bandratio_[0.8_4.0_16.0]',
       'bandratio_[1.0_6.0_11.0]', 'bw_max', 'bw_min', 'calib',
       'corrected_ASN_mseed', 'corrected_DSN_mseed', 'day', 'elev', 'energy',
       'filetime', 'hour', 'kurtosis', 'lat', 'lon', 'magA', 'magE', 'medianF',
       'minute', 'month', 'noise_level', 'num_gaps', 'num_traces', 'path',
       'peakA', 'peakF', 'peakamp', 'peaktime', 'percent_availability',
       'quality', 'sample_lower_quartile', 'sample_max', 'sample_mean',
       'sample_median', 'sample_min', 'sample_rms', 'sample_stdev',
       'sample_upper_quartile', 'second', 'sfile', 'signal_level', 'skewness',
       'snr', 'subclass', 'twin', 'year', 'D', 'r', 'e', 'l', 'h', 't',
       'new_subclass', 'weight', 'checked', 'split', 'delete', 'ignore'],
      dtype='object')
                        path                DSN_wavfile
0  9801-01-0002-04S.MVO_18_1

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# ensure we always have the same index and columns
dfall2 = subset_columns(dfall)
summarize_df(dfall2)

Events: 231951. Checked: 0 
Index(['ASN_exists', 'ASN_wavfile', 'DSN_exists', 'DSN_wavfile', 'Eseismic',
       'Fs', 'R', 'bandratio_[0.8_4.0_16.0]', 'bandratio_[1.0_6.0_11.0]',
       'bw_max', 'bw_min', 'calib', 'corrected_ASN_mseed',
       'corrected_DSN_mseed', 'day', 'elev', 'energy', 'filetime', 'hour',
       'kurtosis', 'lat', 'lon', 'magA', 'magE', 'medianF', 'minute', 'month',
       'noise_level', 'num_gaps', 'num_traces', 'path', 'peakA', 'peakF',
       'peakamp', 'peaktime', 'percent_availability', 'quality',
       'sample_lower_quartile', 'sample_max', 'sample_mean', 'sample_median',
       'sample_min', 'sample_rms', 'sample_stdev', 'sample_upper_quartile',
       'second', 'sfile', 'signal_level', 'skewness', 'snr', 'subclass',
       'twin', 'year', 'D', 'r', 'e', 'l', 'h', 't', 'new_subclass', 'weight',
       'checked', 'split', 'delete', 'ignore'],
      dtype='object')
                        path                DSN_wavfile
0  9801-01-0002-04S.MVO_18_1  9801-01

# 2. Remove any events without a miniseed file

In [4]:
miniseed_cat = dfall2[dfall2['corrected_DSN_mseed'].isnull()==False]
summarize_df(miniseed_cat)
N_miniseed_cat = number_of_checked_events(miniseed_cat)

Events: 209135. Checked: 0 
Index(['ASN_exists', 'ASN_wavfile', 'DSN_exists', 'DSN_wavfile', 'Eseismic',
       'Fs', 'R', 'bandratio_[0.8_4.0_16.0]', 'bandratio_[1.0_6.0_11.0]',
       'bw_max', 'bw_min', 'calib', 'corrected_ASN_mseed',
       'corrected_DSN_mseed', 'day', 'elev', 'energy', 'filetime', 'hour',
       'kurtosis', 'lat', 'lon', 'magA', 'magE', 'medianF', 'minute', 'month',
       'noise_level', 'num_gaps', 'num_traces', 'path', 'peakA', 'peakF',
       'peakamp', 'peaktime', 'percent_availability', 'quality',
       'sample_lower_quartile', 'sample_max', 'sample_mean', 'sample_median',
       'sample_min', 'sample_rms', 'sample_stdev', 'sample_upper_quartile',
       'second', 'sfile', 'signal_level', 'skewness', 'snr', 'subclass',
       'twin', 'year', 'D', 'r', 'e', 'l', 'h', 't', 'new_subclass', 'weight',
       'checked', 'split', 'delete', 'ignore'],
      dtype='object')
                        path                DSN_wavfile
0  9801-01-0002-04S.MVO_18_1  9801-01

# 3. Examine the catalog processed in July

In [5]:
some_checked_csv = "../CSVfiles/catalog_unique.csv"
some_checked_cat = pd.read_csv(some_checked_csv)
checked_cat = some_checked_cat[some_checked_cat['checked']==True]
checked_cat = checked_cat[['path', 'quality','subclass','D', 'R',
       'r', 'e', 'l', 'h', 't', 'new_subclass', 'weight', 'checked', 'split',
       'delete', 'ignore']]
#checked_cat.rename(columns={'path':'DSN_wavfile'}, inplace=True)
N_checked_cat = number_of_checked_events(checked_cat)
print(sorted(checked_cat.columns))

['D', 'R', 'checked', 'delete', 'e', 'h', 'ignore', 'l', 'new_subclass', 'path', 'quality', 'r', 'split', 'subclass', 't', 'weight']


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# 4. Merge the catalogs - had to write my own code

In [6]:
print('%s has %d checked events' % (master_event_catalog_original, N_miniseed_cat) )
print('%s has %d checked events' % (some_checked_csv, N_checked_cat) )

# RESET INDEXES
set_index_to_path = True
if set_index_to_path:
    if 'path' in miniseed_cat.columns:
        miniseed_cat.set_index('path', inplace=True)
        #miniseed_cat.sort_index(inplace=True)
    if 'path' in checked_cat.columns:
        checked_cat.set_index('path', inplace=True)
        #checked_cat.sort_index(inplace=True)

def mergeGT(miniseed, checked):
    frames = []
    for i2, row in checked.iterrows():
        i2base = os.path.basename(i2)
        subset_df = miniseed[miniseed.index == i2base]
        if len(subset_df.index)==1:
            for col in checked.columns:
                subset_df.loc[i2base, col] = row[col]
            frames.append(subset_df)   
    newdf = pd.concat(frames)
    newdf['path'] = newdf.index
    combineddf = pd.concat([miniseed, newdf])
    combineddf['path'] = combineddf.index
    mergeddf = combineddf.drop_duplicates(subset=['path'], keep='last')
    return mergeddf
merged_cat = mergeGT(miniseed_cat, checked_cat)

print(merged_cat.columns)
summarize_df(merged_cat)
N_merged_cat = number_of_checked_events(merged_cat)

/Users/thompsong/DATA/MVO/MachineLearning/MVOE_/original/MVOE_11_master_catalog_rebuilt.csv has 0 checked events
../CSVfiles/catalog_unique.csv has 584 checked events


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Index(['ASN_exists', 'ASN_wavfile', 'DSN_exists', 'DSN_wavfile', 'Eseismic',
       'Fs', 'R', 'bandratio_[0.8_4.0_16.0]', 'bandratio_[1.0_6.0_11.0]',
       'bw_max', 'bw_min', 'calib', 'corrected_ASN_mseed',
       'corrected_DSN_mseed', 'day', 'elev', 'energy', 'filetime', 'hour',
       'kurtosis', 'lat', 'lon', 'magA', 'magE', 'medianF', 'minute', 'month',
       'noise_level', 'num_gaps', 'num_traces', 'peakA', 'peakF', 'peakamp',
       'peaktime', 'percent_availability', 'quality', 'sample_lower_quartile',
       'sample_max', 'sample_mean', 'sample_median', 'sample_min',
       'sample_rms', 'sample_stdev', 'sample_upper_quartile', 'second',
       'sfile', 'signal_level', 'skewness', 'snr', 'subclass', 'twin', 'year',
       'D', 'r', 'e', 'l', 'h', 't', 'new_subclass', 'weight', 'checked',
       'split', 'delete', 'ignore', 'path'],
      dtype='object')
Events: 209092. Checked: 584 
Index(['ASN_exists', 'ASN_wavfile', 'DSN_exists', 'DSN_wavfile', 'Eseismic',
       'Fs', '

In [11]:
master_event_catalog = '/Users/thompsong/DATA/MVO/MachineLearning/MVOE_/labelling/11_merged_catalog.csv'
df2csv_without_index(merged_cat, master_event_catalog)
_df2file_without_index(merged_cat, master_event_catalog.replace('.csv','.pkl'), indexcol=None)

Index(['ASN_exists', 'ASN_wavfile', 'DSN_exists', 'DSN_wavfile', 'Eseismic',
       'Fs', 'R', 'bandratio_[0.8_4.0_16.0]', 'bandratio_[1.0_6.0_11.0]',
       'bw_max', 'bw_min', 'calib', 'corrected_ASN_mseed',
       'corrected_DSN_mseed', 'day', 'elev', 'energy', 'filetime', 'hour',
       'kurtosis', 'lat', 'lon', 'magA', 'magE', 'medianF', 'minute', 'month',
       'noise_level', 'num_gaps', 'num_traces', 'peakA', 'peakF', 'peakamp',
       'peaktime', 'percent_availability', 'quality', 'sample_lower_quartile',
       'sample_max', 'sample_mean', 'sample_median', 'sample_min',
       'sample_rms', 'sample_stdev', 'sample_upper_quartile', 'second',
       'sfile', 'signal_level', 'skewness', 'snr', 'subclass', 'twin', 'year',
       'D', 'r', 'e', 'l', 'h', 't', 'new_subclass', 'weight', 'checked',
       'split', 'delete', 'ignore', 'path'],
      dtype='object')
