Because of some bugs in 00_convert_seisandb_to_csvfiles.py, I ended up with many duplicate lines in files such as reawav_MVOE_YYYYMM.csv. 

This code sorts that out, and the new files are like REA_WAV_MVOE_YYYY_MM.csv

It also sorts out some problems with the Seisan database, e.g. one-to-many mappings between WAV files and S-files (the same WAV file being referenced by more than one S-file).

Glenn Thompson, 2021/10/27 on hal

In [2]:
# Merge the monthly reawav_MVOE_YYYYMM.csv files into a single DataFrame and
# drop rows that are exact duplicates of one another.
import glob
import os

import pandas as pd

# Directory holding the CSV files, and the database prefix used in their names.
# expanduser() is used instead of os.getenv('HOME') so that a missing HOME
# variable does not turn into os.path.join(None, ...).
SEISAN_DATA = os.path.join(os.path.expanduser('~'), 'DATA', 'MVO')
SEISAN_DB = 'MVOE_'

# Combined CSV file; it is written further down in this notebook.
allcsv = os.path.join(SEISAN_DATA, 'reawav_%sall.csv' % SEISAN_DB)

# The glob pattern below also matches the combined file once it exists, so it
# is filtered out here; otherwise re-running this cell would read every row
# a second time from the combined file.
csvfiles = sorted(
    csvfile
    for csvfile in glob.glob(os.path.join(SEISAN_DATA, 'reawav_' + SEISAN_DB + '*.csv'))
    if csvfile != allcsv
)
if not csvfiles:
    # Fail with a message that names the directory, rather than with the
    # less helpful error pd.concat raises for an empty list.
    raise FileNotFoundError(
        'No reawav_%s*.csv files found in %s' % (SEISAN_DB, SEISAN_DATA))

frames = [pd.read_csv(csvfile) for csvfile in csvfiles]

# sort=True orders the columns alphabetically when the files are combined.
dfall = pd.concat(frames, sort=True)
dfall.drop_duplicates(inplace=True)
# NOTE: 'filetime' is deliberately not set as the (sorted) index here; the
# original author noted that this will be needed later to re-merge.


In [5]:
# Resolve WAV files that are referenced by more than one S-file, keep only the
# S-file judged correct for each, and write the cleaned table to `allcsv`.
SEISAN_TOP = os.getenv('SEISAN_TOP')
# Override: use the local archive on hal rather than newton. Comment this line
# out to fall back to the SEISAN_TOP environment variable.
SEISAN_TOP = '/media/sdd1/backups/seismo'


def _wav_dhms_from_sfile(sfilepath):
    """Return the DD-HHMM-SS strings of the WAV entries found in one S-file.

    Only lines whose last non-blank character is '6' are considered
    (presumably the Nordic-format waveform-filename lines - confirm against
    the SEISAN manual). After stripping, a line starting with '1' or '2'
    contributes characters 8-17, and a line starting with '9' contributes
    characters 5-14.
    NOTE(review): the slice positions assume names such as
    YYYY-MM-DD-HHMM-SS... and YYMM-DD-HHMM-SS... respectively - verify.
    """
    dhms = []
    with open(sfilepath) as f:
        for line in f:
            line = line.strip()
            if not line or line[-1] != '6':
                continue
            if line[0] in ('1', '2'):
                dhms.append(line[8:18])
            elif line[0] == '9':
                dhms.append(line[5:15])
    return dhms


print('Length of DataFrame is %d ' % len(dfall))

# With vi, I see that there are many almost-duplicate rows that differ only by
# rounding errors, so drop duplicates based on just sfile and path (path is
# the WAV file). Only unique sfile-path combinations remain afterwards.
reawavdf2 = dfall.drop_duplicates(subset=['sfile', 'path'], keep='last')
print('Length after drop_duplicates is %d ' % len(reawavdf2))

# Now deal with events that have multiple S-files, mainly because there are
# DSN (MVO) and ASN (SPN) WAV files with different start times.
# Find rows of reawavdf2 with a non-unique (WAV file) path.
duplicate_rows = reawavdf2[reawavdf2.duplicated(subset=['path'])]
incorrect_sfiles = []

# Visit each duplicated path once: a path shared by three or more S-files
# appears several times in duplicate_rows and would otherwise be re-processed.
for i, thispath in enumerate(duplicate_rows['path'].dropna().unique()):
    sameWAVdf = reawavdf2[reawavdf2['path'] == thispath]
    # The S-files are looked up in the REA directory that mirrors the WAV one.
    readir = os.path.join(SEISAN_TOP, os.path.dirname(thispath).replace('WAV', 'REA'))

    # Collect the WAV start times listed in every S-file that references
    # this WAV file.
    WAV_DHMS = []
    print('\n\nMatch %d' % i)
    for sfile in sameWAVdf['sfile']:
        print('SFILE=', sfile)
        WAV_DHMS.extend(_wav_dhms_from_sfile(os.path.join(readir, sfile)))

    if not WAV_DHMS:
        # Nothing to compare against, so the correct S-file cannot be
        # identified; leave every row for this path in place.
        print('No WAV entries found for %s - keeping all of its S-files' % thispath)
        continue

    # The correct S-file is the one whose name contains the smallest
    # (string-sorted) DD-HHMM-SS value and which is not a '-temp' copy.
    earliest_dhms = min(WAV_DHMS)
    print('WAV_DHMS = %s' % earliest_dhms)
    for sfile in sameWAVdf['sfile']:
        if earliest_dhms in sfile and '-temp' not in sfile:
            print('Correct SFILE is %s' % sfile)
        else:
            incorrect_sfiles.append(sfile)

# Remove rows matching incorrect_sfiles from reawavdf2
dfall = reawavdf2[~reawavdf2['sfile'].isin(incorrect_sfiles)]
print('Length after dropping bad sfiles is %d ' % len(dfall))
# index=False: the row index is only a leftover of the concatenated per-file
# row numbers; writing it makes it come back as an 'Unnamed: 0' column when
# the file is read again.
dfall.to_csv(allcsv, index=False)

Length of DataFrame is 208189 
Length after drop_duplicates is 208189 


Match 0
SFILE= 08-2136-38L.S200604-temp
SFILE= 08-2136-38L.S200604
WAV_DHMS = 08-2136-38
Correct SFILE is 08-2136-38L.S200604


Match 1
SFILE= 29-1227-26L.S200604-temp
SFILE= 29-1227-26L.S200604
WAV_DHMS = 29-1227-26
Correct SFILE is 29-1227-26L.S200604
Length after dropping bad sfiles is 208187 


In [10]:
# Split the cleaned DataFrame into one CSV file per year and month.
# NOTE(review): the files are written with the reawav_MVOE_YYYYMM.csv naming
# pattern, whereas the notebook introduction mentions
# REA_WAV_MVOE_YYYY_MM.csv - confirm which naming is intended.

# sorted(set(...)) rather than set(sorted(...)): sorting has to happen after
# de-duplication, because a set does not preserve the sorted order.
years = sorted(set(dfall.year))
for thisyear in years:
    dfyear = dfall[dfall.year == thisyear]
    months = sorted(set(dfyear.month))
    print(thisyear, months)
    for thismonth in months:
        dfyearmonth = dfyear[dfyear.month == thismonth]
        yearmonthcsv = os.path.join(SEISAN_DATA, 'reawav_%s%4d%02d.csv' % (SEISAN_DB, thisyear, thismonth))
        # index=False keeps the meaningless row index out of the file, so it
        # does not reappear as an 'Unnamed: 0' column on the next read.
        dfyearmonth.to_csv(yearmonthcsv, index=False)

1996 [10, 11, 12]
1997 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
1901 [3]
1998 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
1999 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2000 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2001 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2002 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2003 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2004 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2005 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2006 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2007 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
2008 [1, 2, 3, 4, 5, 6, 7, 8]


In [17]:
# Tally how many events fall into each mainclass and each subclass,
# working from the combined CSV file on disk.
from collections import Counter

allcsv = os.path.join(SEISAN_DATA, 'reawav_%sall.csv' % SEISAN_DB)
dfall = pd.read_csv(allcsv)
print(dfall.shape[0])  # number of rows

counter_of_classes = Counter(dfall['mainclass'])
counter_of_subclasses = Counter(dfall['subclass'])

# Print each tally as (value, count) pairs, most frequent first.
for tally in (counter_of_classes, counter_of_subclasses):
    print(tally.most_common())

208187
[('LV', 208038), ('R', 116), ('L', 22), ('D', 10), ('LD', 1)]
[('r', 118287), ('h', 40355), ('l', 22314), ('e', 12269), ('t', 6779), ('n', 5861), ('g', 1494), ('u', 305), ('hl', 178), (nan, 132), ('s', 86), ('m', 47), ('3', 28), ('U', 14), ('2', 12), ('x', 11), ('M', 8), ('1', 2), ('rf', 2), ('V', 1), ('h*', 1), ('swarm', 1)]


In [34]:
# Tally events by their number of traces, most common first.
counter_of_traces = Counter(dfall['num_traces'])
print(counter_of_traces.most_common())


# Compare compression of Seisan vs Miniseed vs Pickle files for same data:
# every event with 36 traces is re-written as MiniSEED and as a pickle in the
# current working directory, and each new file size is printed as a ratio of
# the original WAV file size.
import obspy.core

dfmaxtraces = dfall[dfall['num_traces'] == 36]
print('WAV, Miniseed/Seisan, Pickle/Seisan')
for wav_relpath in dfmaxtraces['path']:
    # NOTE(review): WAV paths are resolved against SEISAN_DATA here, whereas
    # the S-file clean-up earlier in the notebook resolves them against
    # SEISAN_TOP - confirm that both roots contain the WAV tree.
    WAVfile = os.path.join(SEISAN_DATA, wav_relpath)
    WAVfile_size = os.path.getsize(WAVfile)
    st = obspy.core.read(WAVfile)
    WAVbase = os.path.basename(WAVfile)

    # State the output formats explicitly instead of relying on ObsPy to
    # deduce them from the file extensions.
    MSEEDfile = WAVbase + '.mseed'
    st.write(MSEEDfile, format='MSEED')
    MSEEDfile_size = os.path.getsize(MSEEDfile)

    PICKLEfile = WAVbase + '.pickle'
    st.write(PICKLEfile, format='PICKLE')
    PICKLEfile_size = os.path.getsize(PICKLEfile)

    print('%s, %.2f, %.2f' % (WAVbase, MSEEDfile_size/WAVfile_size, PICKLEfile_size/WAVfile_size))


[(14, 25226), (13, 23352), (1, 16855), (18, 15791), (17, 13046), (19, 11770), (12, 11113), (24, 7697), (16, 7616), (21, 7554), (10, 7477), (9, 7117), (25, 6698), (11, 6486), (20, 5495), (15, 4734), (22, 3484), (23, 3061), (30, 2783), (27, 2638), (31, 2567), (8, 2241), (7, 2241), (5, 2044), (2, 1952), (28, 1950), (3, 1583), (6, 1091), (26, 1083), (29, 652), (4, 467), (32, 121), (34, 80), (35, 79), (36, 26), (33, 17)]
        Unnamed: 0     Fs  Unnamed: 0.1  bandratio_[0.8_4.0_16.0]  \
170140      326853  100.0       11737.0                 -1.563102   
170142      326855  100.0       11739.0                  0.155921   
170145      326858  100.0       11742.0                 -0.393558   
170146      326859  100.0       11743.0                 -0.617152   
170161      326874  100.0       11758.0                 -1.452944   
170162      326875  100.0       11759.0                 -0.152660   
170167      326880  100.0       11764.0                 -1.203400   
170168      326881  100.0   

2005-09-22-2345-34S.MVO___036, 0.35, 1.63
2005-09-23-0127-33S.MVO___036, 0.33, 1.63
2005-09-23-0146-11S.MVO___036, 0.34, 1.64
2005-09-23-1523-51S.MVO___036, 0.31, 1.61
2005-09-23-1549-22S.MVO___036, 0.34, 1.61
2005-09-24-0348-30S.MVO___036, 0.35, 1.62
2005-09-24-0840-47S.MVO___036, 0.34, 1.62
2005-09-24-1224-28S.MVO___036, 0.36, 1.60
2005-09-24-1254-00S.MVO___036, 0.33, 1.59
2005-09-24-1557-33S.MVO___036, 0.38, 1.59
2005-09-25-1617-48S.MVO___036, 0.35, 1.58
2005-09-25-1641-27S.MVO___036, 0.34, 1.59
2005-09-27-2303-39S.MVO___036, 0.34, 1.63
2005-09-28-0940-42S.MVO___036, 0.35, 1.65
2005-09-30-0152-16S.MVO___036, 0.35, 1.65
2005-09-30-1103-40S.MVO___036, 0.34, 1.64
2005-11-25-1537-37S.MVO___037, 0.39, 1.63
2005-11-25-1604-06S.MVO___037, 0.41, 1.61
