In [30]:
# Count SEISAN S-files by year/month: imports and archive configuration.
import glob
import os

import pandas as pd

# Start from the SEISAN_TOP environment variable ...
SEISAN_TOP = os.environ.get('SEISAN_TOP')
# ... but this override is currently ACTIVE: it selects the local archive on
# hal rather than newton. Comment the next line out to honour the environment
# variable instead.
SEISAN_TOP = '/media/sdd1/backups/seismo'
# Name of the SEISAN database directory found under REA/ and WAV/.
DB = 'MVOE_'
print('Reading from: ', SEISAN_TOP)

Reading from:  /media/sdd1/backups/seismo


In [33]:
# Tally the S-files found in each REA/<DB>/<year>/<month> directory.
rea_root = os.path.join(SEISAN_TOP, 'REA', DB)
rows = []
for year_path in sorted(glob.glob(os.path.join(rea_root, '[12]???'))):
    # The glob guarantees the last path component is the 4-character year.
    year = os.path.basename(year_path)
    for month_path in sorted(glob.glob(os.path.join(year_path, '[01][0-9]'))):
        # Files whose name contains 'L.S', 'R.S' or 'D.S' are counted as S-files.
        n_sfiles = len(glob.glob(os.path.join(month_path, '*[LRD].S*')))
        rows.append({
            'year': year,
            'month': os.path.basename(month_path),
            'sfiles': n_sfiles,
        })
readf = pd.DataFrame(rows)
print(readf)

# Grand total across all months, then persist the per-month table.
print(readf['sfiles'].sum())
readf.to_csv('count_seisan_files_REA.csv')

     year month  sfiles
0    1996    10     424
1    1996    11    2091
2    1996    12    5570
3    1997    01    1729
4    1997    02    2695
..    ...   ...     ...
142  2008    08     247
143  2008    09     125
144  2008    10      85
145  2008    11       0
146  2008    12       0

[147 rows x 3 columns]
235813


In [32]:
# For every WAV/<DB>/<year>/<month> directory, count the waveform files
# matching '*S.MVO*' (DSNfiles) and '*S.SPN*' (ASNfiles), together with the
# S-files in the corresponding REA/<DB>/<year>/<month> directory.
# Only months that have a WAV directory appear in the table.
yeardirs = glob.glob(os.path.join(SEISAN_TOP, 'WAV', DB, '[12]???'))
rows = []
for yeardir in sorted(yeardirs):
    year = os.path.basename(yeardir)
    yearmonthdirs = glob.glob(os.path.join(yeardir, '[01][0-9]'))
    for yearmonthdir in sorted(yearmonthdirs):
        month = os.path.basename(yearmonthdir)
        # Build the REA directory from its components rather than with
        # str.replace('WAV', 'REA'): replace() rewrites *every* occurrence of
        # 'WAV' in the path, which breaks if SEISAN_TOP or DB contains 'WAV'.
        readir = os.path.join(SEISAN_TOP, 'REA', DB, year, month)
        sfiles = glob.glob(os.path.join(readir, '*[LRD].S*'))
        dsnfiles = glob.glob(os.path.join(yearmonthdir, '*S.MVO*'))
        asnfiles = glob.glob(os.path.join(yearmonthdir, '*S.SPN*'))
        thisrow = { 'year':year,
                   'month':month,
                   'sfiles':len(sfiles),
                   'DSNfiles':len(dsnfiles),
                   'ASNfiles':len(asnfiles)
                  }
        rows.append(thisrow)
df = pd.DataFrame(rows)
print(df)

# Column totals, then persist the per-month table.
print(df[['sfiles','DSNfiles','ASNfiles']].sum())
df.to_csv('count_seisan_files_WAV.csv')

     year month  sfiles  DSNfiles  ASNfiles
0    1996    10     424       483         0
1    1996    11    2091      2425         0
2    1996    12    5570      5549         0
3    1997    01    1729      1736         0
4    1997    02    2695      2749         0
..    ...   ...     ...       ...       ...
138  2008    04     117       116         0
139  2008    05     153       153         0
140  2008    06     139       139         0
141  2008    07    1961      1963         0
142  2008    08     247       247         0

[143 rows x 5 columns]
sfiles      235603
DSNfiles    217861
ASNfiles     38664
dtype: int64


In [34]:
# Let's now read in reawav_MVOE_all.csv and see if there are duplicate S-files or WAV-files still
# Load the combined catalogue table and list its columns.
reawav_csv = '~/DATA/MVO/reawav_MVOE_all.csv'
reawavdf = pd.read_csv(reawav_csv)
print(reawavdf.columns)

Index(['Unnamed: 0', 'Fs', 'bandratio_[0.8_4.0_16.0]',
       'bandratio_[1.0_6.0_11.0]', 'bw_max', 'bw_min', 'calib',
       'cft_peak_wmean', 'cft_std_wmean', 'coincidence_sum', 'day',
       'detection_quality', 'energy', 'filetime', 'hour', 'kurtosis',
       'mainclass', 'medianF', 'minute', 'month', 'noise_level', 'num_gaps',
       'num_traces', 'offtime', 'ontime', 'path', 'peakA', 'peakF', 'peakamp',
       'peaktime', 'percent_availability', 'quality', 'sample_lower_quartile',
       'sample_max', 'sample_mean', 'sample_median', 'sample_min',
       'sample_rms', 'sample_stdev', 'sample_upper_quartile', 'second',
       'sfile', 'signal_level', 'skewness', 'snr', 'subclass',
       'trigger_duration', 'year'],
      dtype='object')


In [36]:
# Number of distinct S-file entries (NaN, if present, counts as one value).
reawavdf['sfile'].nunique(dropna=False)

208229

In [37]:
# Number of distinct filetime values (NaN, if present, counts as one value).
reawavdf['filetime'].nunique(dropna=False)

208186

In [38]:
# Number of distinct path values (NaN, if present, counts as one value).
reawavdf['path'].nunique(dropna=False)

208187

In [40]:
# Order the rows by 'filetime'; reawavdf itself is left unmodified.
df2 = reawavdf.sort_values(by='filetime')

In [43]:
# drop_duplicates() returns a NEW DataFrame and leaves df2 untouched, so the
# result must be captured; otherwise the printed length is simply the original
# row count. A separate name is used so df2 stays unchanged for later cells.
df2_nodups = df2.drop_duplicates()
# Row count after removing rows that are identical in every column.
print(len(df2_nodups.index))

400977


In [56]:
# Inspecting the CSV in vi shows many rows that are near-duplicates, differing
# only by rounding errors. Deduplicate on the identifying columns alone:
# 'sfile' and 'path' (path is the WAV file). For each (sfile, path) pair the
# last row in filetime-sorted order is the one kept.
dedup_keys = ['sfile', 'path']
df3 = df2.drop_duplicates(subset=dedup_keys, keep='last')
print(len(df3.index))

208230


In [72]:
# Examine any rows of df3 whose 'path' is not unique.
# duplicated() flags every occurrence of a path after its first one.
dups = df3[df3.duplicated(subset=['path'])]

# Visit each repeated path exactly once: a path occurring three or more times
# appears several times in `dups`, which would rewrite the same CSV repeatedly.
# Missing paths (NaN) are skipped because os.path.basename() cannot handle them
# and an equality test against NaN never matches any row.
for thispath in dups['path'].dropna().unique():
    # All rows of df3 (including the first occurrence) that share this path.
    df4 = df3[df3['path']==thispath]
    wavbase = os.path.basename(thispath)
    # One CSV per repeated WAV file, written to the current working directory.
    df4.to_csv('%s.csv' % wavbase)