In [38]:
import os
import glob
import pandas as pd
import obspy
from IPython.display import clear_output
# Timestamp taken once when this cell runs; embedded in the output filename
# so that each run writes its own inventory file.
# NOTE(review): UTCDateTime() with no arguments presumably yields the current
# UTC time, and its string form contains ':' characters - confirm that is
# acceptable on every filesystem this file is written/synced to.
NOW = obspy.UTCDateTime()
# Path of the CSV that write_masterdf() writes the accumulated rows to.
MASTERCSV = f'data_inventory_{NOW}.csv'
# Log of files that were skipped or failed; it is deleted at the start of a
# run and one "path: reason" line is appended per unprocessed file.
filesnotprocessedfile = 'files_not_processed.txt'

def write_masterdf(allmasterrows):
    """Write the accumulated inventory rows to the CSV named by MASTERCSV.

    Args:
        allmasterrows: list of per-file summary dicts (one per processed
            file). Nothing is written when the list is empty.

    The rows are sorted before writing only when a 'TIMESTAMP_max' column is
    present; otherwise they are written in the order they were collected.
    The file is overwritten on every call.
    """
    if len(allmasterrows) == 0:
        return
    masterdf = pd.DataFrame(allmasterrows)
    print(f'Writing/updating {MASTERCSV}')
    if 'TIMESTAMP_max' in masterdf.columns:
        # Use whichever time column actually exists. Sorting unconditionally
        # by 'TIMESTAMP' raised KeyError when only 'TIMESTAMP_max' was present.
        timekey = 'TIMESTAMP' if 'TIMESTAMP' in masterdf.columns else 'TIMESTAMP_max'
        wanted = ['uploadfolder', 'sampratefolder', 'samprate', timekey]
        sortkeys = [key for key in wanted if key in masterdf.columns]
        masterdf = masterdf.sort_values(by=sortkeys)
    masterdf.to_csv(MASTERCSV, index=False)

def process_file(dirpath, file, filenum, load=False):
    """Build one inventory row for a data file from its name and, optionally, its contents.

    The filename is expected to look like either
    ``<uploadfolder>.<sampratefolder>.<basename>.<ext>`` or ``<basename>.<ext>``
    (in which case the two folder names are taken from the last two components
    of ``dirpath`` when it has more than three '/'-separated parts, and are
    'unknown' otherwise). ``<basename>`` must consist of three '_'-separated
    fields.

    Args:
        dirpath: directory containing the file.
        file: file name (no directory part).
        filenum: running counter, used only for the progress message.
        load: when True, also read the file (.pkl or .csv) and add the median
            of every non-TIMESTAMP column plus 'dropped_headers' and
            'dropped_rows' to the returned row.

    Returns:
        dict: the inventory row on success.
        str: a short reason when the file cannot be interpreted.

    Raises:
        Exception: whatever pandas raises while reading the file is re-raised
            after printing the error and the first lines of the file.
    """
    clear_output()
    print(f'Processing file {filenum}: {file} in {dirpath}')
    fparts = file.split('.')
    if len(fparts)==4:
        uploadfolder, sampratefolder, basefilename, ext = fparts
    elif len(fparts)==2:
        uploadfolder = 'unknown'
        sampratefolder = 'unknown'
        basefilename, ext = fparts
        dparts = dirpath.split('/')
        if len(dparts)>3:
            uploadfolder = dparts[-2]
            sampratefolder = dparts[-1]
    else:
        # Any other number of dots used to leave the name fields unbound and
        # crash below; report it like the other unparseable names instead.
        print(f'Unexpected number of dot-separated parts in filename: {file}')
        return 'unexpected filename format'
    print(uploadfolder, sampratefolder, basefilename, ext)
    try:
        sampratefolder1, middlename, realsamprateandseqno = basefilename.split('_')
    except ValueError:
        # basename does not have exactly three '_'-separated fields
        print('Failed to split: ',basefilename)
        return 'failed to split basename'
    if sampratefolder.lower() == sampratefolder1.lower(): # not a Baro file
        if 'Hz' in realsamprateandseqno:
            realsamprate, seqno = realsamprateandseqno.split('Hz')
        elif 'Sec' in realsamprateandseqno:
            realsamprate, seqno = realsamprateandseqno.split('Sec')
        else:
            print(f'Did not find Hz or Sec in filename: {file}')
            return 'Did not find Hz or Sec in filename'
    elif sampratefolder == 'Baro':
        # Baro basenames: the first field carries the rate before 'hz' and the
        # last field carries the sequence number after 'Sensors'.
        realsamprate = sampratefolder1.split('hz')[0]
        seqno = realsamprateandseqno.split('Sensors')[-1]
    else:
        print(f'samprate do not match: {file}, {sampratefolder}, {sampratefolder1}' +'\n')
        return 'samprate do not match'
    print(realsamprate, seqno)
    masterrow = {
        'filename': file,
        'topdir': dirpath,
        'uploadfolder': uploadfolder,
        'sampratefolder': sampratefolder,
        'basename': basefilename,
        'samprate': realsamprate,
        'seqno': seqno,
    }

    if load:
        dropped_headers = False
        fullpath = os.path.join(dirpath, file)
        print(f'Loading {fullpath}')
        if file.endswith('pkl'):
            # NOTE: unpickling executes code embedded in the file; only use
            # this on pickle files from a trusted source.
            reader = pd.read_pickle
        elif file.endswith('csv'):
            reader = pd.read_csv
        else:
            # Previously fell through with no dataframe and crashed below.
            return 'unsupported file type'
        try:
            df = reader(fullpath)
        except Exception as e:
            print(e)
            # Show the start of the offending file to help diagnose the
            # failure (done in Python so paths with spaces work).
            try:
                with open(fullpath, errors='replace') as fh:
                    for lineno, line in enumerate(fh):
                        if lineno >= 10:
                            break
                        print(line, end='')
            except OSError as read_err:
                print(f'Could not read {fullpath}: {read_err}')
            raise

        # Drop incorrect header row
        if 'TIMESTAMP' not in df.columns: # use 2nd row of file/0th row of dataframe for columns instead
            df.columns = df.iloc[0]
            df = df[1:].copy()  # copy so the column assignments below act on an independent frame
            dropped_headers = True

        # Convert TIMESTAMP; unparseable values become NaT and are dropped
        df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], format='ISO8601', errors='coerce')
        l1 = len(df)
        df = df.dropna(subset=['TIMESTAMP'])

        # Keep only rows strictly within 4 hours of the median timestamp
        TS_median = df['TIMESTAMP'].median()
        TS_start = TS_median - pd.Timedelta(hours=4)
        TS_end = TS_median + pd.Timedelta(hours=4)
        df = df[(df['TIMESTAMP'] > TS_start ) & (df['TIMESTAMP'] < TS_end)].copy()
        l2 = len(df)
        # Counts rows lost to both the NaT drop and the time-window filter
        dropped_rows = l1-l2

        # Drop empty columns, then record the median of every remaining data column
        df = df.dropna(axis=1, how='all')
        for col in df.columns:
            if col == 'TIMESTAMP':
                continue
            if col == 'RECORD':
                df[col] = df[col].astype('int')
                masterrow[col] = int(df[col].median())
            else:
                df[col] = df[col].astype('float')
                masterrow[col] = df[col].median()
        masterrow['dropped_headers'] = dropped_headers
        masterrow['dropped_rows'] = dropped_rows
    return masterrow
  
# Walk the data tree, build one inventory row per data file, and write the
# rows to MASTERCSV (every 100 processed files and once more at the end).
# Files that are skipped or cannot be interpreted are logged, with the reason,
# to filesnotprocessedfile.
pwd = os.getcwd()
try:
    # os.chdir does not expand '~', so expand it explicitly.
    os.chdir(os.path.expanduser('~/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASA Projects/201602 Rocket Seismology/DATA/2022_DATA/WellData'))
except OSError as e:
    # Best effort: fall back to scanning the current working directory.
    print(f'Could not change directory ({e}); scanning {pwd} instead')

try:
    if os.path.isfile(filesnotprocessedfile):
        os.unlink(filesnotprocessedfile)

    masterdf=pd.DataFrame()
    allmasterrows=[]
    filenum = 0
    # Name fragments identifying files that are not raw sensor data
    excluded_words = ('gps', 'master', 'lookuptable', 'transducer', 'HOF')
    for dirpath, dirnames, filenames in os.walk("."):
        if 'combined' in dirpath:
            continue
        for filename in filenames:
            # endswith() takes literal suffixes, not glob patterns
            if not filename.endswith(('.csv', '.pkl')):
                continue
            if filename.startswith('data_inventory_'):
                # inventory written by this script (this or an earlier run)
                masterrow = 'did not match file filter'
            elif any(word in filename for word in excluded_words):
                masterrow = 'did not match file filter'
            elif filename.startswith("._") and filename[2:] in filenames:
                # '._name' file sitting next to the real 'name' file
                masterrow = 'starts with ._'
            else:
                filenum += 1
                print(filenum, dirpath, filename)
                masterrow = process_file(dirpath, filename, filenum, load=True)
            if isinstance(masterrow, dict):
                allmasterrows.append(masterrow)
                if filenum % 100 == 0:
                    # periodic checkpoint so a crash does not lose everything
                    write_masterdf(allmasterrows)
            else:
                # Append directly instead of shelling out to `echo`, which
                # broke on paths containing spaces or shell metacharacters.
                with open(filesnotprocessedfile, 'a') as fout:
                    fout.write(f'{os.path.join(dirpath, filename)}: {masterrow}\n')
    write_masterdf(allmasterrows)
finally:
    # Always return to the directory we started in, even after an error
    os.chdir(pwd)
        

Processing file 4458: 100hz_Sensors_100Hz419.csv in ./obsolete/WellData/Uploads/20221104/100hz
20221104 100hz 100hz_Sensors_100Hz419 csv
100 419
Loading ./obsolete/WellData/Uploads/20221104/100hz/100hz_Sensors_100Hz419.csv
2022-10-29 00:05:16.740000
460610583.0
9166.545
9486.397
9702.878
8625.848
8356.755
9771.862
Writing/updating masterdf_2024-09-04T17:13:12.649917Z.csv
