# Program Overview
This program is in two parts. 
1. The first section creates an inventory of the TOB3 files and saves it as `inventory.csv` in the output directory (`00_from_binary_files` under `paths['new_data']`).
2. The second section converts each individual TOB3 file to a CSV file, and creates the following output files:

   (i) data_inventory.csv: an inventory of every file processed. This has fields: 
        filename, 
        topdir, 
        uploadfolder, 
        sampratefolder, 
        basename, 
        samprate, 
        seqno, 
        starttime, 
        endtime, 
        dropped_headers (bool), 
        dropped_rows (integer), 
        calibrated (bool, always False here), RECORD (integer), 
    followed by a list of columns found in the collection of TOB3 files with the median value of each column
    NOTE: this could be used to make 4-hourly median plots of any column!!
    
   (ii) 00_errors.txt: a log of every file that could not be processed, or that was only partially processed (some columns could not be converted). This has fields:
        path,
        error,
        processed (bool)


# Headers

In [1]:
import header
# Resolve the machine-specific directory layout once, then echo each
# entry so the reader can confirm which locations this run will use.
paths = header.setup_environment()
for label, location in paths.items():
    print(label, '->', location)
    

Linux
HOME -> /home/thompsong
Developer -> /home/thompsong/Developer
repodir -> /home/thompsong/Developer/KSCRocketSeismoHydrology/Python/new_workflow
work -> /home/thompsong/work
local_outdir -> /home/thompsong/work/PROJECTS/KSC_EROSION
DROPBOX_TOP -> /home/thompsong/Dropbox
new_data -> /data/KSC/EROSION/fromdropboxinventory
DROPBOX_DATA_TOP -> /home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA
dropbox_outdir -> /home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/new_workflow
WELLDATA_TOP -> /home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData
TOB3_DIR -> /home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/Uploads
transducersCSVfile -> /home/thompsong/Developer/KSCRocketSeismoHydrology/Python/new_workflo

## 1.1 Functions

In [2]:
def process_tob3file(tob3file, year_prefix='2022'):
    """Read the start date/time from the first (text) line of a TOB3 file.

    The first line of the file is split on commas, and the last field that
    starts with a double quote followed by ``year_prefix`` is taken as the
    start date/time (surrounding quotes and whitespace removed).

    Parameters
    ----------
    tob3file : str
        Path to the TOB3 file; only its first line is read.
    year_prefix : str, optional
        Leading characters that identify the date/time field. Defaults to
        '2022', which is what the original code looked for.

    Returns
    -------
    dict
        ``{'file': <basename of tob3file>, 'datetime': <str or None>}``;
        'datetime' is None when no matching field is found.
    """
    print(tob3file)
    startdatetime = None
    with open(tob3file, 'rb') as f:
        # errors='replace': a stray non-UTF-8 byte in the header line must
        # not abort the whole inventory.
        firstline = f.readline().decode(errors='replace')

    marker = '"' + year_prefix
    for field in firstline.split(','):
        # Strip the line terminator / padding instead of slicing a fixed
        # number of characters, so the field is recovered intact whether or
        # not it is the last one on the line.
        field = field.strip()
        if len(field) > len(marker) and field.startswith(marker):
            startdatetime = field.replace('"', '')

    return {'file': os.path.basename(tob3file), 'datetime': startdatetime}


## 1.2 Main program to inventorize TOB3 binary files

First just count them - this only gets those since 2022/07/21

In [3]:
import os
import glob
import pandas as pd

import libWellData as LLE

# Ask the helper library for every raw TOB3 file (datalogger output) found
# below the uploads directory, then report how many there are.
tob3_root = paths['TOB3_DIR']
tob3files = LLE.list_loggernet_tob3_files(tob3_root)
print(f'Found {len(tob3files)} TOB3 files')



Found 1462 TOB3 files


This generates the inventory

In [4]:
# Build inventory.csv: one row per TOB3 (.dat) file found under the upload
# folders, with the start date/time read from the file's first line.
# NOTE: the module-level names used here (OUTPUTDIR, lod, tob3dir,
# inventoryfile, df) are unchanged because other cells may refer to them.
OUTPUTDIR = os.path.join(paths['new_data'], '00_from_binary_files')
# Create the output directory up front; otherwise to_csv() below fails on a
# machine where it does not exist yet.
os.makedirs(OUTPUTDIR, exist_ok=True)
inventoryfile = os.path.join(OUTPUTDIR, 'inventory.csv')
tob3dir = paths['TOB3_DIR']

lod = []  # list of dicts, one per TOB3 file
for uploaddir in sorted(glob.glob(os.path.join(tob3dir, '202*'))):
    for dirorfile in sorted(glob.glob(os.path.join(uploaddir, '*'))):
        if os.path.isdir(dirorfile):
            # A subfolder of the upload folder: inventory its .dat files.
            sampratedir = dirorfile
            found = sorted(glob.glob(os.path.join(sampratedir, '*.dat')))
        elif dirorfile.endswith('.dat'):
            # A .dat file sitting directly in the upload folder.
            sampratedir = ''
            found = [dirorfile]
        else:
            continue

        for tob3file in found:
            thisdict = process_tob3file(tob3file)
            thisdict['uploaddir'] = os.path.basename(uploaddir)
            thisdict['sampratedir'] = os.path.basename(sampratedir)
            lod.append(thisdict)

# Naming the columns explicitly keeps the same column order as before and
# lets the sort below work even when no files were found.
df = pd.DataFrame(lod, columns=['file', 'datetime', 'uploaddir', 'sampratedir'])
df = df.sort_values(by='datetime')
df.to_csv(inventoryfile, index=False)

/home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/Uploads/20220330/HOF-IW0006I0.dat
/home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/Uploads/20220330/HOF-IW0006I1.dat
/home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/Uploads/20220330/HOF-IW0006I2.dat
/home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/Uploads/20220330/HOF-IW0006I3.dat
/home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/Uploads/20220330/HOF-IW0006I4.dat
/home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/Uploads/20220330/HOF-IW0006I5.dat
/home/thompsong/Dropbox/PROF

# Section 2: Convert TOB3 binary files and inventory their contents

## 2.1 Functions

In [5]:
from IPython.display import clear_output

def write_masterdf(allmasterrows, mastercsv=None):
    """Write the accumulated inventory rows to a CSV file.

    Parameters
    ----------
    allmasterrows : list of dict
        One dict per processed file. Nothing is written when it is empty.
    mastercsv : str, optional
        Destination path. Defaults to the module-level ``MASTERCSV``.

    Notes
    -----
    When a 'starttime' column is present the rows are ordered by starttime,
    uploadfolder, sampratefolder and samprate (whichever of those exist).
    Rows reloaded from an earlier CSV hold these values as str/int while
    freshly processed rows hold Timestamp/str, and sorting such a mixed
    column directly raises TypeError. The ordering is therefore computed on
    normalised copies of the columns; the values written are left untouched.
    """
    if len(allmasterrows) == 0:
        return
    if mastercsv is None:
        mastercsv = MASTERCSV

    masterdf = pd.DataFrame(allmasterrows)
    print(f'Writing/updating {mastercsv}')

    if 'starttime' in masterdf.columns:
        sortkeys = pd.DataFrame(index=masterdf.index)
        # Going through str makes Timestamp and string entries comparable;
        # anything unparseable becomes NaT and sorts last.
        sortkeys['starttime'] = pd.to_datetime(
            masterdf['starttime'].astype(str), format='ISO8601', errors='coerce')
        for col in ('uploadfolder', 'sampratefolder', 'samprate'):
            if col in masterdf.columns:
                sortkeys[col] = masterdf[col].map(
                    lambda v: '' if pd.isna(v) else str(v))
        order = sortkeys.sort_values(by=list(sortkeys.columns)).index
        masterdf = masterdf.loc[order]

    masterdf.to_csv(mastercsv, index=False)


def process_file(df, tob3file, OUTPUTDIR, filenum, allcolumns):
    """Convert one TOB3 binary file to CSV and summarise it as an inventory row.

    Parameters
    ----------
    df : ignored
        Kept only so existing callers keep working; it is replaced by the
        data read from ``tob3file`` before it is ever used.
    tob3file : str
        Full path of the TOB3 file. It is split on 'Uploads/' and must
        contain that substring exactly once.
    OUTPUTDIR : str
        Top directory for converted CSV files; the file is written to
        ``OUTPUTDIR/<uploadfolder>/<sampratefolder>/``.
    filenum : int
        Only used in the progress message.
    allcolumns : list
        Updated in place with any column name from this file that it does
        not already contain.

    Returns
    -------
    masterrow : dict
        Inventory row (path components, sample rate, sequence number,
        start/end time, dropped row count and per-column medians). Empty
        when the file was rejected before any data was processed.
    errormsg : str
        Empty on success; otherwise why the file was rejected, or which
        columns could not be converted to numbers.

    Raises
    ------
    ValueError
        If no row with a usable TIMESTAMP remains, or if the path or file
        name cannot be unpacked as expected. Errors raised by the TOB3
        reader or by pandas propagate unchanged.
    """
    errormsg = ''
    masterrow = {}
    clear_output()
    os.system("clear")

    # Read TOB3 binary file
    print('- Reading TOB3file')
    data, meta = campbell.read_cs_files(tob3file, forcedatetime=False,
                    bycol=True, quiet=True, metaonly=False)
    print('- converting to dataframe')
    try:  # on at least one occasion meta was a bool and not subscriptable
        df = pd.DataFrame(columns=meta[2])
    except Exception:
        errormsg = 'cannot turn meta into dataframe'
        return masterrow, errormsg
    for c in range(len(meta[2])):
        df[meta[2][c]] = data[c]
        if meta[2][c] not in allcolumns:
            allcolumns.append(meta[2][c])

    # Process full file path to extract information
    print(f'Processing file {filenum}: {tob3file}')
    abspath, relpath = tob3file.split('Uploads/')
    pparts = relpath.split('/')
    uploadfolder = pparts[0]
    # <uploadfolder>/<sampratefolder>/<file> has three parts; a file directly
    # inside the upload folder has no sample-rate folder.
    sampratefolder = pparts[1] if len(pparts) > 2 else ''

    fparts = os.path.basename(tob3file).split('.')
    if len(fparts) == 2:
        basefilename, _ = fparts
    else:
        errormsg = f'cannot split {fparts} into base and extension'
        return masterrow, errormsg

    try:
        sampratefolder1, _, realsamprateandseqno = basefilename.split('_')
    except ValueError:
        errormsg = f'failed to split basename {basefilename} into sampratefolder, middlename, realsamprateandseqno'
        return masterrow, errormsg

    if sampratefolder.lower() == sampratefolder1.lower(): # not a Baro file
        if 'Hz' in realsamprateandseqno:
            realsamprate, seqno = realsamprateandseqno.split('Hz')
        elif 'Sec' in realsamprateandseqno:
            realsamprate, seqno = realsamprateandseqno.split('Sec')
        else:
            errormsg = f'Did not find Hz or Sec in {realsamprateandseqno}'
            return masterrow, errormsg

    elif sampratefolder == 'Baro':
        realsamprate, _, seqno = basefilename.split('_')
        realsamprate = realsamprate.split('hz')[0]
        seqno = seqno.split('Sensors')[-1]
    else:
        errormsg = f'samprates do not match {sampratefolder}, {sampratefolder1}'
        return masterrow, errormsg

    # Create masterrow for dict
    masterrow = {'filename': tob3file, 'topdir': abspath, 'uploadfolder': uploadfolder,
                 'sampratefolder': sampratefolder, 'basename': basefilename,
                 'samprate': realsamprate, 'seqno': seqno}

    dropped_headers = False

    # Note that converted TOB3 files have multiple header lines and columns
    # are read as dtype "object" because of mixed dtype, so they have to be
    # converted explicitly after removing excess header rows.
    # The first header row is garbage. We want the second.

    # Drop incorrect header row
    fallback_names = {}
    if 'TIMESTAMP' not in df.columns: # use 2nd row of file/0th row of dataframe for columns instead
        columns_old = list(df.columns)
        df.columns = df.iloc[0]
        df = df[1:].copy()  # copy: columns are assigned to this frame below
        # Remember the original name of every column by position, before
        # any column is dropped, so an unnamed column can fall back to it.
        fallback_names = dict(zip(df.columns, columns_old))
        dropped_headers = True

    # Convert TIMESTAMP
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], format='ISO8601', errors='coerce')
    l1 = len(df)
    df = df.dropna(subset=['TIMESTAMP'])

    # filter by TIMESTAMP further: keep rows within 4 hours of the median time
    TS_median = df['TIMESTAMP'].median()
    TS_start = TS_median - pd.Timedelta(hours=4)
    TS_end = TS_median + pd.Timedelta(hours=4)
    df = df[(df['TIMESTAMP'] > TS_start) & (df['TIMESTAMP'] < TS_end)]
    l2 = len(df)
    if l2 == 0:
        # Without this the failure only surfaces later as an obscure error
        # when the NaT start time is formatted into the output file name.
        raise ValueError(f'no rows with a valid TIMESTAMP in {tob3file}')
    dropped_rows = l1 - l2
    masterrow['starttime'] = df['TIMESTAMP'].min()
    masterrow['endtime'] = df['TIMESTAMP'].max()
    masterrow['dropped_headers'] = dropped_headers
    masterrow['dropped_rows'] = dropped_rows

    # Drop empty columns (copy so the per-column assignments below act on an
    # independent frame rather than on a slice)
    df = df.dropna(axis=1, how='all').copy()

    # Convert columns
    masterrow['calibrated'] = False
    for col in list(df.columns):
        if col == 'TIMESTAMP':
            continue
        # Column labels taken from a data row are not guaranteed to be str.
        if isinstance(col, str) and 'Unnamed' in col:
            oldname = fallback_names.get(col)
            if dropped_headers and oldname is not None:
                # No usable header on the second row: fall back to the name
                # from the first row. (Not thought to happen in practice.)
                df = df.rename(columns={col: oldname})
                col = oldname
            else:
                # just seems to be an index that was saved into the CSV file
                continue

        try:
            df[col] = df[col].astype(float)
            if df[col].apply(float.is_integer).all():
                df[col] = df[col].astype(int)
                masterrow[col] = int(df[col].median())
            else:
                masterrow[col] = df[col].median()
        except Exception:
            # Keep going: remember which columns failed and report them all.
            if not errormsg:
                errormsg = f'Failed to convert column {col}'
            else:
                errormsg += f', {col}'

    # where to save filtered DataFrame
    mybasename = masterrow['basename'][:masterrow['basename'].rfind(masterrow['seqno'])]
    outdir = os.path.join(OUTPUTDIR, masterrow['uploadfolder'], masterrow['sampratefolder'])
    outfile = mybasename + '_' + \
        masterrow['starttime'].strftime('%Y%m%d%H%M%S_') + \
        f"{int(masterrow['seqno']):03d}" + \
        '.csv'
    outfullpath = os.path.join(outdir, outfile)
    os.makedirs(outdir, exist_ok=True)
    # Never overwrite an existing file: append 'x' before the extension until
    # the name is free. Only the extension is touched, never a directory name.
    while os.path.isfile(outfullpath):
        root, ext = os.path.splitext(outfullpath)
        outfullpath = root + 'x' + ext

    # write CSV file
    df.to_csv(outfullpath, index=False)

    return masterrow, errormsg



## 2.2 Main program to convert TOB3 files

In [6]:
# Convert every TOB3 file (Campbell Scientific datalogger output) to CSV with
# the CSI Python reader, keeping an inventory of what was converted
# (data_inventory.csv) and a log of what went wrong (00_errors.txt).
OUTPUTDIR = os.path.join(paths['new_data'], '00_from_binary_files')
os.makedirs(OUTPUTDIR, exist_ok=True)
errorfile = os.path.join(OUTPUTDIR, '00_errors.txt')


def log_error(path, message, processed):
    """Append one 'path, error, processed' line to the error file.

    Written with plain file I/O rather than a shell ``echo`` so that spaces,
    quotes, brackets or '>' in the path or message cannot break or redirect
    the command.
    """
    flat = str(message).replace('\r', ' ').replace('\n', ' ')  # one line per entry
    with open(errorfile, 'a') as fh:
        fh.write(f"{path}, {flat}, {processed}\n")


# Write the header only when starting a new error file, not on every run.
if not os.path.isfile(errorfile) or os.path.getsize(errorfile) == 0:
    with open(errorfile, 'a') as fh:
        fh.write("path, error, processed\n")

MASTERCSV = os.path.join(OUTPUTDIR, 'data_inventory.csv')
if os.path.isfile(MASTERCSV):
    # NOTE(review): rows from a previous run are reloaded, yet every file is
    # converted again below, so a re-run adds duplicate rows - confirm
    # whether files already listed here should be skipped instead.
    allmasterrows = pd.read_csv(MASTERCSV).to_dict('records')
else:
    allmasterrows = []

import libWellData as LLE
allcolumns = []
lod = []

import sys
campbell_dir = os.path.join(paths['repodir'], 'campbell')
if campbell_dir not in sys.path:  # avoid growing sys.path on every re-run
    sys.path.append(campbell_dir)
import read_cs_files as campbell

# Generate complete list of TOB3 files (raw TOB3 files from CS dataloggers)
tob3files = LLE.list_loggernet_tob3_files(paths['TOB3_DIR'])

for filenum, tob3file in enumerate(tob3files):

    print('File %d of %d: %s' % ((filenum+1), len(tob3files), tob3file))

    try:
        # process_file() ignores its first argument, so pass None rather than
        # whatever `df` an earlier cell happened to leave in the kernel.
        masterrow, errormsg = process_file(None, tob3file, OUTPUTDIR, filenum, allcolumns)
    except Exception as e:
        log_error(tob3file, e, False)
        continue

    if masterrow:
        allmasterrows.append(masterrow)
        if errormsg:
            # Converted, but some columns failed: record it instead of
            # silently discarding the message.
            log_error(tob3file, errormsg, True)
        if filenum % 10 == 0:
            # Save progress periodically so an interrupted run keeps its inventory.
            write_masterdf(allmasterrows)
    else:
        log_error(tob3file, errormsg, True)

write_masterdf(allmasterrows)

[H[2J- Reading TOB3file
- converting to dataframe
Processing file 1461: /home/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/Uploads/20221202/20hz/20hz_Sensors_20Hz99.dat
Writing/updating /data/KSC/EROSION/fromdropboxinventory/00_from_binary_files/data_inventory.csv
