In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import glob
import matplotlib.dates as mdates
import numpy as np
from datetime import timedelta
from tqdm import tqdm
import datetime as dt
from datetime import time
import math
from datetime import datetime

In [2]:
def reader(path, first_var):
    '''
    Reads SEMS/DASH data, adds datetime columns

    :param path: path to data file
    :param first_var: the name of the fir column label
    :return: pandas DataFrame
    '''
    # Open the file and read the lines
    skip=1
    with open(path, "r") as file:
        # Iterate over the lines
        for line in file:
            # rip leading and trailing whitespace
            line = line.strip()
            # Check if the line contains column names
            if line.startswith(first_var):
                # Split the line by whitespace and append to the columns list
                columns = line[1:].strip().split("\t")
                break
            skip+=1
    # Read the data into a DataFrame, skipping the first 6 rows of comments
    d = pd.read_csv(path, sep='\t', skiprows=skip, names=columns, low_memory=False)
    #Check for duplicated metadata, remove metadata rows based on string "OPC SN"
    if len(d)>0:
        if isinstance(d.iloc[0,0], str):
            dup_meta = [n for n, i in enumerate(d.iloc[:,0]) if 'OPC SN' in i]
            if len(dup_meta) > 0:
                for line in dup_meta:
                    #Deletes duplicate metadata rows from dataframe
                    d.drop(np.arange(line,line+68), inplace = True)
                    # Apply the function to each column
                    d_og = d.copy()
                    for c in d.keys():
                        try:
                            d[c] = pd.to_numeric(d_og[c])
                        except:
                            d[c] = d_og[c]
                    #d = d.apply(pd.to_numeric, ‘raise’,)
                    
    # Creates datetime columns
    if 'DOY.Frac' in d.keys():
        d['dt'] = pd.to_datetime('2024-1-1') + pd.to_timedelta(d['DOY.Frac'], unit='D') - pd.Timedelta(days=1)
    if 'StartTimeSt' in d.keys():
        d['st_dt'] = pd.to_datetime('2024-1-1') + pd.to_timedelta(d['StartTimeSt'], unit='D') - pd.Timedelta(days=1)
    if 'EndTimeSt' in d.keys():
        d['end_dt'] = pd.to_datetime('2024-1-1') + pd.to_timedelta(d['EndTimeSt'], unit='D') - pd.Timedelta(days=1)
    if 'YY/MM/DD' and 'HR:MN:SC' in d.keys():
        d['dt'] = pd.to_datetime(str(20) + d['YY/MM/DD'] + ' ' + d['HR:MN:SC'], format='%Y/%m/%d %H:%M:%S')
    return d

In [3]:
def glob_reader(file_key, first_var, subfolder = './data/'):
    '''
    Reads groups of data files and merges them into one

    :param file_key: shared key in filenames
    :param first_var: the name of the first column label
    :param subfolder: name of the subfolder containing the data
    :return: pandas DataFrame
    '''
    paths = sorted(glob.glob(subfolder+'*'+file_key+'*'))
    d = []
    for i in range(0, len(paths)):
        f = reader(paths[i], first_var)
        if len(f)>0:
            d.append(f)
    d = pd.concat(d).reset_index()
    return d

In [10]:
def read_all_data():

    folders = sorted(glob.glob('./data/DASH-flight*'))

    dopc = []
    hopc = []
    dash = []
    sems = []

    for i in range(0, len(folders)):
        path = folders[i] + '/'
        dopc.append(glob_reader('OPC_212', '#YY/MM/DD', subfolder = path).drop_duplicates(subset='dt', keep='first'))
        hopc.append(glob_reader('OPC_210', '#YY/MM/DD', subfolder = path).drop_duplicates(subset='dt', keep='first'))
        dash.append(glob_reader('DASH_FLOW', '#DOY.Frac', subfolder = path).drop_duplicates(subset='dt', keep='first'))
        sems.append(glob_reader('SEMS_DATA', '#DOY.Frac', subfolder = path).drop_duplicates(subset='dt', keep='first'))
    
    dopc = pd.concat(dopc, ignore_index=True)
    hopc = pd.concat(hopc, ignore_index=True)
    dash = pd.concat(dash, ignore_index=True)
    sems = pd.concat(sems, ignore_index=True)
    
    # merge the DataFrames based on the DOPC times
    merged = pd.merge_asof(dopc, hopc, on='dt', direction = 'nearest', tolerance=timedelta(seconds=1)).drop(columns=['index_x'])
    merged = pd.merge_asof(merged, dash, on='dt', direction = 'nearest', tolerance=timedelta(seconds=1))
    merged = pd.merge_asof(merged, sems, on='dt', direction = 'nearest', tolerance=timedelta(seconds=1))

    return merged

In [140]:
def retrieve_flows(d):

    # filter out for flight times
    d_out = pd.DataFrame()
    for _, row in ft.iterrows():
        mask = (d['dt'] >= row['LARGE_Filter_Off_UTC']) & (d['dt'] <= row['LARGE_Filter_On_UTC'])
        filtered = d[mask].copy()
        d_out = pd.concat([d_out, filtered], ignore_index=True)
    print(len(d_out))
    d_out['input_flow_lpm'] = d_out['UpSt_Samp']

    for i, row in ls.iterrows():
        if row['LARGE_To'] == 'OFF':
            mask = (d_out['dt'] >= row['Switch_Start_UTC']) & (d_out['dt'] <= ls.loc[i+1,'Switch_Stop_UTC'])
            d_out.loc[mask,'input_flow_lpm'] = 0
        elif row['LARGE_To'] == 'DASH':
            mask = (d_out['dt'] >= row['Switch_Start_UTC']) & (d_out['dt'] <= row['Switch_Stop_UTC'])
            d_out.loc[mask,'input_flow_lpm'] = 0
            mask = (d_out['dt'] >= row['Switch_Stop_UTC']) & (d_out['dt'] <= ls.loc[i+1,'Switch_Stop_UTC'])
            d_out.loc[mask,'input_flow_lpm'] = d_out.loc[mask,'HM_Smp_Xs']
        elif row['LARGE_To'] == 'SEMS':
            mask = (d_out['dt'] >= row['Switch_Start_UTC']) & (d_out['dt'] <= row['Switch_Stop_UTC'])
            d_out.loc[mask,'input_flow_lpm'] = 0
    
    output = pd.DataFrame(data={'datetime_UTC':d_out['dt'], 'input_flow_lpm':d_out['input_flow_lpm']})

    return output, d_out

In [105]:
d['HM_Smp_Xs'].min()

-0.04

In [142]:
len(d)

214752

In [141]:
output, dout = retrieve_flows(d)

507466


In [18]:
ft = pd.read_csv('./meta/ARCSIX_takeoff_landing_times.txt')

In [65]:
ls = pd.read_csv('./meta/ARCSIX_DASH_SEMS_switch_times.txt', parse_dates=['Switch_Start_UTC', 'Switch_Stop_UTC'])

In [136]:
d = read_all_data()

In [139]:
len(d) - len(d.drop_duplicates())

0

In [17]:
d['UpSt_Samp'], d['dt']

(0        -0.00
 1        -0.00
 2        -0.00
 3        -0.00
 4        -0.00
           ... 
 214747    1.78
 214748    1.78
 214749    1.78
 214750    1.78
 214751    1.42
 Name: UpSt_Samp, Length: 214752, dtype: float64,
 0        2024-05-28 08:50:21
 1        2024-05-28 08:50:22
 2        2024-05-28 08:50:23
 3        2024-05-28 08:50:24
 4        2024-05-28 08:50:26
                  ...        
 214747   2024-06-10 16:04:59
 214748   2024-06-10 16:05:00
 214749   2024-06-10 16:05:01
 214750   2024-06-10 16:05:02
 214751   2024-06-10 16:05:03
 Name: dt, Length: 214752, dtype: datetime64[ns])

In [16]:
list(d.keys())

['YY/MM/DD_x',
 'HR:MN:SC_x',
 'samp_intrvl_x',
 'total_conc_x',
 'sample_flw_x',
 'sheath_flw_x',
 'sheath_temp_x',
 'samp_press_x',
 'lasr_brt_x',
 'lasr_cur_x',
 'pmt_base_rd_x',
 'pmt_base_pot_adj_x',
 'sheath_pwr_x',
 'exit_pwr_x',
 'opc_errs_x',
 'bin1_x',
 'bin2_x',
 'bin3_x',
 'bin4_x',
 'bin5_x',
 'bin6_x',
 'bin7_x',
 'bin8_x',
 'bin9_x',
 'bin10_x',
 'bin11_x',
 'bin12_x',
 'bin13_x',
 'bin14_x',
 'bin15_x',
 'bin16_x',
 'bin17_x',
 'bin18_x',
 'bin19_x',
 'bin20_x',
 'bin21_x',
 'bin22_x',
 'bin23_x',
 'bin24_x',
 'bin25_x',
 'bin26_x',
 'bin27_x',
 'bin28_x',
 'bin29_x',
 'bin30_x',
 'bin31_x',
 'bin32_x',
 'bin33_x',
 'bin34_x',
 'bin35_x',
 'bin36_x',
 'bin37_x',
 'bin38_x',
 'bin39_x',
 'bin40_x',
 'bin41_x',
 'bin42_x',
 'bin43_x',
 'bin44_x',
 'bin45_x',
 'bin46_x',
 'bin47_x',
 'bin48_x',
 'bin49_x',
 'bin50_x',
 'bin51_x',
 'bin52_x',
 'bin53_x',
 'bin54_x',
 'bin55_x',
 'bin56_x',
 'bin57_x',
 'bin58_x',
 'bin59_x',
 'bin60_x',
 'bin61_x',
 'bin62_x',
 'bin63_x',
 