In [333]:
import pandas as pd
from matplotlib import pyplot as plt
import glob
import matplotlib.dates as mdates
import numpy as np
from datetime import timedelta
from tqdm import tqdm

Define Functions

In [334]:
def reader(path, first_var):
    '''
    Reads SEMS/DASH data, adds datetime columns

    :param path: path to data file
    :param first_var: the name of the first column label
    :return: pandas DataFrame
    '''
    # Open the file and read the lines
    skip=1
    with open(path, "r") as file:
        # Iterate over the lines
        for line in file:
            # Strip leading and trailing whitespace
            line = line.strip()
            # Check if the line contains column names
            if line.startswith(first_var):
                # Split the line by whitespace and append to the columns list
                columns = line[1:].strip().split("\t")
                break
            skip+=1
    # Read the data into a DataFrame, skipping the first 6 rows of comments
    d = pd.read_csv(path, sep='\t', skiprows=skip, names=columns)
    #Check for duplicated metadata, remove metadata rows based on string "OPC SN"
    if len(d)>0:
        if isinstance(d.iloc[0,0], str):
            dup_meta = [n for n, i in enumerate(d.iloc[:,0]) if 'OPC SN' in i]
            if len(dup_meta) > 0:
                for line in dup_meta:
                    #Deletes duplicate metadata rows from dataframe
                    d.drop(np.arange(line,line+68), inplace = True)
                    d = d.apply(pd.to_numeric, errors='ignore')
                    
    # Creates datetime columns
    if 'DOY.Frac' in d.keys():
        d['dt'] = pd.to_datetime('2024-1-1') + pd.to_timedelta(d['DOY.Frac'], unit='D') - pd.Timedelta(days=1)
    if 'StartTimeSt' in d.keys():
        d['st_dt'] = pd.to_datetime('2024-1-1') + pd.to_timedelta(d['StartTimeSt'], unit='D') - pd.Timedelta(days=1)
    if 'EndTimeSt' in d.keys():
        d['end_dt'] = pd.to_datetime('2024-1-1') + pd.to_timedelta(d['EndTimeSt'], unit='D') - pd.Timedelta(days=1)
    if 'YY/MM/DD' and 'HR:MN:SC' in d.keys():
        d['dt'] = pd.to_datetime(str(20) + d['YY/MM/DD'] + ' ' + d['HR:MN:SC'], format='%Y/%m/%d %H:%M:%S')
    return d

In [335]:
def glob_reader(file_key, first_var, subfolder = './data/'):
    '''
    Reads groups of data files and merges them into one

    :param file_key: shared key in filenames
    :param first_var: the name of the first column label
    :param subfolder: name of the subfolder containing the data
    :return: pandas DataFrame
    '''
    paths = sorted(glob.glob(subfolder+'*'+file_key+'*'))
    d = []
    for i in range(0, len(paths)):
        d.append(reader(paths[i], first_var))
    d = pd.concat(d).reset_index()
    return d

In [336]:
def add_bin_sum(d):
    d = d.copy()
    col_w_bin = [col for col in d.columns if 'bin' in col]
    d['bin_sum'] = d[col_w_bin].sum(numeric_only = True, axis=1)
    return d

In [337]:
def calc_dNdlogDp(data, bins):
    #Calculating dN/dlogDp for OPCs; requires dlogDp to be calculated for bins
    dNdlogDp = []
    for binN in bins.index[:-1]:
        dNdlogDp.append(data[f'bin{binN}']/bins.loc[binN, 'dlogDp'])
    dNdlogDp = pd.concat(dNdlogDp, axis = 1)
    dNdlogDp.columns = [f'{i}_norm' for i in dNdlogDp.columns]
        #Norm = /dlogDp
    return dNdlogDp

In [338]:
bins = pd.read_csv('./DASH_Bins_2023.csv').set_index('BinNum')

Read OPC Data

In [339]:
dopc = add_bin_sum(glob_reader('OPC_210', '#YY/MM/DD', subfolder = '../data/DASH_test-fight-240517/'))
hopc = add_bin_sum(glob_reader('OPC_212', '#YY/MM/DD', subfolder = '../data/DASH_test-fight-240517/'))
dash = glob_reader('DASH_FLOW', '#DOY.Frac', subfolder = '../data/DASH_test-fight-240517/')
sems = glob_reader('SEMS_DATA', '#DOY.Frac', subfolder = '../data/DASH_test-fight-240517/')

In [340]:
merged = pd.merge_asof(dash, sems, on='dt')

In [341]:
merged = pd.merge_asof(merged, dopc, on='dt').drop(columns=['index_x'])

In [342]:
merged = pd.merge_asof(merged, hopc, on='dt')

In [343]:
merged_diff = merged[['dt', 'UpSt_Dia', 'HO_RH']].diff()

In [344]:
merged[['dt.diff', 'UpSt_Dia.diff', 'HO_RH.diff']] = merged_diff

In [345]:
col_w_bin = [c for c in merged.columns if 'bin' in c and 'sum' not in c]

new_group = True
start_i = []
end_i = []
for i in range(0,len(merged)):
    row = merged.iloc[i]
    if new_group:
        start_i.append(i)
        new_group = False
    if abs(row['UpSt_Dia.diff']) > 0:
        end_i.append(i)
        new_group = True
    elif abs(row['HO_RH.diff']) > 5:
        end_i.append(i)
        new_group = True
    if i == len(merged)-1:
        if len(start_i) > len(end_i):
            end_i.append(i)

In [346]:
col = ['Time_Start', 'Time_End', 'Dp', 'RH']
comb_bins = []
dopc_labels = []
hopc_labels = []
for i in range(1,73):
    if i < 10:
        dopc_labels.append('D_Bin0'+str(i))
        hopc_labels.append('H_Bin0'+str(i))
    else:
        dopc_labels.append('D_Bin'+str(i))
        hopc_labels.append('H_Bin'+str(i))
    
    if i < 10:
        col.append('D_Bin0'+str(i))
        comb_bins.append('D_Bin0'+str(i))
    else:
        col.append('D_Bin'+str(i))
        comb_bins.append('D_Bin'+str(i))



for i in range(1,73):
    if i < 10:
        col.append('H_Bin0'+str(i))
        comb_bins.append('H_Bin0'+str(i))
    else:
        col.append('H_Bin'+str(i))
        comb_bins.append('H_Bin'+str(i))

In [347]:
output = pd.DataFrame(columns = col)

In [354]:
output = pd.DataFrame(columns = col)
out_acc = 0
for i in range(0,len(start_i)):
    subset = merged.iloc[start_i[i]:end_i[i]+1].reset_index()
    n_sum = 0
    dt_start = subset.loc[0,'dt']
    s = 0
    for j in range(0,len(subset)):
        n_sum += subset.loc[j, 'bin_sum_x']
        if subset.loc[j, 'dt.diff'] > timedelta(seconds=2):
            break
        if subset.loc[j, 'dt'] - dt_start > timedelta(minutes=5):
            break
        if n_sum > 100:
            output.loc[out_acc, 'Time_Start'] = subset.loc[s, 'dt']
            output.loc[out_acc, 'Time_End'] = subset.loc[j, 'dt']

            output.loc[out_acc, 'Dp'] = subset.loc[s:j+1, 'UpSt_Dia'].mean()
            output.loc[out_acc, 'RH'] = subset.loc[s:j+1, 'HO_RH'].mean()

            output.loc[out_acc, 'Dp_std'] = subset.loc[s:j+1, 'UpSt_Dia'].std()
            output.loc[out_acc, 'RH_std'] = subset.loc[s:j+1, 'HO_RH'].std()

            output.loc[out_acc, comb_bins] = list(subset.loc[s:j+1, col_w_bin].sum())
            
            s = j
            dt_start = subset.loc[s,'dt']
            n_sum = 0
            out_acc += 1

output = output[output['RH_std']<5]
output = output[output['Dp_std']==0]

In [360]:
(output['Time_End'] - output['Time_Start']).mean()

Timedelta('0 days 00:00:25.884985263')