In [1]:
import glob
import re
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
def simple_read(path):
    """Read one data file into a DataFrame with absolute timestamps.

    The first comma-separated field on the file's first line is read as the
    1-based line number of the column-header row; every line above it is
    skipped when parsing the table.

    The collection date is taken from the first ``20YYMMDD`` digit run found
    in ``path``. Every column whose name contains ``'Time'`` is interpreted as
    seconds after midnight of that date and converted to datetimes.

    :param path: path to the data file; must contain the date as 20YYMMDD
    :return: pandas DataFrame with spaces stripped from the column names and
             -9999 values replaced by NaN
    :raises ValueError: if no 20YYMMDD date can be found in ``path`` or the
                        first line does not start with an integer
    """
    with open(path) as f:
        # first field of the first line tells how many lines precede the data table
        first_line = f.readline()
    header_line = int(first_line.split(',')[0]) - 1
    data = pd.read_csv(path, sep=',', skiprows=header_line)

    # Locate the date with a pattern instead of scanning for the first '2' ... '0'
    # pair, which mis-fires on names such as 'HU20_...' or './data2/x10_...'.
    date_match = re.search(r'20\d{6}', path)
    if date_match is None:
        raise ValueError(f'no 20YYMMDD date found in path {path!r}')
    # datetime for midnight of the day the data was collected
    dt = datetime.strptime(date_match.group(), '%Y%m%d')

    for column in data.keys():
        if 'Time' in column:
            # converts seconds-after-midnight columns to datetime
            data[column] = dt + pd.to_timedelta(data[column], unit='seconds')
    data.columns = data.columns.str.replace(' ', '')
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    return data.replace(-9999, np.nan)  # Converts -9999 values to NaN


def add_leg(data, legs):
    '''
    Label every row of ``data`` with the flight-leg type it falls into.

    A row belongs to a leg when its ``Time_Mid`` lies between that leg's
    ``Time_Start`` and ``Time_Stop`` (both inclusive). The leg type is chosen
    by the last two digits of the leg's ``LegIndex``. Legs are applied in
    order, so a later leg overwrites an earlier one where they overlap. Rows
    outside every leg keep NaN.

    :param data: pandas data with a 'Time_Mid' column
    :param legs: pandas legs data with 'Time_Start', 'Time_Stop' and 'LegIndex' columns
    :return: copy of ``data`` with an added 'leg' column (the input is not modified)
    :raises KeyError: if a leg's two-digit code is not a known leg type
    '''
    data = data.copy()
    # Object dtype from the start: writing strings into a float NaN column is a
    # deprecated silent upcast in recent pandas.
    data['leg'] = pd.Series(np.nan, index=data.index, dtype=object)

    # leg codes corresponding to each leg type
    leg_key = {'00':'Takeoff/Landing', '01':'Transit', '02':'BCB', '03':'ACB', '04':'BCT', '05':'ACT', '06':'MinAlt', '07':'Ascent', '08':'Descent', '09':'Slant/Spiral', '10':'BBL', '11':'ABL', '12':'Remote Sensing (HSRL-2)', '13':'Other'}

    for start, stop, leg_index in zip(legs['Time_Start'], legs['Time_Stop'], legs['LegIndex']):
        # Take the last two digits numerically so float (102.0) and
        # single-digit (1) indices resolve the same way as 102 / 101.
        code = f'{int(leg_index) % 100:02d}'
        in_leg = (data['Time_Mid'] >= start) & (data['Time_Mid'] <= stop)
        data.loc[in_leg, 'leg'] = leg_key[code]

    return data
def _read_all(pattern):
    """Read every file matching ``pattern`` and stack them into one frame.

    :param pattern: glob pattern for the data files
    :return: (sorted list of matching paths, concatenated DataFrame)
    :raises FileNotFoundError: if nothing matches ``pattern``
    """
    paths = sorted(glob.glob(pattern))
    if not paths:
        # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate"
        raise FileNotFoundError(f'no data files match {pattern!r}')
    # reset_index() (without drop) keeps each file's row number as an 'index' column
    frame = pd.concat([simple_read(path) for path in paths]).reset_index()
    return paths, frame


# One master table per instrument, built from all matching files in ./data
ccn_paths, master_ccn = _read_all('./data/*CCN*')
ams_paths, master_ams = _read_all('./data/*AMS_*')
las_paths, master_las = _read_all('./data/*LAS*')
smps_paths, master_smps = _read_all('./data/*SMPS*')

# the summary files spell the mid-time column 'Time_mid'; align it with the other tables
sum_paths, master_sum = _read_all('./data/*SUMMARY*')
master_sum = master_sum.rename(columns={'Time_mid':'Time_Mid'})

# tag every SMPS row with its flight-leg type
leg_paths, master_leg = _read_all('./data/*Leg*')
master_smps = add_leg(master_smps, master_leg)

# only the columns used downstream are kept for the cloud probes
fcdp_paths, master_fcdp = _read_all('./data/*FCDP*')
master_fcdp = master_fcdp[['Time_Start', 'LWC_FCDP']]

v_paths, master_v = _read_all('./data/*2DS-V*')
master_2ds = master_v[['Time_Start', 'ED-liquid_2DS', 'Ice_Flag_2DS']]

In [3]:
def process_row(i):
    """Aggregate every instrument over the time window of SMPS row ``i``.

    The window is [Time_Start, Time_Stop] of ``master_smps.iloc[i]``, both
    ends inclusive. Reads the module-level master_* tables.

    :param i: positional row number in master_smps
    :return: tuple of pandas Series, in this order:
             CCN median, LAS median, AMS mean, summary median, FCDP max, 2DS max
             (numeric columns only)
    """
    smps = master_smps.iloc[i]
    lo, hi = smps['Time_Start'], smps['Time_Stop']

    def window(frame, column):
        # rows of `frame` whose `column` timestamp lies inside the SMPS window
        stamps = frame[column]
        return frame[(stamps >= lo) & (stamps <= hi)]

    # AMS samples are kept when they start and stop within the window widened by 5 s per side
    pad = timedelta(seconds=5)
    ams_hits = master_ams[(master_ams['Time_Start'] >= lo - pad) & (master_ams['Time_Stop'] <= hi + pad)]

    return (
        window(master_ccn, 'Time_mid').median(numeric_only=True),
        window(master_las, 'Time_Start').median(numeric_only=True),
        ams_hits.mean(numeric_only=True),
        window(master_sum, 'Time_Mid').median(numeric_only=True),
        window(master_fcdp, 'Time_Start').max(numeric_only=True),
        window(master_2ds, 'Time_Start').max(numeric_only=True),
    )

# Run process_row for every SMPS row on all available cores; tqdm reports dispatch progress.
results = Parallel(n_jobs=-1)(
    delayed(process_row)(i) for i in tqdm(range(0, len(master_smps)))
)

# Transpose the per-row tuples into one frame per instrument (one line per SMPS row).
# The *_mean names are historical: per process_row, CCN/LAS/summary hold medians,
# FCDP/2DS hold maxima, and only AMS holds means.
ccn_mean, las_mean, ams_mean, sum_mean, fcdp_mean, v_mean = (
    pd.DataFrame(list(rows)) for rows in zip(*results)
)

  0%|          | 0/34025 [00:00<?, ?it/s]

100%|██████████| 34025/34025 [06:58<00:00, 81.29it/s] 


In [4]:
# Show the AMS averages computed for each SMPS row. The 'index' column is a leftover of
# reset_index() on the concatenated AMS files, averaged along with the data columns.
ams_mean

Unnamed: 0,index,Org_Ave_IsoK_STP,SO4_Ave_IsoK_STP,NO3_Ave_IsoK_STP,NH4_Ave_IsoK_STP,Chl_Ave_IsoK_STP,mz42_Ave_IsoK_STP,mz43_Ave_IsoK_STP,mz44_Ave_IsoK_STP,mz55_Ave_IsoK_STP,mz57_Ave_IsoK_STP,mz58_Ave_IsoK_STP,mz60_Ave_IsoK_STP,mz79_Ave_IsoK_STP,mz91_Ave_IsoK_STP
0,2.0,0.902,1.186,0.706,0.815,0.011,0.042,0.101,0.181,0.077,0.034,-0.005,0.008,0.001,0.014
1,4.0,1.183,1.423,0.289,0.779,0.011,0.000,0.108,0.187,0.043,0.013,0.033,-0.004,0.014,0.002
2,6.0,0.779,1.582,0.317,0.761,-0.012,0.006,0.099,0.105,0.030,0.019,0.035,0.005,0.008,-0.006
3,8.0,0.223,0.982,0.168,0.572,-0.023,0.003,0.148,-0.054,0.068,-0.006,-0.011,-0.055,0.014,-0.001
4,10.0,0.494,1.181,0.285,0.640,0.041,0.063,0.085,-0.001,-0.034,0.028,-0.008,-0.026,0.004,0.005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34020,372.0,6.852,1.048,0.336,0.512,-0.019,0.262,0.900,0.827,0.241,0.105,0.076,0.028,0.062,0.056
34021,374.0,6.443,0.930,0.368,0.504,0.028,0.255,0.846,0.742,0.211,0.076,0.064,0.051,0.093,0.069
34022,376.0,6.588,1.014,0.395,0.572,0.061,0.255,0.901,0.800,0.238,0.064,0.098,0.016,0.064,0.068
34023,378.0,7.012,1.066,0.402,0.537,0.021,0.275,0.926,0.880,0.268,0.053,0.074,-0.008,0.040,0.045


In [5]:
# Columns taken from each table. The numbered bin names are generated:
# SMPS_Bin02..SMPS_Bin30 and LAS_Bin01..LAS_Bin26.
smps_columns = ['Time_Mid'] + [f'SMPS_Bin{n:02d}' for n in range(2, 31)] + ['leg']
las_columns = [f'LAS_Bin{n:02d}' for n in range(1, 27)]
ams_columns = [
    'Org_Ave_IsoK_STP', 'SO4_Ave_IsoK_STP', 'NO3_Ave_IsoK_STP', 'NH4_Ave_IsoK_STP',
    'Chl_Ave_IsoK_STP', 'mz42_Ave_IsoK_STP', 'mz43_Ave_IsoK_STP', 'mz44_Ave_IsoK_STP',
    'mz55_Ave_IsoK_STP', 'mz57_Ave_IsoK_STP', 'mz58_Ave_IsoK_STP', 'mz60_Ave_IsoK_STP',
    'mz79_Ave_IsoK_STP', 'mz91_Ave_IsoK_STP',
]
summary_columns = [
    'Latitude', 'Longitude', 'GPS_altitude', 'Pressure_Altitude', 'Pitch', 'Roll',
    'True_Heading', 'True_Air_Speed', 'Static_Air_Temp', 'IR_Surf_Temp',
    'Static_Pressure', 'Wind_Speed', 'Wind_Direction',
]

# Start from the SMPS rows and attach each aggregated table by row index, in this order.
merged = master_smps[smps_columns]
for frame, columns in (
    (ams_mean, ams_columns),
    (ccn_mean, ['CCN_SS', 'N_CCN_stdPT']),
    (las_mean, las_columns),
    (sum_mean, summary_columns),
    (fcdp_mean, ['LWC_FCDP']),
    (v_mean, ['ED-liquid_2DS', 'Ice_Flag_2DS']),
):
    merged = pd.merge(merged, frame[columns], left_index=True, right_index=True)

In [6]:
# Write the merged table to merged_smps.csv in the working directory, without the row index.
merged.to_csv('merged_smps.csv', index=False)

In [7]:
# List the column names of the merged table.
merged.keys()

Index(['Time_Mid', 'SMPS_Bin02', 'SMPS_Bin03', 'SMPS_Bin04', 'SMPS_Bin05',
       'SMPS_Bin06', 'SMPS_Bin07', 'SMPS_Bin08', 'SMPS_Bin09', 'SMPS_Bin10',
       'SMPS_Bin11', 'SMPS_Bin12', 'SMPS_Bin13', 'SMPS_Bin14', 'SMPS_Bin15',
       'SMPS_Bin16', 'SMPS_Bin17', 'SMPS_Bin18', 'SMPS_Bin19', 'SMPS_Bin20',
       'SMPS_Bin21', 'SMPS_Bin22', 'SMPS_Bin23', 'SMPS_Bin24', 'SMPS_Bin25',
       'SMPS_Bin26', 'SMPS_Bin27', 'SMPS_Bin28', 'SMPS_Bin29', 'SMPS_Bin30',
       'leg', 'Org_Ave_IsoK_STP', 'SO4_Ave_IsoK_STP', 'NO3_Ave_IsoK_STP',
       'NH4_Ave_IsoK_STP', 'Chl_Ave_IsoK_STP', 'mz42_Ave_IsoK_STP',
       'mz43_Ave_IsoK_STP', 'mz44_Ave_IsoK_STP', 'mz55_Ave_IsoK_STP',
       'mz57_Ave_IsoK_STP', 'mz58_Ave_IsoK_STP', 'mz60_Ave_IsoK_STP',
       'mz79_Ave_IsoK_STP', 'mz91_Ave_IsoK_STP', 'CCN_SS', 'N_CCN_stdPT',
       'LAS_Bin01', 'LAS_Bin02', 'LAS_Bin03', 'LAS_Bin04', 'LAS_Bin05',
       'LAS_Bin06', 'LAS_Bin07', 'LAS_Bin08', 'LAS_Bin09', 'LAS_Bin10',
       'LAS_Bin11', 'LAS_Bin12', 'LAS_Bi

In [8]:
# Show only the rows whose AMS organic value is finite (np.isfinite is False for NaN and +/-inf).
merged[np.isfinite(merged['Org_Ave_IsoK_STP'])]

Unnamed: 0,Time_Mid,SMPS_Bin02,SMPS_Bin03,SMPS_Bin04,SMPS_Bin05,SMPS_Bin06,SMPS_Bin07,SMPS_Bin08,SMPS_Bin09,SMPS_Bin10,...,True_Heading,True_Air_Speed,Static_Air_Temp,IR_Surf_Temp,Static_Pressure,Wind_Speed,Wind_Direction,LWC_FCDP,ED-liquid_2DS,Ice_Flag_2DS
0,2020-02-14 17:02:30.500,,,,,,,,,,...,69.215,101.110,0.045,8.875,945.950,4.805,344.880,,,
1,2020-02-14 17:03:30.500,,,,,,,,,,...,73.470,117.925,-5.520,6.490,863.165,7.245,273.645,,,
2,2020-02-14 17:04:30.500,,,,,,,,,,...,102.045,136.070,-6.965,6.240,846.465,6.940,258.870,0.0,0.0,0.0
3,2020-02-14 17:05:30.500,,,,,,,,,,...,101.655,128.285,-6.830,6.250,820.210,10.130,263.100,0.0,0.0,0.0
4,2020-02-14 17:06:30.500,,,,,,,,,,...,102.130,132.615,-7.115,7.015,820.255,8.810,258.840,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34020,2022-06-18 15:11:01.500,0.0,0.0,0.0,0.00,0.0,640.37,0.00,0.00,0.00,...,272.660,134.245,19.280,23.465,938.885,,,0.0,0.0,
34021,2022-06-18 15:12:01.500,0.0,0.0,0.0,3172.65,0.0,635.06,0.00,685.19,249.95,...,273.200,127.645,22.805,23.065,985.050,10.640,326.890,0.0,0.0,
34022,2022-06-18 15:13:01.500,0.0,0.0,0.0,0.00,0.0,0.00,434.85,653.39,0.00,...,271.870,112.610,23.095,23.570,989.230,10.525,331.515,0.0,0.0,
34023,2022-06-18 15:14:01.500,0.0,0.0,0.0,0.00,0.0,591.20,0.00,0.00,0.00,...,271.570,129.915,23.015,22.785,987.855,11.180,335.100,0.0,0.0,
