In [26]:
import glob
import re
from datetime import datetime

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

In [17]:
def simple_read(path, na_flags=(-8888,)):
    '''
    Reads .ict files to a Pandas DataFrame

    The first comma-separated field of the file's first line is read as the
    number of header lines; the table (column names included) starts on the
    last of those lines. Every column whose name contains 'Time' is treated as
    seconds after midnight and converted to a timestamp on the collection date,
    which is taken from a YYYYMMDD token in the file name (or, if the file name
    has none, anywhere in the path).

    :param path: path to the .ict data; must contain the collection date as YYYYMMDD
    :param na_flags: numeric flag values that are replaced with NaN in the result
    :return: Pandas DataFrame with .ict data, spaces removed from column names
    :raises ValueError: if no YYYYMMDD date can be found in the path
    '''
    path_str = str(path)

    with open(path_str) as f:
        # the first field of the first line tells how many lines to skip to get to the table
        first_line = f.readline()
    # Split before trimming: slicing a fixed number of trailing characters
    # would eat a digit when the line is just "<n>\n".
    header_line = int(first_line.split(',')[0].strip()) - 1
    data = pd.read_csv(path_str, sep=',', skiprows=header_line)

    # Locate the date by pattern rather than by scanning for the first '2'...'0'
    # pair, which is thrown off by any earlier digits in the path. The file
    # name is tried first so that digits in directory names cannot win.
    date_pattern = re.compile(
        r'(?<!\d)((?:19|20)\d{2})(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])'
    )
    file_name = path_str.replace('\\', '/').rsplit('/', 1)[-1]
    found = date_pattern.search(file_name) or date_pattern.search(path_str)
    if found is None:
        raise ValueError(f'no YYYYMMDD date found in path: {path_str!r}')
    year, month, day = (int(part) for part in found.groups())

    # creates datetime object with the date the data was collected
    dt = datetime(year, month, day)

    for column in data.columns:
        if 'Time' in column:
            # converts seconds after midnight columns to datetime
            data[column] = dt + pd.to_timedelta(data[column], unit='s')
    data.columns = data.columns.str.replace(' ', '')

    # Converts flag values to NaN. Only -8888 is replaced by default.
    # NOTE(review): the previous comment here said -9999 was converted, but the
    # code has only ever replaced -8888 -- confirm which flag(s) these files
    # use and pass e.g. na_flags=(-9999, -8888) if both should become NaN.
    for flag in na_flags:
        data = data.replace(flag, np.nan)
    return data

In [18]:
# Read every path matching ../data/*PILS* (in sorted order) and stack the
# per-file frames into one table. reset_index() without drop=True keeps each
# file's original row index as an 'index' column.
pils_paths = sorted(glob.glob('../data/*PILS*'))
master_pils = pd.concat(
    [simple_read(pils_path) for pils_path in pils_paths]
).reset_index()

In [20]:
# Load the merged table; Time_Mid is parsed as datetimes so it can be
# compared against the PILS Time_Start / Time_Stop values.
m = pd.read_csv(
    '../tables/merged_final_non_org_fixed.csv',
    parse_dates=['Time_Mid'],
)

In [28]:
def process_row(i):
    '''
    Median of the merged table over the sampling window of one PILS row.

    Reads the module-level frames ``master_pils`` and ``m``.

    :param i: positional row number in master_pils
    :return: Series holding the median of each numeric column of m, taken over
             the rows whose Time_Mid lies in [Time_Start, Time_Stop] of that
             master_pils row (both ends inclusive)
    '''
    window = master_pils.iloc[i]
    in_window = m['Time_Mid'].between(window['Time_Start'], window['Time_Stop'])
    return m[in_window].median(numeric_only=True)

# One process_row task per master_pils row, spread over all available cores.
row_numbers = range(len(master_pils))
results = Parallel(n_jobs=-1)(
    delayed(process_row)(i) for i in tqdm(row_numbers)
)

100%|██████████| 4665/4665 [00:01<00:00, 2433.19it/s]


In [34]:
# One row per entry of results (i.e. per master_pils row).
# NOTE(review): this rebinds m, which process_row reads as a global, so the
# process_row cell cannot be re-run after this one without reloading m.
m = pd.DataFrame([row_medians for row_medians in results])

In [46]:
# Attach the PILS timestamp and ion columns to the per-row medians; rows are
# paired by index label.
pils_columns = [
    'Time_Mid',
    'Sodium', 'Potassium', 'Magnesium', 'Calcium',
    'Chloride', 'Nitrate', 'Sulfate', 'Oxalate', 'Ammonium',
]
d = pd.merge(m, master_pils[pils_columns], left_index=True, right_index=True)

In [47]:
# Dates listed here are dropped from every subset below.
transit_flights = [
    pd.Timestamp(day).date()
    for day in ('2022-03-22', '2022-05-18', '2022-05-21', '2022-05-31', '2022-06-18')
]

sample_month = d['Time_Mid'].dt.month
not_transit = ~d['Time_Mid'].dt.date.isin(transit_flights)
# June 2022 gets its own subset (df_bermuda) and is carved out of df_spring.
june_2022 = sample_month.isin([6]) & d['Time_Mid'].dt.year.isin([2022])

df_winter = d[sample_month.isin([11, 12, 1, 2, 3, 4]) & not_transit]
df_spring = d[sample_month.isin([5, 6]) & ~june_2022 & not_transit]
df_summer = d[sample_month.isin([8, 9]) & not_transit]
df_bermuda = d[june_2022 & not_transit]

In [48]:
# The four subsets, in a fixed order: winter, spring, summer, bermuda.
d_seg = [
    df_winter,
    df_spring,
    df_summer,
    df_bermuda,
]

In [51]:
# Display the ams_tot column of the winter subset.
df_winter.loc[:, 'ams_tot']

0         NaN
1         NaN
2         NaN
3       2.641
4       2.685
        ...  
3585      NaN
3586      NaN
3587      NaN
3588      NaN
3589      NaN
Name: ams_tot, Length: 2233, dtype: float64