In [1]:
### this is a comprehensive file on how my final train - test set was created -- Blake Hartung 11/16/2022
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt

In [6]:
def mix_data(row, norm_depths):
    """Blend neighbouring entries of ``row`` at fractional positions.

    For every value ``d`` in ``norm_depths`` the integer part ``floor(d)`` picks
    the entry ``row.iloc[floor(d)]``, which is weighted by ``1 - frac(d)``; the
    entry *before* it, ``row.iloc[floor(d) - 1]``, receives the remaining weight.

    Parameters
    ----------
    row : pandas.Series
        Values addressed positionally through ``.iloc``.
    norm_depths : iterable of float
        Fractional positions into ``row``.

    Returns
    -------
    list
        One blended value per entry of ``norm_depths``, in the same order.

    NOTE(review): the partner element is the one before ``floor(d)``, and when
    ``floor(d) == 0`` the index ``-1`` addresses the last element of ``row``.
    Confirm this is the intended interpolation.
    """
    def _blend(position):
        base = np.floor(position).astype('int64')
        weight_at_base = 1 - (position - base)
        weight_before = 1 - weight_at_base
        return (row.iloc[base - 1] * weight_before) + (row.iloc[base] * weight_at_base)

    return [_blend(position) for position in norm_depths]

In [3]:
# Locations of the two input CSVs, relative to the notebook's working directory.
PROFILE_DATA_PATH = "../data/profileData.csv"
SAT_DATA_PATH = "../data/satData.csv"

# Load both files into DataFrames; the names match build_binned_data's first two parameters.
profile_df = pd.read_csv(PROFILE_DATA_PATH)
sat_df = pd.read_csv(SAT_DATA_PATH)

def build_binned_data(profile_df, sat_df, save_path=None, return_df=True):
    """Bin profile measurements by pressure and join them with satellite features.

    Rows with ``PRES >= 1001`` are discarded, the remaining pressures are cut
    into 100 equal-width bins, and ``CHLA`` / ``BBP700`` are averaged per
    (float, cycleNumber, depth_bin). The averages are inner-joined with
    ``sat_df`` on (float, cycleNumber), six ``LT_SAT_*`` columns are dropped,
    and rows containing any missing value are removed. A day-of-year column
    and its radian equivalent are added, and latitude/longitude are rounded
    to three decimals.

    Parameters
    ----------
    profile_df : pandas.DataFrame
        Needs the columns ``float``, ``cycleNumber``, ``PRES``, ``CHLA`` and
        ``BBP700``. The frame passed in is not modified.
    sat_df : pandas.DataFrame
        Needs ``float``, ``cycleNumber``, ``date`` (parsed with the format
        ``%Y-%m-%d %H:%M:%S``), ``latitude``, ``longitude`` and the six
        ``LT_SAT_*`` columns listed in ``drop_cols``.
    save_path : str or None, default None
        When given, the result is written there as CSV without the index.
    return_df : bool, default True
        When true the resulting frame is returned, otherwise ``None``.

    Returns
    -------
    pandas.DataFrame or None
    """
    drop_cols = ['LT_SAT_SST_SD', 'LT_SAT_SST_MED', 'LT_SAT_CHL_SD', 'LT_SAT_CHL_MED', 'LT_SAT_BBP_SD', 'LT_SAT_BBP_MED']
    n_bins = 100

    # cut out outlying depth values; .copy() gives us our own frame, so adding
    # the depth_bin column below cannot hit a view of the caller's data
    # (this is what triggers SettingWithCopyWarning otherwise)
    profile_df = profile_df[profile_df.PRES < 1001].copy()
    # bin the data using 100 bins and group it
    profile_df['depth_bin'] = pd.cut(profile_df.PRES, bins=n_bins, labels=list(range(n_bins)))

    # observed=True only builds the groups that actually occur. Without it the
    # categorical grouper expands to every float x cycle x bin combination,
    # all of which the dropna() below throws away again.
    depth_profiles = (
        profile_df[['float', 'cycleNumber', 'depth_bin', 'CHLA', 'BBP700']]
        .groupby(['float', 'cycleNumber', 'depth_bin'], observed=True)
        .mean()
        .reset_index()
        .dropna()
    )

    df_depth = depth_profiles.merge(sat_df, on=['float', 'cycleNumber']).drop(drop_cols, axis=1).dropna()
    df_depth['date'] = pd.to_datetime(df_depth.date, format='%Y-%m-%d %H:%M:%S')

    # make sure we grab all bins; copy so the column assignments below are
    # made on an independent frame
    df = df_depth[df_depth.depth_bin.isin(list(range(n_bins)))].copy()
    df['depth_bin'] = df['depth_bin'].astype('int64').astype('category')

    # turn date into radians: pi / 182.625 == 2 * pi / 365.25
    df['date_doy'] = df['date'].dt.dayofyear
    df['date_doy_rad'] = df['date_doy'] * (np.pi / 182.625)

    # round lat and lon to match new data
    df['latitude'] = df['latitude'].round(3)
    df['longitude'] = df['longitude'].round(3)

    if save_path is not None:
        df.to_csv(save_path, index=False)
    if return_df:
        return df

In [None]:
# Load the three additional feature tables; the columns named in each drop()
# call are discarded right after reading.
df_pic = pd.read_csv('../data/sat_pic_full_final.csv').drop(['sat_pic_day', 'sat_pic_8d', 'short_date'], axis=1)
df_par = pd.read_csv('../data/par_full_final.csv').drop(['par_day', 'par_8d', 'short_date'], axis=1)
df_aph = pd.read_csv('../data/aph_443_full_final.csv').drop(['aph_443_day', 'aph_443_8d', 'short_date'], axis=1)


def build_pivoted_imputed_data(df_binned, df_pic, df_par, df_aph, save_path=None, return_df=True):
    """Pivot binned profiles to one row per (float, cycleNumber) and impute gaps.

    ``df_pic``, ``df_par`` and ``df_aph`` are inner-joined on
    (latitude, longitude) and then joined onto ``df_binned`` on the same keys;
    rows with missing values and exact duplicates are dropped. The rows are
    then walked in order: every run of consecutive rows that share
    (float, cycleNumber) becomes one output row holding the metadata of the
    run's first row plus one ``depth_bin_<bin>`` column per row with its
    ``CHLA`` value. Missing depth-bin values are filled by ``IterativeImputer``
    with default settings.

    Parameters
    ----------
    df_binned : pandas.DataFrame
        Binned profile data. Together with the feature tables the merged frame
        must provide ``float``, ``cycleNumber``, ``latitude``, ``longitude``,
        ``date_doy_rad``, ``MO_SAT_CHL``, ``MO_SAT_SST``, ``par_month``,
        ``sat_pic_month``, ``aph_443_month``, ``depth_bin`` and ``CHLA``.
    df_pic, df_par, df_aph : pandas.DataFrame
        Feature tables keyed by ``latitude`` / ``longitude``.
    save_path : str or None, default None
        When given, the result is written there as CSV without the index.
    return_df : bool, default True
        When true the resulting frame is returned, otherwise ``None``.

    Returns
    -------
    pandas.DataFrame or None
        Ten metadata columns followed by the ``depth_bin_*`` columns in
        ascending bin order. Empty when the merge yields no rows.
    """
    new_features_df = (
        df_pic
        .merge(df_par, on=['latitude', 'longitude'])
        .merge(df_aph, on=['latitude', 'longitude'])
    )

    # Merge from the df_binned argument. Referring to `df` here would resolve
    # to the local assigned further down and raise UnboundLocalError.
    piv_df = df_binned.merge(new_features_df, on=['latitude', 'longitude']).dropna().drop_duplicates()

    pivoted_ls = []
    pivot_row = None
    curr_key = None
    for _, r in piv_df.iterrows():
        key = (r['float'], r['cycleNumber'])
        # A new profile starts whenever EITHER the float or the cycle changes;
        # requiring both to change would glue a new float onto the previous
        # profile if its cycle number happens to be the same.
        if key != curr_key:
            if pivot_row is not None:
                pivoted_ls.append(pivot_row)
            curr_key = key
            pivot_row = {
                'float': r['float'],
                'cycleNumber': r['cycleNumber'],
                'latitude': r.latitude,
                'longitude': r.longitude,
                'date_doy_rad': r.date_doy_rad,
                'sat_chl_month': r.MO_SAT_CHL,
                'sat_sst_month': r.MO_SAT_SST,
                'sat_par_month': r.par_month,
                'sat_pic_month': r.sat_pic_month,
                'sat_aph_443_month': r.aph_443_month
            }
        pivot_row['depth_bin_' + str(r.depth_bin)] = r.CHLA
    # flush the profile that was still being collected when the loop ended
    if pivot_row is not None:
        pivoted_ls.append(pivot_row)

    df = pd.DataFrame(pivoted_ls)

    prefix = 'depth_bin_'
    meta_cols = [c for c in df.columns if not c.startswith(prefix)]
    # Dict insertion order puts depth columns in order of first appearance,
    # which is not numeric order when an early profile lacks a bin; sort them
    # so positional slicing by bin number is meaningful.
    depth_cols = sorted(
        (c for c in df.columns if c.startswith(prefix)),
        key=lambda c: float(c[len(prefix):]),
    )
    if depth_cols:
        # reindex returns a fresh frame, so the assignment below is safe
        df = df.reindex(columns=meta_cols + depth_cols)
        df[depth_cols] = IterativeImputer().fit_transform(df[depth_cols])

    if save_path is not None:
        df.to_csv(save_path, index=False)
    if return_df:
        return df

In [9]:
# Read the four CSV inputs that build_normalized_data takes as arguments.
df_pivoted = pd.read_csv('../data/pivoted_imp_data.csv')
# low_memory=False makes pandas read the file in one pass instead of in chunks
df_clusters = pd.read_csv('../data/cluster_classification_assignment.csv', low_memory=False)
df_train_xgb = pd.read_csv('../data/xgb_train_preds.csv')
df_test_xgb = pd.read_csv('../data/xgb_test_preds.csv')

def build_normalized_data(df_pivoted, df_clusters, df_train_xgb, df_test_xgb, n_pts=10, save_path=None, return_df=True):
    """Attach train/test tags, clusters and depth-normalized CHLA values.

    The maximum ``PRES`` per (float, cycle) across the train and test frames
    becomes ``max_depth``; ``max_bin = floor(max_depth / 10) + 1``. For every
    profile the first ``max_bin + 1`` ``depth_bin*`` columns (in the column
    order of ``df_pivoted``) are placed on an evenly spaced grid over [0, 1],
    interpolated quadratically and sampled at ``n_pts`` evenly spaced points,
    giving the columns ``norm_0 .. norm_{n_pts-1}``. Clusters are factorized
    into ``cluster_val``. The number of rows lost in the merges is printed.

    Parameters
    ----------
    df_pivoted : pandas.DataFrame
        One row per (float, cycleNumber) with ``depth_bin*`` columns; these
        are assumed to already be ordered by increasing bin.
    df_clusters : pandas.DataFrame
        Needs ``float``, ``cycleNumber`` and ``cluster``.
    df_train_xgb, df_test_xgb : pandas.DataFrame
        Need ``float``, ``cycle`` and ``PRES``. Neither frame is modified.
    n_pts : int, default 10
        Number of normalized sample points per profile.
    save_path : str or None, default None
        When given, the result is written there as CSV without the index.
    return_df : bool, default True
        When true the resulting frame is returned, otherwise ``None``.

    Returns
    -------
    pandas.DataFrame or None

    Notes
    -----
    Quadratic interpolation needs at least three points per profile; scipy
    raises for shorter profiles.
    """
    # add train test tag for splitting later; assign() works on copies so the
    # caller's frames do not silently grow a 'train' column
    tagged = pd.concat([df_train_xgb.assign(train=1), df_test_xgb.assign(train=0)])

    # get information on max depth and train test split using Josie's data
    df_norm_depth = (
        tagged
        .groupby(['float', 'cycle'])
        .max()
        .reset_index()
        .rename(columns={'cycle': 'cycleNumber', 'PRES': 'max_depth'})
        [['float', 'cycleNumber', 'max_depth', 'train']]
    )
    # merge with pivoted data I created
    df = df_pivoted.merge(df_norm_depth, on=['float', 'cycleNumber'])
    # add clusters from Gabby's data
    df = df.merge(df_clusters[['float', 'cycleNumber', 'cluster']],
                  on=['float', 'cycleNumber']).drop_duplicates()
    print(f'Rows lost on specific train test split merger w max normalized depth\nand merger with cluster data: {df_pivoted.shape[0] - df.shape[0]}')

    # make an integer value representing the max bin to use in normalized data
    df['max_bin'] = (np.floor(df.max_depth / 10) + 1).astype('int64')

    # Pull the depth columns and max_bin out as arrays once. Slicing an
    # iterrows() row with r.max_bin only works while some non-numeric column
    # keeps the row Series from being upcast to float.
    depth_cols = df.columns[df.columns.str.contains('depth_bin')]
    depth_values = df[depth_cols].to_numpy(dtype=float)
    max_bins = df['max_bin'].to_numpy()

    # find the CHLA values at normalized depths using interpolation
    standard_depths = np.linspace(0, 1, n_pts)
    norm_rows = []
    for profile, max_bin in zip(depth_values, max_bins):
        depth_chla = profile[:int(max_bin) + 1]
        norm_depths = np.linspace(0, 1, len(depth_chla))
        inter_fun = interp1d(norm_depths, depth_chla, kind='quadratic')
        norm_rows.append(inter_fun(standard_depths))
    # reshape keeps the (rows, n_pts) layout even when there are no rows
    norm_matrix = np.array(norm_rows, dtype=float).reshape(len(df), n_pts)

    # add the depths to the train test set
    for j in range(n_pts):
        df['norm_' + str(j)] = norm_matrix[:, j]

    # factorize clusters
    cluster_nums, _ = pd.factorize(df.cluster)
    df['cluster_val'] = cluster_nums

    if save_path is not None:
        df.to_csv(save_path, index=False)
    if return_df:
        return df
# Build the 10-point normalized set and write it to CSV; return_df=False means the call returns None.
build_normalized_data(df_pivoted, df_clusters, df_train_xgb, df_test_xgb, n_pts=10, save_path='../data/final_norm_set_10.csv', return_df=False)

Rows lost on specific train test split merger w max normalized depth
and merger with cluster data: 2374


In [None]:
# Load the table passed to build_normalized_cluster_centers, which reads its
# cluster, PRES and CHLA columns.
df_known_dist = pd.read_csv('../data/cluster_chla_distribution.csv')

def build_normalized_cluster_centers(df_known_dist, plot_clusters=False, return_centers=True):
    """Compute one depth-normalized mean CHLA curve per cluster.

    ``PRES`` is cut into 25 equal-width bins and ``CHLA`` is averaged per
    (cluster, depth_bin). For each cluster, in sorted label order, the first
    ``norm_cluster_depths[cluster]`` bin means are spread evenly over [0, 1],
    interpolated piecewise-linearly and sampled at 25 evenly spaced points.

    Parameters
    ----------
    df_known_dist : pandas.DataFrame
        Needs ``cluster``, ``PRES`` and ``CHLA``. The frame is not modified.
        Every cluster label must be a key of ``norm_cluster_depths`` below,
        otherwise a ``KeyError`` is raised.
    plot_clusters : bool, default False
        When true each curve is drawn on its own panel of a 2 x 3 grid.
    return_centers : bool, default True
        When true the curves are returned, otherwise ``None``.

    Returns
    -------
    list of numpy.ndarray or None
        One length-25 array per cluster, ordered by sorted cluster label.
    """
    n_bins = 25
    # assign() builds a new frame, so the caller's df_known_dist does not
    # pick up a depth_bin column as a side effect
    binned_input = df_known_dist.assign(
        depth_bin=pd.cut(df_known_dist.PRES, bins=n_bins, labels=list(range(n_bins)))
    )
    # observed=True skips cluster/bin combinations that never occur; they
    # would only produce NaN means that dropna() removes again
    df_known_dist_binned = (
        binned_input[['cluster', 'depth_bin', 'CHLA']]
        .groupby(['cluster', 'depth_bin'], observed=True)
        .mean()
        .reset_index()
        .dropna()
    )
    xnew = np.linspace(0, 1, n_bins)
    raw_dists = list()
    # number of leading depth bins kept for each cluster before normalizing
    norm_cluster_depths = {
        'AR': 14,
        'EQ': 18,
        'HCB': 23,
        'LCB': 23,
        'PDCM': 23,
        'SDCM': 20
    }
    if plot_clusters:
        _, ax = plt.subplots(2, 3, figsize=(20, 10), sharey=True)
    for i, c in enumerate(np.unique(df_known_dist_binned.cluster)):
        cluster_rows = df_known_dist_binned[df_known_dist_binned.cluster == c]
        ydata = cluster_rows['CHLA'].to_numpy()[:norm_cluster_depths[c]]
        xdata = np.linspace(0, 1, len(ydata))
        fc = interp1d(xdata, ydata, kind='slinear')
        ynew = fc(xnew)
        raw_dists.append(ynew)
        if plot_clusters:
            panel = ax[i // 3, i % 3]
            panel.plot(xnew, ynew, label=c)
            panel.legend()
    if return_centers:
        return raw_dists