# CSN Modelling: Part 1

## Setup

In [170]:
# data wrangling
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.model_selection import train_test_split

# models
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import statsmodels.formula.api as smf
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold

# evaluation
from sklearn.metrics import mean_squared_error

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
%matplotlib inline

import os
sns.set_context('notebook')
sns.set_style('white')

In [112]:
def plot_fitted(true_value, predicted_value, title, save_path):
    plt.figure(figsize=(5,5))
    plt.scatter(true_value, predicted_value, s=5, c="crimson")
    plt.yscale('log')
    plt.xscale('log')

    p1 = max(max(predicted_value), max(true_value))
    p2 = min(min(predicted_value), min(true_value))
    plt.plot([p1, p2], [p1, p2], 'b-')
    plt.xlabel('True Values', fontsize=11)
    plt.ylabel('Predictions', fontsize=11)
    plt.axis('equal')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.show()

def get_perf(mod, preds, X_test, y_test, method):
    rmse = round(np.sqrt(mean_squared_error(y_test, preds)), 2)
    r2 = round(mod.score(X_test, y_test), 2)
    return rmse, r2

def plot_imp(mod, feature_list, title, save_path):
    # Specify feature names
    mod.get_booster().feature_names = feature_list
    # Plot in a graph with top 11 features
    ax = xgb.plot_importance(mod.get_booster(), max_num_features=10)
    plt.tight_layout()
    plt.title(title)
    ax.figure.savefig(save_path)

In [75]:
csn_misr = pd.read_csv('../data/merged/WithCL_MISR_CSN_2000_2021.csv')

In [150]:
df = csn_misr.dropna(subset = ['AOD'])
df.reset_index(drop=True, inplace=True)

In [151]:
percent_dropped = round((1 - (df.shape[0]/csn_misr.shape[0]))*100, 2)
print("Original data dimension:",csn_misr.shape[0], "observations,", csn_misr.shape[1], "features.")
print("New data dimension:",df.shape[0], "observations,", df.shape[1], "features.")
print(f"Percent dropped: {percent_dropped}")

Original data dimension: 5289 observations, 312 features.
New data dimension: 1709 observations, 312 features.
Percent dropped: 67.69


In [None]:
list(df.columns)

## Feature Engineering

In [None]:
# Create year, month variables
df['Year'] = pd.DatetimeIndex(df.loc[:, 'Date']).year.astype('category')
df['Month'] = pd.DatetimeIndex(df.loc[:, 'Date']).month.astype('category')

# Cast other categorical variables to the right type

for col in ['POC', 'Site.Code', 'light', 'med', 'heavy', 'ecosys']:
    df[col] = df.loc[:, col].astype('category')

# Replace NAs in smoke vars with 0
df['light'].fillna(0, inplace=True)
df['med'].fillna(0, inplace=True)
df['heavy'].fillna(0, inplace=True)
df['ecosys'].cat.add_categories(-1).fillna(-1)

# Replace NAs in cluster info vars with 0
df['frp_avg'].fillna(-999, inplace=True)
df['frp_vars'].fillna(-999, inplace=True)
df['num_pts'].fillna(0, inplace=True)

# First type: create quantiles for fire distance; placeholder "x+1"th quantile for NAs
df['fire_dist_25'] = pd.qcut(df['fire_dist'], q=[0, .25, .5, .75, 1.], labels=[1,2,3,4])
df['fire_dist_25'] = df['fire_dist_25'].cat.add_categories(5)
df['fire_dist_25'].fillna(5, inplace=True)

In [None]:
for el in ['Al', 'Si', 'Ca', 'Ti', 'Fe']:
    df[el].fillna(0, inplace=True)
df['dust'] = 2.2*df['Al'] + 2.49*df['Si'] + 1.63*df['Ca'] + 1.94*df['Ti'] + 2.42*df['Fe']

In [165]:
df['dust'].isnull().sum()

0

## Modelling for Species

In [162]:
ADD_FEATS = {'none': [], 'aod_only':['AOD'], 'raw_aod_etc':['AOD', 'AOD_absorption', 'AOD_nonspherical', 'small_mode_AOD', \
    'medium_mode_AOD', 'large_mode_AOD'], 'aod_mix':['aod_mix_01', 'aod_mix_02', 'aod_mix_03', 'aod_mix_04', \
    'aod_mix_05', 'aod_mix_06', 'aod_mix_07', 'aod_mix_08', 'aod_mix_09', 'aod_mix_10', 'aod_mix_11', 'aod_mix_12', \
    'aod_mix_13', 'aod_mix_14', 'aod_mix_15', 'aod_mix_16', 'aod_mix_17', 'aod_mix_18', 'aod_mix_19', 'aod_mix_20', \
    'aod_mix_21', 'aod_mix_22', 'aod_mix_23', 'aod_mix_24', 'aod_mix_25', 'aod_mix_26', 'aod_mix_27', 'aod_mix_28', \
    'aod_mix_29', 'aod_mix_30', 'aod_mix_31', 'aod_mix_32', 'aod_mix_33', 'aod_mix_34', 'aod_mix_35', 'aod_mix_36', \
    'aod_mix_37', 'aod_mix_38', 'aod_mix_39', 'aod_mix_40', 'aod_mix_41', 'aod_mix_42', 'aod_mix_43', 'aod_mix_44', \
    'aod_mix_45', 'aod_mix_46', 'aod_mix_47', 'aod_mix_48', 'aod_mix_49', 'aod_mix_50', 'aod_mix_51', 'aod_mix_52', \
    'aod_mix_53', 'aod_mix_54', 'aod_mix_55', 'aod_mix_56', 'aod_mix_57', 'aod_mix_58', 'aod_mix_59', 'aod_mix_60', \
    'aod_mix_61', 'aod_mix_62', 'aod_mix_63', 'aod_mix_64', 'aod_mix_65', 'aod_mix_66', 'aod_mix_67', 'aod_mix_68', \
    'aod_mix_69', 'aod_mix_70', 'aod_mix_71', 'aod_mix_72', 'aod_mix_73', 'aod_mix_74']}
MODEL_FEATS = {}
EXP_NAME = {'none': 'No AOD', 'aod_only': 'AOD Only', 'raw_aod_etc': 'Raw AOD & etc.', 'aod_mix': 'AOD Mix'}

In [167]:
def run_exp(df, y, feature_list, test_size, exp_type, outf, dir):
    one_hot = pd.get_dummies(df[feature_list], columns = ['POC', 'fire_dist_25'])
    X_one_hot = np.array(one_hot)
    MODEL_FEATS[exp_type] = list(one_hot.columns)
    X_train, X_test, y_train, y_test = train_test_split(X_one_hot, y, test_size = 0.3, random_state = 88)
    xgb_mod = xgb.XGBRegressor(objective="reg:squarederror", subsample=1.0, min_child_weight=1, max_depth=10, gamma=1.5, colsample_bytree=1.0, random_state=88)
    xgb_mod.fit(X_train, y_train)
    preds_xgb = xgb_mod.predict(X_test)
    rmse, r2 = get_perf(xgb_mod, preds_xgb, X_test, y_test, exp_type)

    outf.write(f'---- Experiment type: {EXP_NAME[exp_type]} ----\n')
    outf.write(f'Training Features Shape: {X_train.shape}\n')
    outf.write(f'Training Labels Shape: {y_train.shape}\n')
    outf.write(f'Testing Features Shape: {X_test.shape}\n')
    outf.write(f'Testing Labels Shape: {y_test.shape}\n')
    outf.write(f'-- Performance --\n')
    outf.write(f'RMSE: {rmse}\n')
    outf.write(f'R2: {r2}\n')
    ft_img_name = "ft-imp_" + exp_type + ".png"
    fitted_name = "pred-true_" + exp_type + ".png"
    plot_imp(xgb_mod, MODEL_FEATS[exp_type], "Feature importance: " + EXP_NAME[exp_type], os.path.join(dir, ft_img_name))
    plot_fitted(y_test, preds_xgb, "Pred vs True: " + EXP_NAME[exp_type], os.path.join(dir, fitted_name))


In [168]:
subset_features = ['Year', 'Month', 'POC', 'Site.Latitude', 'Site.Longitude', \
    'elevation', 'fire_dist_25', 'light', 'med', 'heavy','frp_avg', 'frp_vars', \
    'num_pts', 'ecosys']

In [None]:
with open(f"model_results.txt", "w") as outf:
    for sp in ['sulfate', 'nitrate', 'dust', 'EC_unadjusted', 'OC']:
        df[sp].fillna(0, inplace=True)
        labels = np.array(df[sp])
        outf.write(f'****** Species: {sp} ******\n')
        img_dir_path = os.path.join("../img/", sp)
        if not os.path.exists(img_dir_path):
            os.mkdir(img_dir_path)
        for exp_type in ADD_FEATS.keys():
            ft_lst = subset_features + ADD_FEATS[exp_type]
            run_exp(df, labels, ft_lst, 0.3, exp_type, outf, img_dir_path)
        outf.write(f'================================================\n')
outf.close()
        

### Dust

In [171]:
def run_kfold(df, y, feature_list, test_size, exp_type, outf, dir):
    one_hot = pd.get_dummies(df[feature_list], columns = ['POC', 'fire_dist_25'])
    X_one_hot = np.array(one_hot)
    MODEL_FEATS[exp_type] = list(one_hot.columns)
    kf = KFold(n_splits=5)
    rmse_lst = []
    r2_lst = []
    for i, (train_ind, test_ind) in enumerate(kf.split(X_one_hot,y)):
        X_train = X_one_hot[train_ind]
        X_test = X_one_hot[test_ind]
        y_train = y[train_ind]
        y_test = y[test_ind]
        xgb_mod = xgb.XGBRegressor(objective="reg:squarederror", subsample=1.0, min_child_weight=1, max_depth=10, gamma=1.5, colsample_bytree=1.0, random_state=88)
        xgb_mod.fit(X_train, y_train)
        preds_xgb = xgb_mod.predict(X_test)
        rmse, r2 = get_perf(xgb_mod, preds_xgb, X_test, y_test, exp_type)
        rmse_lst.append(rmse)
        r2_lst.append(r2)
        # ft_img_name = "ft-imp_" + exp_type + ".png"
        # fitted_name = "pred-true_" + exp_type + ".png"
        # plot_imp(xgb_mod, MODEL_FEATS[exp_type], "Feature importance: " + EXP_NAME[exp_type], os.path.join(dir, ft_img_name))
        # plot_fitted(y_test, preds_xgb, "Pred vs True: " + EXP_NAME[exp_type], os.path.join(dir, fitted_name))
    outf.write(f'---- Experiment type: {EXP_NAME[exp_type]} ----\n')
    outf.write(f'Obs/Fold in Train: {len(train_ind)}\n')
    outf.write(f'Obs/Fold in Test: {len(test_ind)}\n')
    outf.write(f'-- Performance --\n')
    outf.write(f'RMSE: {sum(rmse_lst)/len(rmse_lst)}\n')
    outf.write(f'R2: {sum(r2_lst)/len(r2_lst)}\n')
        

In [172]:
with open(f"model_kfold_results.txt", "w") as outf:
    sp = 'dust'
    df[sp].fillna(0, inplace=True)
    labels = np.array(df[sp])
    outf.write(f'****** Species: {sp} ******\n')
    img_dir_path = os.path.join("../img/", sp)
    if not os.path.exists(img_dir_path):
        os.mkdir(img_dir_path)
    for exp_type in ADD_FEATS.keys():
        ft_lst = subset_features + ADD_FEATS[exp_type]
        run_kfold(df, labels, ft_lst, 0.3, exp_type, outf, img_dir_path)
    outf.write(f'================================================\n')
outf.close()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sp].fillna(0, inplace=True)


### EC (Elemental Carbon)

In [25]:
labels1 = np.array(df['EC_unadjusted'])
labels2 = np.array(df['EC'])

In [55]:
print("NAs in EC: ", csn_misr['EC'].isnull().sum())
print("NAs in EC_unadj: ", csn_misr['EC_unadjusted'].isnull().sum())

NAs in EC:  4091
NAs in EC_unadj:  2193


In [61]:
ind_ec = csn_misr['EC'].index[csn_misr['EC'].apply(np.isnan)]
ind_unadj = csn_misr['EC_unadjusted'].index[csn_misr['EC_unadjusted'].apply(np.isnan)]

In [67]:
# Difference in NAs for EC & EC_unadj
len(set(ind_unadj) - set(ind_ec))/len(set(ind_unadj))

0.13588691290469676