# Windstorm project
This is the first of several notebooks on Machine Learning-based prediction of severe surface winds associated with extratropical windstorms over different European geographical regions. Specifically, this study emphasizes how the temporal evolution ("history") of different storm-internal and environmental predictors may contain useful information for the rapid evaluation of severe wind potential over land.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd
import glob, os

# Add the path to the directory containing the module
import sys
sys.path.append('../../')
from util.ml import baseline, metrics

## Read data and trained models
In this notebook, we train a series of models of increasing complexity. To evaluate severe weather potential in a low-sample-size regime, we apply k-means clustering to historical windstorm data and geospatial coordinates to identify coherent wind clusters within the European Union domain.
The severe wind potential of a particular windstorm can be defined by comparing the storm's maximum near-surface wind after landfall in each cluster to that cluster's characteristic climatology of daily maximum near-surface wind.

In [3]:
# Find the folder name organized by seed number
seed_docs = sorted(glob.glob('../../datas/seed_*/'))
if not seed_docs:
    # Fail early with a clear message; the code below needs at least one seed folder.
    raise FileNotFoundError("No seed folders matched '../../datas/seed_*/'")

# Columns associated with convective_precipitation and large_scale_snowfall are
# filtered out of every input table (model variables, not easily retrievable in observations).
DROPPED_COLUMNS_REGEX = "^(convective_precipitation_|large_scale_snowfall_)"
# Sizes of the "N-most correlated" input sets, in the order the files are read.
INPUT_SIZES = (40, 30, 20)

data_dict = []
# Read files in a for loop: one dictionary of tables per seed folder
for seed_doc in seed_docs:
    data = {}

    # Read input data (training tables first, then validation tables)
    for key_prefix, file_prefix in (('Xtrain', 'X_train'), ('Xvalid', 'X_validation')):
        for n_inputs in INPUT_SIZES:
            data[f'{key_prefix}_{n_inputs}'] = baseline.filter_columns(
                pd.read_csv(f'{seed_doc}{file_prefix}_{n_inputs}.csv'),
                DROPPED_COLUMNS_REGEX,
            )

    # Read output data (unfiltered)
    for key_prefix, file_prefix in (('ytrain', 'y_train'), ('yvalid', 'y_validation')):
        for target in ('cdf', 'max'):
            data[f'{key_prefix}_{target}'] = pd.read_csv(f'{seed_doc}{file_prefix}_{target}.csv')

    data_dict.append(data)


def _dropped_column_info(csv_path):
    """Return (all column names, removed column names, positions of the removed columns).

    Only the CSV header is parsed (nrows=0); positions refer to the unfiltered file.
    """
    header = pd.read_csv(csv_path, nrows=0)
    all_names = header.columns
    removed_names = header.filter(regex=DROPPED_COLUMNS_REGEX).columns
    ordered_names = list(all_names)  # built once instead of once per lookup
    return all_names, removed_names, [ordered_names.index(name) for name in removed_names]


# The column layout is taken from the last seed folder. The previous version did the
# same implicitly through the loop variable that leaked out of the for loop.
reference_seed_doc = seed_docs[-1]

# varname_*: all column names; column_name_*: names of the removed columns;
# filtindex_*: column indices of the removed variables in the unfiltered table.
varname_40, column_name_40, filtindex_40 = _dropped_column_info(reference_seed_doc + 'X_train_40.csv')
varname_30, column_name_30, filtindex_30 = _dropped_column_info(reference_seed_doc + 'X_train_30.csv')
varname_20, column_name_20, filtindex_20 = _dropped_column_info(reference_seed_doc + 'X_train_20.csv')

## Baseline models

### Linear Regression

In [68]:
# For every seed, fit one linear regression per target (CDF and max) on the
# normalized training inputs. The three sections differ only in the input set used.

# Linear regression models trained with the 20-most correlated inputs
linreg_20_cdf, linreg_20_max = [], []
for seed_data in data_dict:
    linreg_20_cdf.append(
        baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_20']), seed_data['ytrain_cdf']))
    linreg_20_max.append(
        baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_20']), seed_data['ytrain_max']))

# Linear regression models trained with the 30-most correlated inputs
linreg_30_cdf, linreg_30_max = [], []
for seed_data in data_dict:
    linreg_30_cdf.append(
        baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_30']), seed_data['ytrain_cdf']))
    linreg_30_max.append(
        baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_30']), seed_data['ytrain_max']))

# Linear regression models trained with the 40-most correlated inputs
linreg_40_cdf, linreg_40_max = [], []
for seed_data in data_dict:
    linreg_40_cdf.append(
        baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_40']), seed_data['ytrain_cdf']))
    linreg_40_max.append(
        baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_40']), seed_data['ytrain_max']))

In [77]:
# Persist every list of fitted baseline linear regressions (one file per input size and target)
for fitted_models, output_path in [
    (linreg_20_cdf, '../../datas/proc/part1/baseline_linreg_20_cdf.pkl'),
    (linreg_20_max, '../../datas/proc/part1/baseline_linreg_20_max.pkl'),
    (linreg_30_cdf, '../../datas/proc/part1/baseline_linreg_30_cdf.pkl'),
    (linreg_30_max, '../../datas/proc/part1/baseline_linreg_30_max.pkl'),
    (linreg_40_cdf, '../../datas/proc/part1/baseline_linreg_40_cdf.pkl'),
    (linreg_40_max, '../../datas/proc/part1/baseline_linreg_40_max.pkl'),
]:
    baseline.save_models(fitted_models, output_path)

In [69]:
# Evaluate the linear regression models for each input-set size
performance_table_20 = metrics.evaluate_model(linreg_20_cdf, linreg_20_max, data_dict, 20)
performance_table_30 = metrics.evaluate_model(linreg_30_cdf, linreg_30_max, data_dict, 30)
performance_table_40 = metrics.evaluate_model(linreg_40_cdf, linreg_40_max, data_dict, 40)


def _metric_table(performance_table, metric):
    """Gather one metric from every entry of `performance_table` into a DataFrame.

    Each entry is read as entry[split][metric] for the four splits below;
    rows follow the order of the entries, columns the order of the splits.
    """
    splits = ('train_cdf', 'train_max', 'val_cdf', 'val_max')
    return pd.DataFrame(
        {split: [entry[split][metric] for entry in performance_table] for split in splits}
    )


# Tables storing the r2 values
r2_table_20 = _metric_table(performance_table_20, 'R2')
r2_table_30 = _metric_table(performance_table_30, 'R2')
r2_table_40 = _metric_table(performance_table_40, 'R2')

# Tables storing the RMSE values
rmse_table_20 = _metric_table(performance_table_20, 'RMSE')
rmse_table_30 = _metric_table(performance_table_30, 'RMSE')
rmse_table_40 = _metric_table(performance_table_40, 'RMSE')

# Tables storing the MAE values
mae_table_20 = _metric_table(performance_table_20, 'MAE')
mae_table_30 = _metric_table(performance_table_30, 'MAE')
mae_table_40 = _metric_table(performance_table_40, 'MAE')

In the low-sample-size regime, it is clear that all the baseline models suffer from overfitting. Take a look at the models trained with the 20 most-correlated variables and their PCs

In [70]:
# R2 of the baseline linear regressions on the 20-input set (one row per entry of performance_table_20)
r2_table_20

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.853255,0.863827,-33.387103,-22.678594
1,0.823172,0.822258,-17.639352,-7.265268
2,0.776223,0.785311,-12.810082,-9.405961
3,0.8189,0.829108,-24.764697,-15.802331
4,0.809756,0.807754,-4.402226,-4.127699
5,0.781579,0.786449,-11.920247,-7.883279
6,0.773472,0.771539,-277.88392,-40.265957
7,0.835432,0.832517,-38.167325,-39.435998
8,0.840404,0.845829,-8.803853,-5.01306
9,0.830715,0.834893,-8.749317,-7.554756


30-most correlated variables and their PCs

In [71]:
# R2 of the baseline linear regressions on the 30-input set (one row per entry of performance_table_30)
r2_table_30

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,1.0,1.0,-131.763391,-37.167186
1,1.0,1.0,-79.382832,-25.02359
2,1.0,1.0,-66.444307,-35.979994
3,1.0,1.0,-94.050951,-52.497085
4,1.0,1.0,-12.498509,-12.110985
5,1.0,1.0,-40.917861,-23.611177
6,1.0,1.0,-896.959729,-154.212266
7,1.0,1.0,-64.934208,-45.685874
8,1.0,1.0,-31.603537,-14.335082
9,1.0,1.0,-30.807228,-22.131249


### Random Forest

In [9]:
from tqdm import tqdm

# For every seed, fit one random forest per target (CDF and max) on the raw
# (non-normalized) training inputs. The third positional argument of
# baseline.train_rf is always 42 (presumably the random seed; confirm in util.ml.baseline).

# RF models trained with the 20-most correlated inputs
rf_20_cdf, rf_20_max = [], []
for seed_data in tqdm(data_dict):
    rf_20_cdf.append(baseline.train_rf(seed_data['Xtrain_20'], seed_data['ytrain_cdf'], 42))
    rf_20_max.append(baseline.train_rf(seed_data['Xtrain_20'], seed_data['ytrain_max'], 42))

# RF models trained with the 30-most correlated inputs
rf_30_cdf, rf_30_max = [], []
for seed_data in tqdm(data_dict):
    rf_30_cdf.append(baseline.train_rf(seed_data['Xtrain_30'], seed_data['ytrain_cdf'], 42))
    rf_30_max.append(baseline.train_rf(seed_data['Xtrain_30'], seed_data['ytrain_max'], 42))

# RF models trained with the 40-most correlated inputs
rf_40_cdf, rf_40_max = [], []
for seed_data in tqdm(data_dict):
    rf_40_cdf.append(baseline.train_rf(seed_data['Xtrain_40'], seed_data['ytrain_cdf'], 42))
    rf_40_max.append(baseline.train_rf(seed_data['Xtrain_40'], seed_data['ytrain_max'], 42))

100%|██████████| 10/10 [03:18<00:00, 19.83s/it]
100%|██████████| 10/10 [03:41<00:00, 22.14s/it]
100%|██████████| 10/10 [04:04<00:00, 24.47s/it]


In [78]:
# Persist every list of fitted baseline random forests (one file per input size and target)
for fitted_models, output_path in [
    (rf_20_cdf, '../../datas/proc/part1/baseline_rf_20_cdf.pkl'),
    (rf_20_max, '../../datas/proc/part1/baseline_rf_20_max.pkl'),
    (rf_30_cdf, '../../datas/proc/part1/baseline_rf_30_cdf.pkl'),
    (rf_30_max, '../../datas/proc/part1/baseline_rf_30_max.pkl'),
    (rf_40_cdf, '../../datas/proc/part1/baseline_rf_40_cdf.pkl'),
    (rf_40_max, '../../datas/proc/part1/baseline_rf_40_max.pkl'),
]:
    baseline.save_models(fitted_models, output_path)

In [10]:
# Evaluate the Random Forest models for each input-set size
rf_performance_table_20 = metrics.evaluate_model(rf_20_cdf, rf_20_max, data_dict, 20)
rf_performance_table_30 = metrics.evaluate_model(rf_30_cdf, rf_30_max, data_dict, 30)
rf_performance_table_40 = metrics.evaluate_model(rf_40_cdf, rf_40_max, data_dict, 40)

# Create the tables storing the r2 values: one DataFrame per input-set size,
# one column per split, one row per entry of the performance table.
rf_r2_tables = []
for rf_performance in (rf_performance_table_20, rf_performance_table_30, rf_performance_table_40):
    rf_r2_tables.append(pd.DataFrame({
        split: [entry[split]['R2'] for entry in rf_performance]
        for split in ('train_cdf', 'train_max', 'val_cdf', 'val_max')
    }))
rf_r2_table_20, rf_r2_table_30, rf_r2_table_40 = rf_r2_tables

Sensitivity experiment: Incorporate nonlinearity by using Random Forest

In [14]:
# R2 of the baseline random forests on the 20-input set (one row per entry of rf_performance_table_20)
rf_r2_table_20

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.125198,0.126835,-1.77963,-0.578028
1,0.119736,0.130174,-1.209389,-0.511947
2,0.405908,0.129667,-0.165449,-0.10746
3,0.126871,0.133974,-0.586515,-0.271364
4,0.11325,0.13203,-0.106133,-0.180633
5,0.107411,0.139727,-0.619027,-0.522737
6,0.114706,0.13097,-5.104729,-1.290778
7,0.113581,0.131548,-0.202503,-0.354429
8,0.122875,0.470241,-0.132141,-0.11343
9,0.114009,0.44224,-0.235993,-0.350141


In [15]:
# R2 of the baseline random forests on the 30-input set (one row per entry of rf_performance_table_30)
rf_r2_table_30

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.146923,0.134286,-1.112254,-0.52323
1,0.128838,0.137175,-0.482713,-0.418805
2,0.140519,0.151934,-0.22449,-0.380391
3,0.144362,0.144407,-1.11038,-0.457091
4,0.128156,0.144961,-0.1668,-0.215663
5,0.126519,0.140871,-0.679359,-0.492123
6,0.130437,0.143217,-5.782299,-1.314509
7,0.127362,0.139184,-0.203001,-0.344268
8,0.143165,0.145314,-0.145927,-0.151131
9,0.142522,0.136762,-0.258481,-0.302503


In [13]:
# R2 of the baseline random forests on the 40-input set (one row per entry of rf_performance_table_40)
rf_r2_table_40

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.142035,0.131455,-0.854148,-0.492354
1,0.132687,0.134115,-0.499831,-0.43095
2,0.140864,0.156946,-0.185747,-0.310611
3,0.157041,0.141237,-1.042502,-0.415374
4,0.137127,0.145056,-0.196723,-0.227618
5,0.13941,0.143695,-0.596939,-0.455233
6,0.132352,0.141793,-5.586163,-1.238528
7,0.136406,0.144586,-0.232322,-0.372766
8,0.13638,0.146682,-0.143709,-0.134099
9,0.15195,0.138065,-0.257543,-0.306455


## Feature selection with XGBoost

In [60]:
# Find the folder name organized by seed number
seed_docs = sorted(glob.glob('../../datas/seed_*/'))

# NOTE(review): pickle.load can execute arbitrary code, so only load model files produced
# by this project. XGBoost itself recommends Booster.save_model / load_model for
# persistence across versions.
model_dict = []
# Read files in a for loop: one dictionary of pre-trained XGBoost models per seed folder
for seed_doc in seed_docs:
    seed_models = {}
    # Same load order and key order as before: cdf_20, cdf_30, cdf_40, max_20, max_30, max_40
    for target in ('cdf', 'max'):
        for n_inputs in (20, 30, 40):
            model_path = f'{seed_doc}model_xgboost/model_{target}_{n_inputs}.pkl'
            # The context manager closes the file handle, which the bare
            # pickle.load(open(...)) call never did.
            with open(model_path, 'rb') as model_file:
                seed_models[f'model_XGB_{target}_{n_inputs}'] = pickle.load(model_file)
    model_dict.append(seed_models)

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



In [17]:
def filt_index_no_conv_snow(index_40,index_30,index_20,filtindex_40,filtindex_30,filtindex_20):
    """Translate column positions of the unfiltered tables into positions of the filtered tables.

    Positions listed in ``filtindex_*`` belong to columns that were removed, so they are
    dropped from the result. Every remaining position is shifted down by the number of
    removed columns located before it, which is where that column sits once the removed
    columns are gone.

    The previous implementation hard-coded the shifts (-2 for positions above 39, -5 for
    positions above 85, and no shift at all for position 85 itself). That is only correct
    for one specific column layout; the shift is now derived from ``filtindex_*``.

    Parameters
    ----------
    index_40, index_30, index_20 : iterable of int
        Selected column positions in the unfiltered 40-, 30- and 20-input tables.
    filtindex_40, filtindex_30, filtindex_20 : iterable of int
        Positions of the removed columns in the corresponding unfiltered tables.

    Returns
    -------
    tuple of list of int
        ``(filt_index_20, filt_index_30, filt_index_40)``. Note that the order is the
        reverse of the argument order. Input order is preserved within each list.
    """
    def _remap(selected, removed):
        # De-duplicate so that a repeated removed position is only counted once.
        removed_positions = set(removed)
        return [
            position - sum(1 for gone in removed_positions if gone < position)
            for position in selected
            if position not in removed_positions
        ]

    filt_index_40 = _remap(index_40, filtindex_40)
    filt_index_30 = _remap(index_30, filtindex_30)
    filt_index_20 = _remap(index_20, filtindex_20)
    return filt_index_20, filt_index_30, filt_index_40

In [62]:
# Find the feature importance of the pre-trained XGBoost models (one array per seed)
feature_importance_20_cdf_xgb = [seed_models['model_XGB_cdf_20'].feature_importances_ for seed_models in model_dict]
feature_importance_30_cdf_xgb = [seed_models['model_XGB_cdf_30'].feature_importances_ for seed_models in model_dict]
feature_importance_40_cdf_xgb = [seed_models['model_XGB_cdf_40'].feature_importances_ for seed_models in model_dict]
feature_importance_20_max_xgb = [seed_models['model_XGB_max_20'].feature_importances_ for seed_models in model_dict]
feature_importance_30_max_xgb = [seed_models['model_XGB_max_30'].feature_importances_ for seed_models in model_dict]
feature_importance_40_max_xgb = [seed_models['model_XGB_max_40'].feature_importances_ for seed_models in model_dict]


def _positive_importance_positions(importances):
    """Return the positions whose feature importance is strictly greater than 0."""
    return [position for position, value in enumerate(importances) if value > 0]


# Find index where feature importance is greater than 0
index_20_cdf_xgb = [_positive_importance_positions(imp) for imp in feature_importance_20_cdf_xgb]
index_30_cdf_xgb = [_positive_importance_positions(imp) for imp in feature_importance_30_cdf_xgb]
index_40_cdf_xgb = [_positive_importance_positions(imp) for imp in feature_importance_40_cdf_xgb]
index_20_max_xgb = [_positive_importance_positions(imp) for imp in feature_importance_20_max_xgb]
index_30_max_xgb = [_positive_importance_positions(imp) for imp in feature_importance_30_max_xgb]
index_40_max_xgb = [_positive_importance_positions(imp) for imp in feature_importance_40_max_xgb]


def _remap_indices_per_seed(indices_40, indices_30, indices_20):
    """Run filt_index_no_conv_snow once per seed and split its result by input size.

    Returns three lists (20-, 30-, 40-input sets), each holding one index list per seed.
    """
    remapped_20, remapped_30, remapped_40 = [], [], []
    for seed in range(len(indices_20)):
        # One call yields all three results; previously the function was called
        # three times per seed and two of the three results were thrown away each time.
        kept_20, kept_30, kept_40 = filt_index_no_conv_snow(
            indices_40[seed], indices_30[seed], indices_20[seed],
            filtindex_40, filtindex_30, filtindex_20,
        )
        remapped_20.append(kept_20)
        remapped_30.append(kept_30)
        remapped_40.append(kept_40)
    return remapped_20, remapped_30, remapped_40


# Process the index to remove the columns associated with convective_precipitation and large_scale_snowfall
index_20_filt_cdf_xgb, index_30_filt_cdf_xgb, index_40_filt_cdf_xgb = _remap_indices_per_seed(
    index_40_cdf_xgb, index_30_cdf_xgb, index_20_cdf_xgb)
index_20_filt_max_xgb, index_30_filt_max_xgb, index_40_filt_max_xgb = _remap_indices_per_seed(
    index_40_max_xgb, index_30_max_xgb, index_20_max_xgb)

# Filter the input data with the feature importance (separately for the CDF and max targets)
data_dict_filt_cdf_xgb = [
    baseline.filt_with_feature_importance(
        index_20_filt_cdf_xgb[seed], index_30_filt_cdf_xgb[seed], index_40_filt_cdf_xgb[seed], seed_data)
    for seed, seed_data in enumerate(data_dict)
]
data_dict_filt_max_xgb = [
    baseline.filt_with_feature_importance(
        index_20_filt_max_xgb[seed], index_30_filt_max_xgb[seed], index_40_filt_max_xgb[seed], seed_data)
    for seed, seed_data in enumerate(data_dict)
]

### Linear Regression

In [63]:
# Train linear regression models to predict CDF with the XGBoost-filtered data (one model per seed)
lin_20_cdf_filt_xgb = [
    baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_20']), seed_data['ytrain_cdf'])
    for seed_data in data_dict_filt_cdf_xgb
]
lin_30_cdf_filt_xgb = [
    baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_30']), seed_data['ytrain_cdf'])
    for seed_data in data_dict_filt_cdf_xgb
]
lin_40_cdf_filt_xgb = [
    baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_40']), seed_data['ytrain_cdf'])
    for seed_data in data_dict_filt_cdf_xgb
]

# Train linear regression models to predict max with the XGBoost-filtered data (one model per seed)
lin_20_max_filt_xgb = [
    baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_20']), seed_data['ytrain_max'])
    for seed_data in data_dict_filt_max_xgb
]
lin_30_max_filt_xgb = [
    baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_30']), seed_data['ytrain_max'])
    for seed_data in data_dict_filt_max_xgb
]
lin_40_max_filt_xgb = [
    baseline.train_linear_regression(baseline.normalize_data(seed_data['Xtrain_40']), seed_data['ytrain_max'])
    for seed_data in data_dict_filt_max_xgb
]

# Evaluate the models: CDF and max models are scored separately, each on its own filtered data
performance_table_20_filt_xgb_cdf = metrics.evaluate_cdfmodel(lin_20_cdf_filt_xgb, data_dict_filt_cdf_xgb, 20)
performance_table_30_filt_xgb_cdf = metrics.evaluate_cdfmodel(lin_30_cdf_filt_xgb, data_dict_filt_cdf_xgb, 30)
performance_table_40_filt_xgb_cdf = metrics.evaluate_cdfmodel(lin_40_cdf_filt_xgb, data_dict_filt_cdf_xgb, 40)
performance_table_20_filt_xgb_max = metrics.evaluate_maxmodel(lin_20_max_filt_xgb, data_dict_filt_max_xgb, 20)
performance_table_30_filt_xgb_max = metrics.evaluate_maxmodel(lin_30_max_filt_xgb, data_dict_filt_max_xgb, 30)
performance_table_40_filt_xgb_max = metrics.evaluate_maxmodel(lin_40_max_filt_xgb, data_dict_filt_max_xgb, 40)

In [79]:
# Persist the linear regressions trained on the XGBoost-filtered inputs
for fitted_models, output_path in [
    (lin_20_cdf_filt_xgb, '../../datas/proc/part1/xgb_linreg_20_cdf.pkl'),
    (lin_30_cdf_filt_xgb, '../../datas/proc/part1/xgb_linreg_30_cdf.pkl'),
    (lin_40_cdf_filt_xgb, '../../datas/proc/part1/xgb_linreg_40_cdf.pkl'),
    (lin_20_max_filt_xgb, '../../datas/proc/part1/xgb_linreg_20_max.pkl'),
    (lin_30_max_filt_xgb, '../../datas/proc/part1/xgb_linreg_30_max.pkl'),
    (lin_40_max_filt_xgb, '../../datas/proc/part1/xgb_linreg_40_max.pkl'),
]:
    baseline.save_models(fitted_models, output_path)

In [72]:
# Create the tables storing the r2 values. The *_cdf columns come from the CDF
# performance table and the *_max columns from the max performance table.
r2_tables_xgb = []
for cdf_performance, max_performance in (
    (performance_table_20_filt_xgb_cdf, performance_table_20_filt_xgb_max),
    (performance_table_30_filt_xgb_cdf, performance_table_30_filt_xgb_max),
    (performance_table_40_filt_xgb_cdf, performance_table_40_filt_xgb_max),
):
    r2_tables_xgb.append(pd.DataFrame({
        'train_cdf': [entry['train_cdf']['R2'] for entry in cdf_performance],
        'train_max': [entry['train_max']['R2'] for entry in max_performance],
        'val_cdf': [entry['val_cdf']['R2'] for entry in cdf_performance],
        'val_max': [entry['val_max']['R2'] for entry in max_performance],
    }))
r2_table_20_xgb, r2_table_30_xgb, r2_table_40_xgb = r2_tables_xgb

In [73]:
# R2 of the linear regressions trained on the XGBoost-filtered 20-input set
r2_table_20_xgb

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.648682,0.561885,-10.415701,-1.908187
1,0.518805,0.541638,-4.481353,-1.914102
2,0.580669,0.628529,-4.958591,-4.683508
3,0.618891,0.601606,-10.653628,-7.556399
4,0.473063,0.674875,-1.571362,-2.709873
5,0.523151,0.520143,-1.757676,-0.991359
6,0.456224,0.653507,-14.802422,-27.827247
7,0.516895,0.63573,-4.907903,-7.473435
8,0.474423,0.481364,-2.572206,-1.455373
9,0.57746,0.672701,-1.877107,-1.549009


In [74]:
# R2 of the linear regressions trained on the XGBoost-filtered 30-input set
r2_table_30_xgb

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,1.0,1.0,-204.200052,-57.889772
1,1.0,0.947062,-51.635736,-58.761012
2,0.774612,0.766079,-6.174514,-9.478766
3,0.495856,0.907496,-11.109979,-59.985442
4,1.0,0.834303,-12.760374,-26.233897
5,0.934207,0.782307,-59.700021,-14.012055
6,0.747113,0.928831,-371.024797,-210.289633
7,0.889541,0.966325,-31.067668,-116.799304
8,0.563195,0.846428,-2.851424,-10.126674
9,0.809239,0.976273,-6.639544,-218.558401


### Random Forest

In [75]:
# Train RF models to predict CDF with the XGBoost-filtered data (one model per seed)
rf_20_cdf_filt_xgb = [
    baseline.train_rf(seed_data['Xtrain_20'], seed_data['ytrain_cdf'], 42)
    for seed_data in tqdm(data_dict_filt_cdf_xgb)
]
rf_30_cdf_filt_xgb = [
    baseline.train_rf(seed_data['Xtrain_30'], seed_data['ytrain_cdf'], 42)
    for seed_data in tqdm(data_dict_filt_cdf_xgb)
]
rf_40_cdf_filt_xgb = [
    baseline.train_rf(seed_data['Xtrain_40'], seed_data['ytrain_cdf'], 42)
    for seed_data in tqdm(data_dict_filt_cdf_xgb)
]

# Train RF models to predict max with the XGBoost-filtered data (one model per seed)
rf_20_max_filt_xgb = [
    baseline.train_rf(seed_data['Xtrain_20'], seed_data['ytrain_max'], 42)
    for seed_data in tqdm(data_dict_filt_max_xgb)
]
rf_30_max_filt_xgb = [
    baseline.train_rf(seed_data['Xtrain_30'], seed_data['ytrain_max'], 42)
    for seed_data in tqdm(data_dict_filt_max_xgb)
]
rf_40_max_filt_xgb = [
    baseline.train_rf(seed_data['Xtrain_40'], seed_data['ytrain_max'], 42)
    for seed_data in tqdm(data_dict_filt_max_xgb)
]

100%|██████████| 10/10 [01:35<00:00,  9.59s/it]
100%|██████████| 10/10 [01:39<00:00, 10.00s/it]
100%|██████████| 10/10 [01:54<00:00, 11.43s/it]
100%|██████████| 10/10 [01:36<00:00,  9.64s/it]
100%|██████████| 10/10 [01:44<00:00, 10.41s/it]
100%|██████████| 10/10 [01:39<00:00,  9.98s/it]


In [80]:
# Persist the random forests trained on the XGBoost-filtered inputs
for fitted_models, output_path in [
    (rf_20_cdf_filt_xgb, '../../datas/proc/part1/xgb_rf_20_cdf.pkl'),
    (rf_30_cdf_filt_xgb, '../../datas/proc/part1/xgb_rf_30_cdf.pkl'),
    (rf_40_cdf_filt_xgb, '../../datas/proc/part1/xgb_rf_40_cdf.pkl'),
    (rf_20_max_filt_xgb, '../../datas/proc/part1/xgb_rf_20_max.pkl'),
    (rf_30_max_filt_xgb, '../../datas/proc/part1/xgb_rf_30_max.pkl'),
    (rf_40_max_filt_xgb, '../../datas/proc/part1/xgb_rf_40_max.pkl'),
]:
    baseline.save_models(fitted_models, output_path)

In [83]:
# Evaluate the random forests trained on the XGBoost-filtered inputs.
# CDF models are scored against the CDF-filtered data ...
rf_performance_table_20_filt_xgb_cdf, rf_performance_table_30_filt_xgb_cdf, rf_performance_table_40_filt_xgb_cdf = (
    metrics.evaluate_cdfmodel(rf_20_cdf_filt_xgb, data_dict_filt_cdf_xgb, 20),
    metrics.evaluate_cdfmodel(rf_30_cdf_filt_xgb, data_dict_filt_cdf_xgb, 30),
    metrics.evaluate_cdfmodel(rf_40_cdf_filt_xgb, data_dict_filt_cdf_xgb, 40),
)
# ... and max models against the max-filtered data.
rf_performance_table_20_filt_xgb_max, rf_performance_table_30_filt_xgb_max, rf_performance_table_40_filt_xgb_max = (
    metrics.evaluate_maxmodel(rf_20_max_filt_xgb, data_dict_filt_max_xgb, 20),
    metrics.evaluate_maxmodel(rf_30_max_filt_xgb, data_dict_filt_max_xgb, 30),
    metrics.evaluate_maxmodel(rf_40_max_filt_xgb, data_dict_filt_max_xgb, 40),
)

In [84]:
# Create the tables storing the r2 values. The *_cdf columns come from the CDF
# performance table and the *_max columns from the max performance table.
rf_r2_tables_xgb = []
for cdf_performance, max_performance in (
    (rf_performance_table_20_filt_xgb_cdf, rf_performance_table_20_filt_xgb_max),
    (rf_performance_table_30_filt_xgb_cdf, rf_performance_table_30_filt_xgb_max),
    (rf_performance_table_40_filt_xgb_cdf, rf_performance_table_40_filt_xgb_max),
):
    rf_r2_tables_xgb.append(pd.DataFrame({
        'train_cdf': [entry['train_cdf']['R2'] for entry in cdf_performance],
        'train_max': [entry['train_max']['R2'] for entry in max_performance],
        'val_cdf': [entry['val_cdf']['R2'] for entry in cdf_performance],
        'val_max': [entry['val_max']['R2'] for entry in max_performance],
    }))
rf_r2_table_20_xgb, rf_r2_table_30_xgb, rf_r2_table_40_xgb = rf_r2_tables_xgb

In [85]:
# R2 of the random forests trained on the XGBoost-filtered 20-input set
rf_r2_table_20_xgb

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.125087,0.125902,-1.77963,-0.616098
1,0.113101,0.129722,-0.632175,-0.506543
2,0.410598,0.429612,-0.077461,-0.201473
3,0.127072,0.134389,-0.586728,-0.31297
4,0.11371,0.131753,-0.100798,-0.179701
5,0.109651,0.398565,-0.596304,-0.656428
6,0.405759,0.13097,-6.380835,-1.316224
7,0.411988,0.131396,-0.313987,-0.352116
8,0.125503,0.646834,-0.17632,-0.1643
9,0.114009,0.442911,-0.235993,-0.353471


In [86]:
# R2 of the random forests trained on the XGBoost-filtered 30-input set
rf_r2_table_30_xgb

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.129411,0.133704,-0.855237,-0.520839
1,0.129118,0.130844,-0.515697,-0.429697
2,0.123242,0.137737,-0.038279,-0.071429
3,0.142018,0.143031,-1.244302,-0.419293
4,0.130531,0.147129,-0.166974,-0.240019
5,0.397227,0.142668,-0.829409,-0.527886
6,0.119049,0.141056,-5.737155,-1.304494
7,0.124548,0.137158,-0.173496,-0.332459
8,0.132369,0.145025,-0.136752,-0.163256
9,0.13638,0.132413,-0.236399,-0.294842


In [87]:
# R2 of the random forests trained on the XGBoost-filtered 40-input set
rf_r2_table_40_xgb

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.12852,0.132253,-0.84952,-0.491262
1,0.129084,0.134414,-0.514496,-0.423288
2,0.123965,0.1394,-0.051073,-0.06254
3,0.149319,0.145457,-0.99331,-0.436676
4,0.13828,0.491059,-0.183483,-0.304093
5,0.140242,0.1454,-0.599211,-0.47725
6,0.148855,0.139515,-8.472444,-1.241013
7,0.129542,0.461183,-0.245354,-0.623167
8,0.127796,0.146379,-0.147709,-0.1401
9,0.15195,0.137668,-0.257543,-0.308336


## Feature selection with Random Forest

In [35]:
# Find the folder name organized by seed number
seed_docs = sorted(glob.glob('../../datas/seed_*/'))

# NOTE(review): pickle.load can execute arbitrary code, so only load model files produced
# by this project.
# NOTE: this rebinds `model_dict`, which previously held the XGBoost models.
model_dict = []
# Read files in a for loop: one dictionary of pre-trained random forest models per seed folder
for seed_doc in seed_docs:
    seed_models = {}
    # Same load order and key order as before: cdf_20, cdf_30, cdf_40, max_20, max_30, max_40
    for target in ('cdf', 'max'):
        for n_inputs in (20, 30, 40):
            model_path = f'{seed_doc}model_random_forest/model_{target}_{n_inputs}.pkl'
            # The context manager closes the file handle, which the bare
            # pickle.load(open(...)) call never did.
            with open(model_path, 'rb') as model_file:
                seed_models[f'model_RF_{target}_{n_inputs}'] = pickle.load(model_file)
    model_dict.append(seed_models)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [37]:
# Find the feature importance of the RF model
def _importances_per_seed(model_key):
    """Return `feature_importances_` of the `model_key` model for every entry of `model_dict`."""
    return [seed_models[model_key].feature_importances_ for seed_models in model_dict]


def _positive_positions(importances_per_seed):
    """For each entry, list the positions whose importance is strictly greater than 0."""
    return [[pos for pos, value in enumerate(importances) if value > 0]
            for importances in importances_per_seed]


def _filter_per_seed(index_40, index_30, index_20):
    """Call `filt_index_no_conv_snow` exactly once per seed and return the raw results.

    The original cell evaluated the same call three times per seed (once for
    each of the [0], [1] and [2] components).
    NOTE(review): this assumes `filt_index_no_conv_snow` has no side effects on
    its arguments -- confirm against its definition.
    """
    return [
        filt_index_no_conv_snow(index_40[i], index_30[i], index_20[i], filtindex_40, filtindex_30, filtindex_20)
        for i in range(len(index_20))
    ]


feature_importance_20_cdf = _importances_per_seed('model_RF_cdf_20')
feature_importance_30_cdf = _importances_per_seed('model_RF_cdf_30')
feature_importance_40_cdf = _importances_per_seed('model_RF_cdf_40')
feature_importance_20_max = _importances_per_seed('model_RF_max_20')
feature_importance_30_max = _importances_per_seed('model_RF_max_30')
feature_importance_40_max = _importances_per_seed('model_RF_max_40')

# Find index where feature importance is greater than 0
index_20_cdf = _positive_positions(feature_importance_20_cdf)
index_30_cdf = _positive_positions(feature_importance_30_cdf)
index_40_cdf = _positive_positions(feature_importance_40_cdf)
index_20_max = _positive_positions(feature_importance_20_max)
index_30_max = _positive_positions(feature_importance_30_max)
index_40_max = _positive_positions(feature_importance_40_max)

# Process the index to remove the columns associated with convective_precipitation and large_scale_snowfall
# NOTE(review): the helper receives its arguments in 40/30/20 order, while components
# [0]/[1]/[2] are stored as the 20/30/40 results -- verify this matches the helper's return order.
_filtered_cdf = _filter_per_seed(index_40_cdf, index_30_cdf, index_20_cdf)
_filtered_max = _filter_per_seed(index_40_max, index_30_max, index_20_max)

index_20_filt_cdf = [result[0] for result in _filtered_cdf]
index_30_filt_cdf = [result[1] for result in _filtered_cdf]
index_40_filt_cdf = [result[2] for result in _filtered_cdf]
index_20_filt_max = [result[0] for result in _filtered_max]
index_30_filt_max = [result[1] for result in _filtered_max]
index_40_filt_max = [result[2] for result in _filtered_max]

# Filter the input data with the feature importance
data_dict_filt_cdf_rf = [
    baseline.filt_with_feature_importance(index_20_filt_cdf[ind], index_30_filt_cdf[ind], index_40_filt_cdf[ind], data_dict[ind])
    for ind in range(len(data_dict))
]
data_dict_filt_max_rf = [
    baseline.filt_with_feature_importance(index_20_filt_max[ind], index_30_filt_max[ind], index_40_filt_max[ind], data_dict[ind])
    for ind in range(len(data_dict))
]

### MLR: linear regression on the RF-selected features

In [49]:
def _fit_linear_per_seed(filtered_data, x_key, y_key):
    """Fit one linear regression per entry of `filtered_data`.

    For each entry, the `x_key` inputs are passed through
    `baseline.normalize_data` and handed, together with the `y_key` targets,
    to `baseline.train_linear_regression`. Returns the fitted models as a list
    in the same order as `filtered_data`.
    """
    fitted = []
    for seed_data in filtered_data:
        inputs = baseline.normalize_data(seed_data[x_key])
        fitted.append(baseline.train_linear_regression(inputs, seed_data[y_key]))
    return fitted


# Linear regressions predicting the CDF target from the RF-filtered inputs
lin_20_cdf_filt_rf = _fit_linear_per_seed(data_dict_filt_cdf_rf, 'Xtrain_20', 'ytrain_cdf')
lin_30_cdf_filt_rf = _fit_linear_per_seed(data_dict_filt_cdf_rf, 'Xtrain_30', 'ytrain_cdf')
lin_40_cdf_filt_rf = _fit_linear_per_seed(data_dict_filt_cdf_rf, 'Xtrain_40', 'ytrain_cdf')

# Linear regressions predicting the max target from the RF-filtered inputs
lin_20_max_filt_rf = _fit_linear_per_seed(data_dict_filt_max_rf, 'Xtrain_20', 'ytrain_max')
lin_30_max_filt_rf = _fit_linear_per_seed(data_dict_filt_max_rf, 'Xtrain_30', 'ytrain_max')
lin_40_max_filt_rf = _fit_linear_per_seed(data_dict_filt_max_rf, 'Xtrain_40', 'ytrain_max')

# Score every fitted model list with the matching evaluator
(performance_table_20_filt_rf_cdf,
 performance_table_30_filt_rf_cdf,
 performance_table_40_filt_rf_cdf) = [
    metrics.evaluate_cdfmodel(fitted, data_dict_filt_cdf_rf, suffix)
    for fitted, suffix in ((lin_20_cdf_filt_rf, 20), (lin_30_cdf_filt_rf, 30), (lin_40_cdf_filt_rf, 40))
]
(performance_table_20_filt_rf_max,
 performance_table_30_filt_rf_max,
 performance_table_40_filt_rf_max) = [
    metrics.evaluate_maxmodel(fitted, data_dict_filt_max_rf, suffix)
    for fitted, suffix in ((lin_20_max_filt_rf, 20), (lin_30_max_filt_rf, 30), (lin_40_max_filt_rf, 40))
]

In [81]:
# Persist every list of fitted linear regressions to its own pickle file
for fitted_models, out_path in (
    (lin_20_cdf_filt_rf, '../../datas/proc/part1/rf_linreg_20_cdf.pkl'),
    (lin_30_cdf_filt_rf, '../../datas/proc/part1/rf_linreg_30_cdf.pkl'),
    (lin_40_cdf_filt_rf, '../../datas/proc/part1/rf_linreg_40_cdf.pkl'),
    (lin_20_max_filt_rf, '../../datas/proc/part1/rf_linreg_20_max.pkl'),
    (lin_30_max_filt_rf, '../../datas/proc/part1/rf_linreg_30_max.pkl'),
    (lin_40_max_filt_rf, '../../datas/proc/part1/rf_linreg_40_max.pkl'),
):
    baseline.save_models(fitted_models, out_path)

In [50]:
def _r2_frame(cdf_performance, max_performance):
    """Build a DataFrame of R2 values with one row per evaluated entry.

    The `*_cdf` columns are read from `cdf_performance`, the `*_max` columns
    from `max_performance`; each entry is indexed as `entry[column]['R2']`.
    Column order is train_cdf, train_max, val_cdf, val_max.
    """
    columns = {}
    for column, performance in (
        ('train_cdf', cdf_performance),
        ('train_max', max_performance),
        ('val_cdf', cdf_performance),
        ('val_max', max_performance),
    ):
        columns[column] = [entry[column]['R2'] for entry in performance]
    return pd.DataFrame(columns)


# R2 tables of the linear regressions trained on the RF-filtered inputs
r2_table_20_rf = _r2_frame(performance_table_20_filt_rf_cdf, performance_table_20_filt_rf_max)
r2_table_30_rf = _r2_frame(performance_table_30_filt_rf_cdf, performance_table_30_filt_rf_max)
r2_table_40_rf = _r2_frame(performance_table_40_filt_rf_cdf, performance_table_40_filt_rf_max)

In [51]:
# Display the R2 table of the linear regressions fitted on the RF-filtered `Xtrain_20` inputs.
# In the output saved with this notebook, every validation R2 is negative while training R2 is positive.
r2_table_20_rf

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.618843,0.8245,-7.832996,-11.222197
1,0.559682,0.704008,-3.673632,-3.605653
2,0.157377,0.439014,-0.560989,-2.666519
3,0.800754,0.216729,-19.809484,-2.8789
4,0.789869,0.352543,-3.471427,-0.853664
5,0.074095,0.638983,-0.70239,-1.664192
6,0.397401,0.666194,-56.645389,-24.885813
7,0.257507,0.662528,-1.34655,-4.449953
8,0.495778,0.613392,-1.510879,-1.907528
9,0.580029,0.686712,-1.699891,-1.625588


In [52]:
# Display the R2 table of the linear regressions fitted on the RF-filtered `Xtrain_30` inputs.
# In the output saved with this notebook, every validation R2 is negative and some train_max values are exactly 1.0.
r2_table_30_rf

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.659693,0.137842,-12.026547,-0.707004
1,0.445199,0.581874,-1.700332,-2.367454
2,0.84892,1.0,-15.673977,-24.467582
3,0.149253,1.0,-2.803969,-51.317773
4,0.704242,0.396712,-6.211551,-1.661787
5,0.878077,1.0,-68.40943,-26.699428
6,0.792232,0.496631,-240.686985,-34.866584
7,0.515839,0.667144,-3.813345,-8.077254
8,0.656628,0.52426,-4.160967,-1.840093
9,0.168234,0.540634,-0.734504,-3.532557


### Random Forest

In [53]:
def _fit_forests_per_seed(filtered_data, x_key, y_key, seed=42):
    """Train one model per entry of `filtered_data` with `baseline.train_rf`.

    Each call receives the entry's `x_key` inputs, its `y_key` targets and
    `seed` as the third positional argument. A tqdm progress bar tracks the
    loop; the fitted models are returned in the order of `filtered_data`.
    """
    forests = []
    for i in tqdm(range(len(filtered_data))):
        forests.append(baseline.train_rf(filtered_data[i][x_key], filtered_data[i][y_key], seed))
    return forests


# RF models predicting the CDF target from the RF-filtered inputs
rf_20_cdf_filt_rf = _fit_forests_per_seed(data_dict_filt_cdf_rf, 'Xtrain_20', 'ytrain_cdf')
rf_30_cdf_filt_rf = _fit_forests_per_seed(data_dict_filt_cdf_rf, 'Xtrain_30', 'ytrain_cdf')
rf_40_cdf_filt_rf = _fit_forests_per_seed(data_dict_filt_cdf_rf, 'Xtrain_40', 'ytrain_cdf')

# RF models predicting the max target from the RF-filtered inputs
rf_20_max_filt_rf = _fit_forests_per_seed(data_dict_filt_max_rf, 'Xtrain_20', 'ytrain_max')
rf_30_max_filt_rf = _fit_forests_per_seed(data_dict_filt_max_rf, 'Xtrain_30', 'ytrain_max')
rf_40_max_filt_rf = _fit_forests_per_seed(data_dict_filt_max_rf, 'Xtrain_40', 'ytrain_max')

100%|██████████| 10/10 [01:33<00:00,  9.36s/it]
100%|██████████| 10/10 [01:32<00:00,  9.23s/it]
100%|██████████| 10/10 [01:35<00:00,  9.51s/it]
100%|██████████| 10/10 [01:31<00:00,  9.14s/it]
100%|██████████| 10/10 [01:34<00:00,  9.47s/it]
100%|██████████| 10/10 [01:42<00:00, 10.20s/it]


In [82]:
# Persist every list of fitted random forests to its own pickle file
for fitted_forests, out_path in (
    (rf_20_cdf_filt_rf, '../../datas/proc/part1/rf_rf_20_cdf.pkl'),
    (rf_30_cdf_filt_rf, '../../datas/proc/part1/rf_rf_30_cdf.pkl'),
    (rf_40_cdf_filt_rf, '../../datas/proc/part1/rf_rf_40_cdf.pkl'),
    (rf_20_max_filt_rf, '../../datas/proc/part1/rf_rf_20_max.pkl'),
    (rf_30_max_filt_rf, '../../datas/proc/part1/rf_rf_30_max.pkl'),
    (rf_40_max_filt_rf, '../../datas/proc/part1/rf_rf_40_max.pkl'),
):
    baseline.save_models(fitted_forests, out_path)

In [54]:
# Score the random forests: CDF models with evaluate_cdfmodel, max models with evaluate_maxmodel
(rf_performance_table_20_filt_rf_cdf,
 rf_performance_table_30_filt_rf_cdf,
 rf_performance_table_40_filt_rf_cdf) = [
    metrics.evaluate_cdfmodel(forests, data_dict_filt_cdf_rf, suffix)
    for forests, suffix in ((rf_20_cdf_filt_rf, 20), (rf_30_cdf_filt_rf, 30), (rf_40_cdf_filt_rf, 40))
]
(rf_performance_table_20_filt_rf_max,
 rf_performance_table_30_filt_rf_max,
 rf_performance_table_40_filt_rf_max) = [
    metrics.evaluate_maxmodel(forests, data_dict_filt_max_rf, suffix)
    for forests, suffix in ((rf_20_max_filt_rf, 20), (rf_30_max_filt_rf, 30), (rf_40_max_filt_rf, 40))
]

In [55]:
# R2 tables of the random forests trained on the RF-filtered inputs.
# Each column is read as entry[column]['R2'] from the cdf or max evaluation results.
rf_r2_table_20_rf = pd.DataFrame({
    column: [entry[column]['R2'] for entry in performance]
    for column, performance in (
        ('train_cdf', rf_performance_table_20_filt_rf_cdf),
        ('train_max', rf_performance_table_20_filt_rf_max),
        ('val_cdf', rf_performance_table_20_filt_rf_cdf),
        ('val_max', rf_performance_table_20_filt_rf_max),
    )
})

rf_r2_table_30_rf = pd.DataFrame({
    column: [entry[column]['R2'] for entry in performance]
    for column, performance in (
        ('train_cdf', rf_performance_table_30_filt_rf_cdf),
        ('train_max', rf_performance_table_30_filt_rf_max),
        ('val_cdf', rf_performance_table_30_filt_rf_cdf),
        ('val_max', rf_performance_table_30_filt_rf_max),
    )
})

rf_r2_table_40_rf = pd.DataFrame({
    column: [entry[column]['R2'] for entry in performance]
    for column, performance in (
        ('train_cdf', rf_performance_table_40_filt_rf_cdf),
        ('train_max', rf_performance_table_40_filt_rf_max),
        ('val_cdf', rf_performance_table_40_filt_rf_cdf),
        ('val_max', rf_performance_table_40_filt_rf_max),
    )
})

In [56]:
# Display the R2 table of the random forests trained on the RF-filtered `Xtrain_20` inputs.
rf_r2_table_20_rf

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.117918,0.126835,-1.397651,-0.578028
1,0.119025,0.129944,-1.215597,-0.511947
2,0.097494,0.423967,-0.034402,-0.119711
3,0.126871,0.469533,-0.586515,-0.700799
4,0.11325,0.476016,-0.106133,-0.207587
5,0.267743,0.139279,-1.093989,-0.507585
6,0.115335,0.13097,-5.988968,-1.300984
7,0.331804,0.131472,-0.50454,-0.356864
8,0.126622,0.856539,-0.178389,-0.105939
9,0.356031,0.44417,-0.259318,-0.352756


In [57]:
# Display the R2 table of the random forests trained on the RF-filtered `Xtrain_30` inputs.
rf_r2_table_30_rf

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.124868,0.120604,-0.860921,-0.490594
1,0.123478,0.130951,-0.532654,-0.43998
2,0.123921,0.15305,-0.045057,-0.421206
3,0.133294,0.144425,-0.731399,-0.431167
4,0.131636,0.440998,-0.181343,-0.308409
5,0.396446,0.142239,-0.837578,-0.494209
6,0.126821,0.481909,-5.638997,-1.836701
7,0.121518,0.137936,-0.201422,-0.380448
8,0.146811,0.139767,-0.128154,-0.157458
9,0.119262,0.444823,-0.257231,-0.335185


In [58]:
# Display the R2 table of the random forests trained on the RF-filtered `Xtrain_40` inputs.
rf_r2_table_40_rf

Unnamed: 0,train_cdf,train_max,val_cdf,val_max
0,0.138744,0.124924,-0.847556,-0.505506
1,0.130619,0.122338,-0.515902,-0.435724
2,0.111039,0.132124,-0.057751,-0.023905
3,0.146905,0.14287,-1.056102,-0.437134
4,0.136353,0.630416,-0.195119,-0.237626
5,0.109924,0.459964,-0.467412,-0.510662
6,0.319648,0.144168,-6.62355,-1.223588
7,0.121313,0.144521,-0.179727,-0.368812
8,0.138001,0.145976,-0.187852,-0.132067
9,0.148486,0.134024,-0.248743,-0.304408


## Analysis

In [117]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Position of the highest validation CDF R2 in rf_r2_table_20; it selects both the model and its data
best_position_20 = rf_r2_table_20['val_cdf'].argmax()
bestmodel_20 = rf_20_cdf_filt[best_position_20]
best_data_20 = data_dict_filt[best_position_20]

# Standardise with statistics from the training inputs only, then apply them to the validation inputs
# NOTE(review): confirm that bestmodel_20 was trained on inputs scaled the same way.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(best_data_20['Xtrain_20'])
X_val_scaled = scaler.transform(best_data_20['Xvalid_20'])

In [126]:
# Validation R2 of the selected model, computed separately for every output column.
# The prediction is computed once (the original re-ran `predict` for each column) and the
# number of outputs is taken from the prediction shape instead of a hard-coded 15.
best_valid_targets_20 = data_dict_filt[rf_r2_table_20['val_cdf'].argmax()]['yvalid_cdf']
best_valid_preds_20 = bestmodel_20.predict(X_val_scaled)
[r2_score(best_valid_targets_20.iloc[:, i], best_valid_preds_20[:, i]) for i in range(best_valid_preds_20.shape[1])]

[-0.00451929000559681,
 0.042471714600479604,
 -0.07192502851009852,
 -0.1333637002480632,
 0.15992191387868115,
 -0.267926220902712,
 -0.09920916065126906,
 -0.1355024969073333,
 -0.10479935682070152,
 -0.2036641568787334,
 0.06918699022306907,
 -0.06844787309190736,
 0.015830570915514763,
 -0.4003989947203872,
 -0.3897839393305935]