In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd
import glob, os

# Add the path to the directory containing the module
import sys
sys.path.append('../../')
from util.ml import baseline, metrics

In [2]:
# Load the saved "rf_rf_20_cdf" baseline through the project's pickle helper.
# Later cells index the result by position (e.g. `[2]`) and call
# `.feature_importances_` / `.predict` on the entry, so it is presumably a
# sequence of fitted models -- TODO confirm against `baseline.load_pickle`.
rf_baseline_path = '../../datas/proc/part1/rf/rf_rf_20_cdf.pkl'
best_rf_rf_baseline = baseline.load_pickle(rf_baseline_path)

In [9]:

# Labels for the nine inputs of the baseline entry at position 2.
# NOTE(review): these names are hard-coded; they are assumed to be listed in
# the same order as the model's input columns -- confirm before relying on
# the pairing shown in the table.
selected_feature_names = [
    '10m_u_component_of_wind_std_PCA_2',
    'surface_pressure_mean_PCA_2',
    '10m_v_component_of_wind_max_PCA_1',
    'mean_sea_level_pressure_max_PCA_2',
    'surface_pressure_max_PCA_1',
    'surface_pressure_max_PCA_2',
    'geopotential_1000_std_PCA_2',
    '2m_dewpoint_temperature_std_PCA_2',
    '2m_dewpoint_temperature_std_PCA_3',
]

# Pair every importance value with its label; the frame is the cell output.
pd.DataFrame({
    'value': best_rf_rf_baseline[2].feature_importances_,
    'name': selected_feature_names,
})

Unnamed: 0,value,name
0,0.2,10m_u_component_of_wind_std_PCA_2
1,0.14,surface_pressure_mean_PCA_2
2,0.04,10m_v_component_of_wind_max_PCA_1
3,0.12,mean_sea_level_pressure_max_PCA_2
4,0.14,surface_pressure_max_PCA_1
5,0.08,surface_pressure_max_PCA_2
6,0.08,geopotential_1000_std_PCA_2
7,0.12,2m_dewpoint_temperature_std_PCA_2
8,0.08,2m_dewpoint_temperature_std_PCA_3


In [3]:
# Locate the per-seed folders; sorting keeps list positions stable across runs.
seed_docs = sorted(glob.glob('../../datas/seed_*/'))

# Saved random-forest variants per seed: target tag x numeric suffix.
# The nesting order below fixes the key order of each per-seed dict:
# cdf_20, cdf_30, cdf_40, max_20, max_30, max_40.
RF_TARGET_TAGS = ('cdf', 'max')
RF_SIZE_TAGS = (20, 30, 40)


def _load_pickled_model(path):
    """Unpickle a single saved model and close the file handle afterwards.

    Parameters
    ----------
    path : str
        Location of the ``.pkl`` file.

    Returns
    -------
    object
        Whatever object was pickled at ``path``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code, so only load files produced by
    this project. Pickled scikit-learn estimators are also tied to the library
    version that wrote them (see the scikit-learn model-persistence guide).
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# One dict per seed, keyed 'model_RF_<target>_<size>'.
model_dict = []
for seed_doc in seed_docs:
    model_dict.append({
        f'model_RF_{target}_{size}': _load_pickled_model(
            seed_doc + f'model_random_forest/model_{target}_{size}.pkl'
        )
        for target in RF_TARGET_TAGS
        for size in RF_SIZE_TAGS
    })

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Locate the per-seed folders; sorting keeps list positions stable across runs.
seed_docs = sorted(glob.glob('../../datas/seed_*/'))
if not seed_docs:
    # Fail early with a readable message instead of a NameError further down.
    raise FileNotFoundError("No seed folders match '../../datas/seed_*/'")

# Columns associated with convective_precipitation and large_scale_snowfall are
# filtered out (model variables, not easily retrievable in observations).
EXCLUDED_COLUMNS_REGEX = "^(convective_precipitation_|large_scale_snowfall_)"

# (dict-key prefix, file-name prefix) pairs; the loop order below reproduces
# the key order Xtrain_40/30/20, Xvalid_40/30/20, ytrain_cdf/max, yvalid_cdf/max.
X_SPLITS = (('Xtrain', 'X_train'), ('Xvalid', 'X_validation'))
Y_SPLITS = (('ytrain', 'y_train'), ('yvalid', 'y_validation'))
SIZE_TAGS = (40, 30, 20)
TARGET_TAGS = ('cdf', 'max')


def _removed_column_info(csv_path, pattern):
    """Describe the columns of ``csv_path`` whose names match ``pattern``.

    Only the header row is parsed (``nrows=0``), so no data rows are loaded.

    Returns
    -------
    tuple
        ``(matching_columns, all_columns, positions)`` where the first two are
        pandas ``Index`` objects and ``positions`` lists, for each matching
        column, its integer position within ``all_columns``.
    """
    header = pd.read_csv(csv_path, nrows=0)
    all_columns = header.columns
    matching_columns = header.filter(regex=pattern).columns
    all_names = list(all_columns)
    positions = [all_names.index(name) for name in matching_columns]
    return matching_columns, all_columns, positions


# One dict of inputs/outputs per seed.
data_dict = []
for seed_doc in seed_docs:
    data = {}
    # Input tables, passed through the project's column filter.
    for key_prefix, file_prefix in X_SPLITS:
        for size in SIZE_TAGS:
            data[f'{key_prefix}_{size}'] = baseline.filter_columns(
                pd.read_csv(seed_doc + f'{file_prefix}_{size}.csv'),
                EXCLUDED_COLUMNS_REGEX,
            )
    # Output tables, read as-is.
    for key_prefix, file_prefix in Y_SPLITS:
        for target in TARGET_TAGS:
            data[f'{key_prefix}_{target}'] = pd.read_csv(seed_doc + f'{file_prefix}_{target}.csv')
    data_dict.append(data)

# Column bookkeeping is taken from the last seed folder, stated explicitly here
# rather than relying on the loop variable surviving the loop.
# NOTE(review): assumes every seed folder shares the same column layout -- confirm.
reference_seed_doc = seed_docs[-1]

# For each size: names of the removed columns, all column names, and the
# positions of the removed columns within the unfiltered table.
column_name_40, varname_40, filtindex_40 = _removed_column_info(reference_seed_doc + 'X_train_40.csv', EXCLUDED_COLUMNS_REGEX)
column_name_30, varname_30, filtindex_30 = _removed_column_info(reference_seed_doc + 'X_train_30.csv', EXCLUDED_COLUMNS_REGEX)
column_name_20, varname_20, filtindex_20 = _removed_column_info(reference_seed_doc + 'X_train_20.csv', EXCLUDED_COLUMNS_REGEX)

In [6]:
def _importances_per_seed(model_key):
    """Return the `feature_importances_` of `model_key` for every entry of `model_dict`."""
    return [seed_models[model_key].feature_importances_ for seed_models in model_dict]


def _positive_positions(importances_per_seed):
    """For each seed, list the positions whose importance is strictly greater than 0."""
    return [
        [pos for pos, weight in enumerate(importances) if weight > 0]
        for importances in importances_per_seed
    ]


def _drop_conv_snow(index_40, index_30, index_20):
    """Run `baseline.filt_index_no_conv_snow` once per seed and split its result.

    The helper is called a single time per seed (instead of once per extracted
    element) and its result is unpacked by position: element 0 feeds the "20"
    list, element 1 the "30" list and element 2 the "40" list.

    Returns
    -------
    tuple of three lists
        ``(filtered_20, filtered_30, filtered_40)``, each with one entry per seed.
    """
    per_seed = [
        baseline.filt_index_no_conv_snow(idx_40, idx_30, idx_20, filtindex_40, filtindex_30, filtindex_20)
        for idx_40, idx_30, idx_20 in zip(index_40, index_30, index_20)
    ]
    return (
        [result[0] for result in per_seed],
        [result[1] for result in per_seed],
        [result[2] for result in per_seed],
    )


# Find the feature importance of the RF model (one array per seed)
feature_importance_20_cdf = _importances_per_seed('model_RF_cdf_20')
feature_importance_30_cdf = _importances_per_seed('model_RF_cdf_30')
feature_importance_40_cdf = _importances_per_seed('model_RF_cdf_40')
feature_importance_20_max = _importances_per_seed('model_RF_max_20')
feature_importance_30_max = _importances_per_seed('model_RF_max_30')
feature_importance_40_max = _importances_per_seed('model_RF_max_40')

# Find index where feature importance is greater than 0
index_20_cdf = _positive_positions(feature_importance_20_cdf)
index_30_cdf = _positive_positions(feature_importance_30_cdf)
index_40_cdf = _positive_positions(feature_importance_40_cdf)
index_20_max = _positive_positions(feature_importance_20_max)
index_30_max = _positive_positions(feature_importance_30_max)
index_40_max = _positive_positions(feature_importance_40_max)

# Process the index to remove the columns associated with convective_precipitation and large_scale_snowfall
index_20_filt_cdf, index_30_filt_cdf, index_40_filt_cdf = _drop_conv_snow(index_40_cdf, index_30_cdf, index_20_cdf)
index_20_filt_max, index_30_filt_max, index_40_filt_max = _drop_conv_snow(index_40_max, index_30_max, index_20_max)

# Filter the input data with the feature importance
data_dict_filt_cdf_rf = [
    baseline.filt_with_feature_importance(index_20_filt_cdf[ind], index_30_filt_cdf[ind], index_40_filt_cdf[ind], seed_data)
    for ind, seed_data in enumerate(data_dict)
]
data_dict_filt_max_rf = [
    baseline.filt_with_feature_importance(index_20_filt_max[ind], index_30_filt_max[ind], index_40_filt_max[ind], seed_data)
    for ind, seed_data in enumerate(data_dict)
]

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Alias for the baseline object loaded earlier from the rf_rf_20_cdf pickle.
bestmodel_20 = best_rf_rf_baseline

# Work with the filtered CDF data at list position 2 and its "20" input tables.
# NOTE(review): position 2 is hard-coded here and when indexing `bestmodel_20`
# later -- presumably both refer to the same seed; confirm.
selected_seed_data = data_dict_filt_cdf_rf[2]
size_tag = 20
train_inputs = selected_seed_data[f'Xtrain_{size_tag}']
valid_inputs = selected_seed_data[f'Xvalid_{size_tag}']

# Fit the scaler on the training inputs only, then apply the same
# transformation to both the training and the validation inputs.
scaler = StandardScaler().fit(train_inputs)
X_train_scaled = scaler.transform(train_inputs)
X_val_scaled = scaler.transform(valid_inputs)

In [17]:
def r2_manual(y_true, y_pred):
    """Coefficient of determination, R^2 = 1 - SS_res / SS_tot.

    Parameters
    ----------
    y_true : array-like
        Observed values (numpy array, pandas Series, list or tuple).
    y_pred : array-like
        Predicted values, matched to ``y_true`` by position.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot``. When ``y_true`` has zero variance the ratio is
        undefined; following the convention of scikit-learn's ``r2_score``,
        1.0 is returned for a perfect prediction and 0.0 otherwise.
    """
    # Plain Python sequences do not support arithmetic with a scalar.
    if isinstance(y_true, (list, tuple)):
        y_true = np.asarray(y_true, dtype=float)
    # Strip any pandas index from the predictions so the subtraction below is
    # positional; otherwise two Series with different index labels would be
    # aligned by label and the residuals silently turned into NaN.
    y_pred = np.asarray(y_pred)

    ss_total = np.sum((y_true - np.mean(y_true))**2)
    ss_residual = np.sum((y_true - y_pred)**2)

    # Constant target: avoid dividing by zero.
    if ss_total == 0:
        return 1.0 if ss_residual == 0 else 0.0
    return 1 - (ss_residual / ss_total)


In [18]:
# Validation R^2 of the baseline at position 2, one value per output column.
# The prediction does not depend on the column, so it is computed once
# instead of once per column inside the comprehension.
yvalid_cdf_selected = data_dict_filt_cdf_rf[2]['yvalid_cdf']
yvalid_cdf_predicted = bestmodel_20[2].predict(X_val_scaled)

# NOTE(review): the 15 is hard-coded -- presumably the number of CDF output
# columns; confirm against the shape of 'yvalid_cdf'.
[r2_manual(yvalid_cdf_selected.iloc[:, i], yvalid_cdf_predicted[:, i]) for i in range(15)]

[-0.0011731793446621008,
 0.020981857834270534,
 0.024183007637168852,
 -0.25969290595737826,
 0.02250231664585889,
 -0.025886143748740542,
 -0.12449226087826037,
 -0.013835018755522732,
 -0.1439959842950127,
 -0.17650393406430376,
 0.06330607160659141,
 0.15531777991584517,
 0.14608810876850897,
 -0.02070772941018184,
 -0.1821248377915814]