## 5. Feature Importance and Permutation feature importance
In this notebook we are looking at the feature importances for the Random Forest Regressor.

In [31]:
# load modules
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from sklearn.model_selection import cross_val_predict
from modeling.features import get_feature_combinations
from sklearn.svm import SVR
from lightgbm import LGBMRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler


In [32]:
# loading the dataset
data = pd.read_csv('../data/GEFCom2014Data/Wind/clean_data.csv', 
                    parse_dates= ['TIMESTAMP'],
                    index_col= 'TIMESTAMP' )

In [33]:
# train-test-split and get features
data_train = data[:'2013-07-01 00:00:00']
data_test = data['2013-07-01 01:00:00']
feature_dict = get_feature_combinations()

### Feature importance

In [36]:
from sklearn.inspection import permutation_importance

df_feature_imp = pd.DataFrame()
#model_obj = RandomForestRegressor()

result_dict = {}
result_dict['RandomForestRegressor'] = pd.read_csv('../results/RandomForestRegressor.csv', index_col='ZONE')
result_dict['RandomForestRegressor']['ZONEID'] = range(1,11) 
df_model = result_dict['RandomForestRegressor']    
df_model
 
for zone in df_model.ZONEID.unique():
    # obtain best derived feature combination for zone an model
    fc = df_model[df_model.ZONEID == zone]['FC'].values[0]
    
    # obtain data frame for zone
    data_train_zone = data_train[data_train.ZONEID == zone]
    data_test_zone = data_test[data_test.ZONEID == zone]

    # split in X and y
    X_train = data_train_zone[feature_dict[fc]]
    y_train = data_train_zone.TARGETVAR
    X_test = data_test_zone[feature_dict[fc]]
    y_test = data_test_zone.TARGETVAR

    # obtain best model params
    best_params = df_model[df_model.ZONEID == zone]['BEST_PARAMS'].values[0]
    
    # define and fit model
    model = RandomForestRegressor().set_params(**ast.literal_eval(best_params)).fit(X_train, y_train)

    tmp_df_feat_imp = pd.DataFrame(index=feature_dict[fc])
    tmp_df_feat_imp['MODEL'] = 'RandomForestRegressor'
    tmp_df_feat_imp['FC'] = fc
    tmp_df_feat_imp['ZONE'] = zone
    #tmp_df_feat_imp['FEAT_IMP'] = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, 
     #                                                   scoring='neg_root_mean_squared_error')['importances_mean']
    print('zone: ',zone)
    result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
    print("result :", result.importances_mean)                                                  
    df_feature_imp = df_feature_imp.append(tmp_df_feat_imp)

#df_feature_imp


zone:  1
result : [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan]
zone:  2


KeyboardInterrupt: 

Plotting the feature importance for Random Forest

In [None]:
df_feature_imp.to_csv('../results/feature_imp_perm.csv')
df_feature_imp['FEATURE'] = df_feature_imp.index

df_feature_imp_RF = df_feature_imp[df_feature_imp.MODEL == "RandomForestRegressor"]

custom_params = {"axes.spines.right": False, "axes.spines.top": False}

sns.set_theme(style = 'whitegrid', rc = custom_params, palette="dark:b", font_scale= 1.5)

plt.rcParams['figure.figsize'] = (10, 5)
plt.rcParams['figure.dpi'] = 400

feat_imp_rf = df_feature_imp_RF.groupby(['FEATURE', 'MODEL']).sum() / df_feature_imp_RF.groupby(['FEATURE']).count()

feat_imp_rf = feat_imp_rf.droplevel('MODEL').drop('ZONE', axis = 1).sort_values(by = 'FEAT_IMP', ascending = False)
feat_imp_rf

sns.barplot(x = feat_imp_rf.head(5).index,y = feat_imp_rf['FEAT_IMP'].head(5))
plt.ylabel('Feature Importance')



### Permutation feature importance