# Model 3 Permutation Importance

In [1]:
# Packages
# Setup cell: imports, import path, and the model identifier used by every
# later cell in this notebook.
import sys
import os
from sklearn.inspection import permutation_importance
# Add the parent directory to sys.path
# os.path.abspath('..') resolves against the current working directory, so this
# assumes the kernel is started from the notebook's own folder.
# Presumably the parent folder is where Logistic_Regression_Functions lives — TODO confirm.
sys.path.append(os.path.abspath('..'))
# Import module
# NOTE(review): wildcard import. Names used later without an explicit import in
# this notebook (load_data, get_column_names_and_mapping, prepare_matrices,
# joblib, pd) are expected to come in through this line.
from Logistic_Regression_Functions import *
# Model name
# Passed to get_column_names_and_mapping, used to build the input/output
# artifact paths, and inspected for the 'include_previous_rating' /
# 'exclude_previous_rating' substrings when labelling the one-hot columns.
model_name = 'include_previous_rating_model_3'


## Standard Model Code

In [2]:
# Load the data
# load_data is not defined in this notebook; it is expected to come from the
# wildcard import of Logistic_Regression_Functions in the setup cell.
# The returned object is handed to prepare_matrices in a later cell.
df = load_data()

In [13]:
# Resolve, for the configured model, the numeric feature names, the categorical
# feature names, the target column, and the custom mapping.
(
    numeric_feature_columns,
    cat_feature_columns,
    target_column,
    custom_mapping,
) = get_column_names_and_mapping(model_name)

reportedCurrency
acceptedDate_balance_sheet
acceptedDate_cash_flow_statement
acceptedDate_income_statement
financial_statement_date
filingDate
Sector
rating_on_previous_fixed_quarter_date


In [11]:
# Build the model matrices. Only the second and fourth return values (the
# scaled test features and the test targets) are kept; the first and third are
# discarded because this notebook only evaluates an already-trained model.
_, X_test_scaled, _, y_test = prepare_matrices(
    df,
    numeric_feature_columns,
    cat_feature_columns,
    target_column,
    custom_mapping,
)

preprocessor
['num__EBIT' 'num__common_plus_preferred_stock' 'num__workingCapital' ...
 'cat__rating_on_previous_fixed_quarter_date_CC'
 'cat__rating_on_previous_fixed_quarter_date_CCC'
 'cat__rating_on_previous_fixed_quarter_date_D']
  (0, 0)	-0.5292083154557484
  (0, 1)	-0.3452357944326435
  (0, 2)	-0.28849489822962626
  (0, 3)	1.3236777929080492
  (0, 4)	-0.28882846595864414
  (0, 5)	2.345294086356715
  (0, 6)	-0.22753661813440856
  (0, 7)	0.7025535518052359
  (0, 8)	-0.33336618881781444
  (0, 9)	-1.2577122590728849
  (0, 10)	-0.32802069485569113
  (0, 11)	-1.494733330875567
  (0, 12)	-0.6628498012202643
  (0, 13)	-0.3734995248030832
  (0, 14)	-0.13790526090634894
  (0, 15)	-0.9761690277975827
  (0, 16)	-0.6242719269680083
  (0, 17)	-0.2629491036802998
  (0, 18)	-0.5400106266715411
  (0, 19)	-0.7053855438520991
  (0, 20)	0.48332880436155207
  (0, 21)	-0.5426978179211365
  (0, 22)	-0.6718872275297538
  (0, 23)	1.582486798849689
  (0, 24)	0.10556923691851272
  :	:
  (1091, 106)	-0.556

## Load Trained Model

In [5]:
# Restore the best estimator saved for this model from the modelling output folder.
# NOTE(review): joblib.load unpickles the file — only load artifacts you produced yourself.
model = joblib.load(
    f'../../../../Output/Modelling/Logistic Regression/{model_name}/{model_name}_best_estimator.pkl'
)
# Bare expression so the notebook displays the estimator
model

## Permutation Importance

In [6]:
# Perform permutation importance on the test set.
# The test matrix is converted with .toarray() before being passed in.
# The sklearn result is kept under its own name (perm_result) so that `result`
# is only ever the summary DataFrame and this cell can be re-run on its own.
perm_result = permutation_importance(model, X_test_scaled.toarray(), y_test, n_repeats=1000, random_state=222, n_jobs=-1)

# Check lengths
print('shape X: ', X_test_scaled.shape)
print('num + cat: ', len(numeric_feature_columns + cat_feature_columns))
print('importances: ', len(perm_result.importances_mean))
print('stds: ', len(perm_result.importances_std))

# Expanded (one-hot) feature column names, defined once per categorical feature
rating_onehot_cols = [
    'cat__rating_on_previous_fixed_quarter_date_A',
    'cat__rating_on_previous_fixed_quarter_date_AA',
    'cat__rating_on_previous_fixed_quarter_date_AAA',
    'cat__rating_on_previous_fixed_quarter_date_B',
    'cat__rating_on_previous_fixed_quarter_date_BB',
    'cat__rating_on_previous_fixed_quarter_date_BBB',
    'cat__rating_on_previous_fixed_quarter_date_C',
    'cat__rating_on_previous_fixed_quarter_date_CC',
    'cat__rating_on_previous_fixed_quarter_date_CCC',
    'cat__rating_on_previous_fixed_quarter_date_D',
]
sector_onehot_cols = [
    'cat__Sector_Communication Services',
    'cat__Sector_Consumer Discretionary',
    'cat__Sector_Consumer Staples',
    'cat__Sector_Energy',
    'cat__Sector_Financials',
    'cat__Sector_Health Care',
    'cat__Sector_Industrials',
    'cat__Sector_Information Technology',
    'cat__Sector_Materials',
    'cat__Sector_Real Estate',
    'cat__Sector_Utilities',
]

# NOTE(review): the labels below are positional. The feature names printed by
# prepare_matrices end with the rating columns, whereas this list ends with the
# Sector columns — confirm the order here matches the column order of
# X_test_scaled (ideally by taking the names from the fitted preprocessor).
if 'include_previous_rating' in model_name:
    # Version for include previous rating
    expanded_cat_cols = rating_onehot_cols + sector_onehot_cols
elif 'exclude_previous_rating' in model_name:
    # Version for exclude previous rating: the previous-rating dummies are not
    # model inputs, so only the Sector dummies remain. (The earlier version of
    # this branch listed the rating dummies instead.)
    expanded_cat_cols = sector_onehot_cols
else:
    # Fail with a clear message instead of a NameError further down
    raise ValueError(
        "model_name must contain 'include_previous_rating' or "
        "'exclude_previous_rating', got: " + repr(model_name)
    )

feature_names = numeric_feature_columns + expanded_cat_cols

print('with expanded')
print(len(feature_names))

# Every importance needs exactly one label; stop before writing a mislabelled file
if len(feature_names) != len(perm_result.importances_mean):
    raise ValueError(
        'Feature label count (' + str(len(feature_names)) + ') does not match '
        'the number of importances (' + str(len(perm_result.importances_mean)) + ')'
    )

print('importances mean')
print(perm_result.importances_mean)

print('imp std')
print(perm_result.importances_std)

# Put column name, mean and std in a dataframe
result = pd.DataFrame({'feature': feature_names, 'mean': perm_result.importances_mean, 'std': perm_result.importances_std})

# Output to disk
result.to_parquet(f'../../../../Output/Modelling/Logistic Regression/{model_name}/{model_name}_permutation_importance.parquet', index=False)

result

shape X:  (1127, 142)
num + cat:  123
importances:  142
stds:  142
with expanded
142
importances mean
[ 0.00000000e+00  0.00000000e+00 -7.77284827e-04  7.79059450e-04
  4.23247560e-04 -2.81277728e-04 -8.49157054e-04  3.51375333e-04
  0.00000000e+00  1.86335404e-05 -2.04081633e-05 -7.29370009e-04
  1.19787045e-04  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -1.05590062e-04  1.58828749e-04 -6.33540373e-04
  0.00000000e+00  6.59272405e-04 -7.68411713e-04  1.06477374e-05
 -3.81543922e-05  5.42147294e-04 -5.32386868e-05  8.66903283e-04
  1.77462289e-05 -1.77462289e-06 -1.08251996e-04 -9.76042591e-06
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -3.10559006e-04
  8.09228039e-04  4.17036380e-05  0.00000000e+00 -2.30700976e-05
  0.00000000e+00 -8.53593611e-04 -4.38331854e-04  0.00000000e+00
  0.00000000e+00  3.19432121e-05  0.00000000e+00  0.00000000e+00
 -1.05590062e-04  3.46051464e-05  7.63087844e-05  2.30700976e-05
  0.00000000e+00  3.19432121e-05 -1.86335404e-05  5.3

Unnamed: 0,feature,mean,std
0,cashAndCashEquivalents,0.000000,0.000000
1,shortTermInvestments,0.000000,0.000000
2,cashAndShortTermInvestments,-0.000777,0.000308
3,netReceivables,0.000779,0.000314
4,inventory_balance_sheet,0.000423,0.000443
...,...,...,...
137,cat__Sector_Industrials,-0.000170,0.000349
138,cat__Sector_Information Technology,0.000000,0.000000
139,cat__Sector_Materials,0.000000,0.000000
140,cat__Sector_Real Estate,0.000000,0.000000
