# Permutation Importance NLP Feature Ranks

In [1]:
# Packages
import pandas as pd
from stattotex import stattotex
# Feature names of particular interest, one per line for easier diffing
variables_of_interest = [
    'Passive',
    'Ovrst',
    'word_count',
    'num_transparency',
    'pos_score_finbert',
    'gf_score',
]

## Load Permutation Importance

In [2]:
# Load the saved permutation importance table (columns used below: 'feature', 'mean')
perm_importance = pd.read_parquet('../../../../Output/Modelling/Logistic Regression/exclude_previous_rating_model_3/exclude_previous_rating_model_3_permutation_importance.parquet')
# Drop a leading 'num__' or 'cat__' from the feature names. The pattern is anchored to the
# start of the string so that only the prefix is removed and the rest of a name is never altered.
perm_importance['feature'] = perm_importance['feature'].str.replace(r'^(?:num__|cat__)', '', regex=True)
# Rank on mean, descending (rank 1 = largest mean). method='min' keeps ranks whole numbers
# when means tie; the default 'average' produces fractional ranks (e.g. 2.5) that the
# integer cast would silently truncate.
perm_importance['rank'] = perm_importance['mean'].rank(method='min', ascending=False).astype(int)
# Sort by rank; mergesort is stable, so tied features keep their original relative order
perm_importance = perm_importance.sort_values('rank', kind='mergesort')
perm_importance

Unnamed: 0,feature,mean,std,rank
7,Ratio_E,0.070625,0.009156,1
13,Passive,0.056786,0.007741,2
136,Sector_Utilities,0.043208,0.005661,3
102,interestExpense,0.043019,0.007802,4
6,Ratio_D,0.041765,0.007607,5
...,...,...,...,...
131,Sector_Health Care,-0.002743,0.002935,133
86,cashAtBeginningOfPeriod,-0.003379,0.004213,134
67,accountsPayables,-0.003928,0.001926,135
11,Weak,-0.004171,0.004949,136


## Load Variable Index

In [3]:
# Read the variable index workbook into a DataFrame and preview it
variable_index = pd.read_excel(io='../../../../Variable Index.xlsx')
variable_index

Unnamed: 0,column_name,Clean Column Name,Variable Type,Data Type,Ratio?,Notes,Rating Model 1,Rating Model 2,Rating Model 3,Change Model 1,Change Model 2,Change Model 3
0,Altman_Z,Altman's Z Score,Altman's Z Score,Numeric,Y,,X,,,X,,
1,EBIT,EBIT,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
2,common_plus_preferred_stock,Common Plus Preferred Stock,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
3,workingCapital,Working Capital,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
4,Ratio_A,Ratio A,Constructed for Altman's Z,Numeric,Y,,,X,X,,X,X
...,...,...,...,...,...,...,...,...,...,...,...,...
200,operatingCashFlowPerShare_diff,Difference in Operating Cash Flow Per Share fr...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
201,freeCashFlowPerShare_diff,Difference in Free Cash Flow Per Share from pr...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
202,cashPerShare_diff,Difference in Cash Per Share from prior fixed ...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
203,operatingCashFlowToSales_diff,Difference in Operating Cash Flow to Sales fro...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X


In [4]:
# How many rows of the variable index fall under each 'Variable Type'
variable_index.loc[:, 'Variable Type'].value_counts()

Variable Type
Financial Statements          108
Additional Change Ratios       17
Additional Ratios              17
Constructed for Tone           12
Change Ratios                  11
Disallowed                      8
Constructed for Altman's Z      8
Metadata                        7
NLP Feature                     7
Predicted - Change              2
Predicted - Rating              2
Previous Rating                 1
Sector                          1
Train-Test Split                1
Transcript Text                 1
Market Capitalization           1
Altman's Z Score                1
Name: count, dtype: int64

## Merge

In [5]:
# File handed to stattotex for every exported rank
nlp_ranks_tex_path = '../../../../Output/Modelling/Logistic Regression/exclude_previous_rating_model_3/permutation_importance_nlp_ranks.tex'


def export_feature_rank(ranked, feature, macro_name, tex_path=nlp_ranks_tex_path):
    """Look up the rank of `feature` in `ranked` and hand it to stattotex.

    Parameters
    ----------
    ranked : pd.DataFrame
        Frame with 'feature' and 'rank' columns.
    feature : str
        Value to match in the 'feature' column.
    macro_name : str
        Passed to stattotex as its second argument.
    tex_path : str
        Passed to stattotex as its third argument.

    Returns
    -------
    The 'rank' value of the first row whose 'feature' equals `feature`.

    Raises
    ------
    LookupError
        If no row of `ranked` has this feature. Without this check the lookup
        would fail with an uninformative "index 0 is out of bounds" IndexError.
    """
    matches = ranked.loc[ranked['feature'] == feature, 'rank']
    if matches.empty:
        raise LookupError(
            f"Feature {feature!r} not found, so {macro_name!r} cannot be exported. "
            f"Available features: {sorted(ranked['feature'].astype(str))}"
        )
    # First match, mirroring a `.values[0]` lookup
    rank = matches.iloc[0]
    stattotex(rank, macro_name, tex_path)
    return rank


# Attach the variable index metadata to each ranked feature
combined = perm_importance.merge(variable_index, left_on='feature', right_on='column_name', how='left')
# Limit to Variable Type of 'Constructed for Tone', 'NLP Feature'
combined = combined[combined['Variable Type'].isin(['Constructed for Tone', 'NLP Feature'])]

# Export one rank per feature. The individual names are kept so that any other
# cell referring to them keeps working.
rneg = export_feature_rank(combined, 'Negativ', 'rNeg')
print(rneg)
pos_score_finbert = export_feature_rank(combined, 'pos_score_finbert', 'rPosScoreFinbert')
undrst = export_feature_rank(combined, 'Undrst', 'rUndrst')
num_transparency = export_feature_rank(combined, 'num_transparency', 'rNumTransparency')
word_count = export_feature_rank(combined, 'word_count', 'rWordCount')
gf_score = export_feature_rank(combined, 'gf_score', 'rGfScore')
combined

66


Unnamed: 0,feature,mean,std,rank,column_name,Clean Column Name,Variable Type,Data Type,Ratio?,Notes,Rating Model 1,Rating Model 2,Rating Model 3,Change Model 1,Change Model 2,Change Model 3
1,Passive,0.056786,0.007741,2,Passive,Passive Tone,Constructed for Tone,Numeric,,,,,X,,,X
8,word_count,0.035743,0.007747,9,word_count,Word Count,NLP Feature,Numeric,,,,,X,,,X
20,Ovrst,0.020056,0.006795,21,Ovrst,Overstated Tone,Constructed for Tone,Numeric,,,,,X,,,X
23,Active,0.018226,0.006393,24,Active,Active Tone,Constructed for Tone,Numeric,,,,,X,,,X
29,Strong,0.016614,0.00547,30,Strong,Strong Tone,Constructed for Tone,Numeric,,,,,X,,,X
37,Positiv,0.012316,0.005619,38,Positiv,Positive Tone,Constructed for Tone,Numeric,,,,,X,,,X
52,Undrst,0.007882,0.005675,53,Undrst,Understated Tone,Constructed for Tone,Numeric,,,,,X,,,X
58,num_q_by_len,0.006559,0.00423,59,num_q_by_len,Number of Questions Divided By Call Word Count,NLP Feature,Numeric,,"Replaces old questions variable, less colinear...",,,X,,,X
65,Negativ,0.005271,0.005317,66,Negativ,Negative Tone,Constructed for Tone,Numeric,,,,,X,,,X
88,num_transparency,0.002789,0.003413,89,num_transparency,Numeric Transparency,NLP Feature,Numeric,,,,,X,,,X
