# Permutation Importance NLP Feature Ranks

In [73]:
# Packages
import pandas as pd
from stattotex import stattotex
# Feature names singled out as being of interest in this notebook
variables_of_interest = [
    'Passive',
    'Ovrst',
    'word_count',
    'num_transparency',
    'pos_score_finbert',
    'gf_score',
]

## Load Permutation Importance

In [74]:
# Load the permutation-importance table written for the
# 'exclude_previous_rating_model_3' XGBoost model.
perm_importance = pd.read_parquet('../../../../Output/Modelling/XGBoost/exclude_previous_rating_model_3/exclude_previous_rating_model_3_permutation_importance.parquet')
# Drop a leading 'num__' or 'cat__' from the feature names (presumably
# preprocessing-pipeline prefixes -- TODO confirm). The pattern is anchored to
# the start of the name so the same text in the middle of a name is left
# alone, and regex=True is explicit because the pandas default for `regex`
# differs between major versions.
perm_importance['feature'] = perm_importance['feature'].str.replace(r'^(?:num|cat)__', '', regex=True)
# Rank on mean importance, descending (rank 1 = largest mean).
# method='min' gives tied means the same whole-number rank. The default
# method='average' yields fractional ranks for ties, which an int cast would
# silently truncate into arbitrary values.
perm_importance['rank'] = perm_importance['mean'].rank(ascending=False, method='min').astype(int)
# Sort by rank; a stable sort keeps tied features in their existing order.
perm_importance = perm_importance.sort_values('rank', kind='stable')
perm_importance

Unnamed: 0,feature,mean,std,rank
49,retainedEarnings,0.043819,0.005761,1
118,marketCap,0.035169,0.005735,2
80,dividendsPaid,0.021455,0.004481,3
132,debtRatio,0.009987,0.003413,4
48,commonStock,0.009693,0.002248,5
...,...,...,...,...
2,workingCapital,-0.001648,0.001732,149
138,freeCashFlowPerShare,-0.001751,0.000764,150
89,freeCashFlow,-0.001911,0.000694,151
81,otherFinancingActivites,-0.002434,0.001301,152


## Load Variable Index

In [75]:
# Read the variable index workbook into a DataFrame and display it
variable_index = pd.read_excel(io='../../../../Variable Index.xlsx')
variable_index

Unnamed: 0,column_name,Clean Column Name,Variable Type,Data Type,Ratio?,Notes,Rating Model 1,Rating Model 2,Rating Model 3,Change Model 1,Change Model 2,Change Model 3
0,Altman_Z,Altman's Z Score,Altman's Z Score,Numeric,Y,,X,,,X,,
1,EBIT,EBIT,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
2,common_plus_preferred_stock,Common Plus Preferred Stock,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
3,workingCapital,Working Capital,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
4,Ratio_A,Ratio A,Constructed for Altman's Z,Numeric,Y,,,X,X,,X,X
...,...,...,...,...,...,...,...,...,...,...,...,...
200,operatingCashFlowPerShare_diff,Difference in Operating Cash Flow Per Share fr...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
201,freeCashFlowPerShare_diff,Difference in Free Cash Flow Per Share from pr...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
202,cashPerShare_diff,Difference in Cash Per Share from prior fixed ...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
203,operatingCashFlowToSales_diff,Difference in Operating Cash Flow to Sales fro...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X


In [76]:
# Count how many rows of the variable index fall under each 'Variable Type'
variable_index.loc[:, 'Variable Type'].value_counts()

Variable Type
Financial Statements          108
Additional Change Ratios       17
Additional Ratios              17
Constructed for Tone           12
Change Ratios                  11
Disallowed                      8
Constructed for Altman's Z      8
Metadata                        7
NLP Feature                     7
Predicted - Change              2
Predicted - Rating              2
Previous Rating                 1
Sector                          1
Train-Test Split                1
Transcript Text                 1
Market Capitalization           1
Altman's Z Score                1
Name: count, dtype: int64

## Merge

In [77]:
# Attach the variable-index metadata to each ranked feature. The left join
# keeps every row of perm_importance, matched on feature == column_name.
combined = perm_importance.merge(variable_index, left_on='feature', right_on='column_name', how='left')
# Limit to Variable Type of 'Constructed for Tone', 'NLP Feature'
combined = combined[combined['Variable Type'].isin(['Constructed for Tone', 'NLP Feature'])]
# Note: no further restriction to `variables_of_interest` is applied here.

# Every rank below is passed to stattotex with this same .tex path.
nlp_ranks_tex = '../../../../Output/Modelling/XGBoost/exclude_previous_rating_model_3/permutation_importance_nlp_ranks.tex'


def _export_feature_rank(feature, macro_name):
    """Look up `feature`'s rank in `combined` and pass it to stattotex.

    Parameters
    ----------
    feature : str
        Value to match against the 'feature' column of `combined`.
    macro_name : str
        Name handed to stattotex as its second argument (presumably the
        LaTeX macro name -- TODO confirm against the stattotex docs).

    Returns
    -------
    The 'rank' value of the first matching row.

    Raises
    ------
    IndexError
        If no row of `combined` has this feature. Same exception type as a
        bare `.values[0]` on an empty selection, but the message names the
        missing feature.
    """
    matching_ranks = combined.loc[combined['feature'] == feature, 'rank'].values
    if len(matching_ranks) == 0:
        raise IndexError(
            f"feature {feature!r} (for {macro_name!r}) has no row in `combined`; "
            "check the feature name and its 'Variable Type' in the variable index"
        )
    feature_rank = matching_ranks[0]
    stattotex(feature_rank, macro_name, nlp_ranks_tex)
    return feature_rank


# Rank for Negativ
rneg = _export_feature_rank('Negativ', 'rNeg')
print(rneg)
# Rank for pos_score_finbert
pos_score_finbert = _export_feature_rank('pos_score_finbert', 'rPosScoreFinbert')
# Rank for Undrst
undrst = _export_feature_rank('Undrst', 'rUndrst')
# Rank for num_transparency
num_transparency = _export_feature_rank('num_transparency', 'rNumTransparency')
# Rank for word_count
word_count = _export_feature_rank('word_count', 'rWordCount')
# Rank for gf_score
gf_score = _export_feature_rank('gf_score', 'rGfScore')
combined

39


Unnamed: 0,feature,mean,std,rank,column_name,Clean Column Name,Variable Type,Data Type,Ratio?,Notes,Rating Model 1,Rating Model 2,Rating Model 3,Change Model 1,Change Model 2,Change Model 3
38,Negativ,0.001677,0.001105,39,Negativ,Negative Tone,Constructed for Tone,Numeric,,,,,X,,,X
41,pos_score_finbert,0.001424,0.001538,42,pos_score_finbert,FinBERT Positivity Score,NLP Feature,Numeric,,Supersedes pos_score,,,X,,,X
51,Undrst,0.000966,0.001224,52,Undrst,Understated Tone,Constructed for Tone,Numeric,,,,,X,,,X
67,num_transparency,0.000681,0.001024,68,num_transparency,Numeric Transparency,NLP Feature,Numeric,,,,,X,,,X
69,Ovrst,0.000568,0.001041,70,Ovrst,Overstated Tone,Constructed for Tone,Numeric,,,,,X,,,X
71,Positiv,0.000522,0.000965,72,Positiv,Positive Tone,Constructed for Tone,Numeric,,,,,X,,,X
75,tone,0.000372,0.001062,76,tone,First Principal Component of Tone,NLP Feature,Numeric,,,,,X,,,X
86,num_q_by_len,9.9e-05,0.000858,87,num_q_by_len,Number of Questions Divided By Call Word Count,NLP Feature,Numeric,,"Replaces old questions variable, less colinear...",,,X,,,X
89,Strong,8.5e-05,0.000727,90,Strong,Strong Tone,Constructed for Tone,Numeric,,,,,X,,,X
92,Weak,2.3e-05,0.001397,93,Weak,Weak Tone,Constructed for Tone,Numeric,,,,,X,,,X
