# Replication Sample

This notebook replicates the main results for our most important model: an XGBoost classifier that excludes the previous rating and uses the financial variables and NLP features as predictors. Hyperparameters are fixed rather than tuned by grid search.

In [1]:
# Packages
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
# permutation importance
from sklearn.inspection import permutation_importance
# Kill warnings
# Installs a global "ignore" filter, so every warning category raised after
# this cell runs is silenced for the rest of the session.
# NOTE(review): this also hides deprecation and data-conversion warnings from
# the libraries used below — consider narrowing the filter when debugging.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Set model name
# The trailing model number selects the matching 'Rating Model <N>' column of
# the variable index, which in turn defines the features and target used below.
model_name = 'exclude_previous_rating_model_3'

## Load Data

In [3]:
# Load replication_sample.parquet (path is relative to the working directory).
# Later cells read its 'train_test_80_20', 'ticker' and 'fixed_quarter_date'
# columns, plus the feature and target columns listed in the variable index.
df = pd.read_parquet('replication_sample.parquet')

## Get Column Names

In [4]:
# Load the variable index. This cell reads three kinds of columns from it:
# 'column_name', 'Data Type', and one 'Rating Model <N>' column per model in
# which 'X' marks a feature and 'Y' marks the target.
variable_index = pd.read_excel('Variable Index.xlsx')

# Column of the variable index describing this model.
# The model number is the run of digits at the end of model_name; taking only
# the last character would pick the wrong column for model numbers >= 10.
model_number_match = re.search(r'\d+$', model_name)
if model_number_match is None:
    raise ValueError(f"model_name must end with the model number, got {model_name!r}")
clean_model_name = 'Rating Model ' + model_number_match.group()

# Row masks: variables flagged as features for this model, and numeric variables
is_feature = variable_index[clean_model_name] == 'X'
is_numeric = variable_index['Data Type'] == 'Numeric'

# Numeric features: flagged 'X' and Data Type is Numeric
numeric_feature_columns = variable_index.loc[is_feature & is_numeric, 'column_name'].tolist()
# Categorical features: flagged 'X' and Data Type is anything other than Numeric
cat_feature_columns = variable_index.loc[is_feature & ~is_numeric, 'column_name'].tolist()

# Target column: first column_name flagged 'Y' in the clean_model_name column
target_candidates = variable_index.loc[variable_index[clean_model_name] == 'Y', 'column_name']
if target_candidates.empty:
    raise ValueError(f"No target (Y) is marked in variable index column {clean_model_name!r}")
target_column = target_candidates.iloc[0]

# Mapping from rating label to integer class code for the target column
custom_mapping = {'AAA': 0, 'AA': 1, 'A': 2, 'BBB': 3, 'BB': 4, 'B': 5, 'CCC': 6, "CC": 7, "C": 8, "D": 9}

## Prepare Matrices

In [5]:
# Split into train/test using the precomputed 'train_test_80_20' flag and
# order rows by ticker and quarter so both matrices have a stable row order.
train_df = df[df['train_test_80_20'] == 'train'].sort_values(by=['ticker', 'fixed_quarter_date'])
test_df = df[df['train_test_80_20'] == 'test'].sort_values(by=['ticker', 'fixed_quarter_date'])

# Feature frames: numeric columns first, categorical columns after
train_numeric_X = train_df[numeric_feature_columns]
train_cat_X = train_df[cat_feature_columns]
test_numeric_X = test_df[numeric_feature_columns]
test_cat_X = test_df[cat_feature_columns]
X_train = pd.concat([train_numeric_X, train_cat_X], axis=1)
X_test = pd.concat([test_numeric_X, test_cat_X], axis=1)

# Encode the target as integer class codes
y_train = train_df[target_column].map(custom_mapping)
y_test = test_df[target_column].map(custom_mapping)

# Series.map turns any rating missing from custom_mapping into NaN without
# complaint; fail here with the offending labels rather than further downstream.
for split_name, raw_labels, encoded_labels in (
    ('train', train_df[target_column], y_train),
    ('test', test_df[target_column], y_test),
):
    unmapped = encoded_labels.isna()
    if unmapped.any():
        raise ValueError(
            f"{split_name} split has ratings not covered by custom_mapping: "
            f"{raw_labels[unmapped].unique().tolist()}"
        )

# Preprocessing: standardise numeric features, one-hot encode categoricals.
# Categories unseen during fit are ignored at transform time.
numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_feature_columns),
        ('cat', cat_transformer, cat_feature_columns)
    ]
)
# Fit on the training split only, then apply the fitted transform to the test split
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Output feature names, prefixed by transformer name ('num__' / 'cat__')
feature_names = preprocessor.get_feature_names_out()
print('feature names: ')
print(feature_names)

feature names: 
['num__EBIT' 'num__common_plus_preferred_stock' 'num__workingCapital'
 'num__Ratio_A' 'num__Ratio_B' 'num__Ratio_C' 'num__Ratio_D'
 'num__Ratio_E' 'num__Positiv' 'num__Negativ' 'num__Strong' 'num__Weak'
 'num__Active' 'num__Passive' 'num__Ovrst' 'num__Undrst'
 'num__cashAndCashEquivalents' 'num__shortTermInvestments'
 'num__cashAndShortTermInvestments' 'num__netReceivables'
 'num__inventory_balance_sheet' 'num__otherCurrentAssets'
 'num__totalCurrentAssets' 'num__propertyPlantEquipmentNet'
 'num__goodwill' 'num__intangibleAssets'
 'num__goodwillAndIntangibleAssets' 'num__longTermInvestments'
 'num__taxAssets' 'num__otherNonCurrentAssets'
 'num__totalNonCurrentAssets' 'num__otherAssets' 'num__totalAssets'
 'num__accountPayables' 'num__shortTermDebt' 'num__taxPayables'
 'num__deferredRevenue' 'num__otherCurrentLiabilities'
 'num__totalCurrentLiabilities' 'num__longTermDebt'
 'num__deferredRevenueNonCurrent' 'num__deferredTaxLiabilitiesNonCurrent'
 'num__otherNonCurrentLia

## Train Model

In [6]:
# Fixed hyperparameters (no grid search is run in this notebook)
hyperparameters = {'booster': 'gbtree', 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 20, 'min_child_weight': 5, 'n_estimators': 1000, 'objective': 'multi:softprob'}

# Define the XGBoost model.
# The class count comes from the training labels: test labels must not shape
# the model configuration, and a class missing from the test split would
# otherwise undercount the classes the model is trained on.
model = xgb.XGBClassifier(num_class=y_train.nunique(), n_jobs=-1, **hyperparameters)

# Fit model on the preprocessed training matrix
model.fit(X_train_scaled, y_train)

## Evaluate

In [7]:
# Model prediction
# Predicted labels for the preprocessed test matrix; the evaluation cell
# compares them against y_test.
y_pred = model.predict(X_test_scaled)

In [8]:
# Test-set accuracy, reported next to the share of the most frequent class
# in the test labels as a naive reference point.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
test_class_shares = y_test.value_counts(normalize=True)
majority_class_share_baseline = test_class_shares.max()

print('Accuracy: ', accuracy)
print('Majority class share baseline: ', majority_class_share_baseline)

Accuracy:  0.909660107334526
Majority class share baseline:  0.32468694096601075


## Permutation Importance

In [9]:
# Permutation importance of each preprocessed feature on the test split.
# Only 5 repeats are used here to keep the runtime short; the seed is fixed
# so the shuffles are repeatable.
pi = permutation_importance(
    estimator=model,
    X=X_test_scaled,
    y=y_test,
    n_repeats=5,
    random_state=222,
    n_jobs=-1,
)


In [10]:
# Tabulate each feature's mean and std importance across repeats,
# ordered from most to least important.
pi_df = (
    pd.DataFrame(
        {
            'feature': feature_names,
            'mean': pi.importances_mean,
            'std': pi.importances_std,
        }
    )
    .sort_values(by='mean', ascending=False)
)
pi_df

Unnamed: 0,feature,mean,std
49,num__retainedEarnings,0.036494,0.003551
118,num__marketCap,0.032558,0.005589
80,num__dividendsPaid,0.018605,0.002291
132,num__debtRatio,0.013238,0.004172
51,num__othertotalStockholdersEquity,0.012522,0.002592
...,...,...,...
140,num__operatingCashFlowToSales,-0.001431,0.002008
73,num__purchasesOfInvestments,-0.001610,0.000669
72,num__acquisitionsNet,-0.001610,0.000669
67,num__accountsPayables,-0.001789,0.000980
