# Other Classifiers on GNN Data

In [1]:
# Packages
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib
from stattotex import *
# Kill warnings: the "ignore" action is installed with no category or module
# restriction, so it applies to every warning raised after this point.
# NOTE(review): this also hides deprecation and pandas copy-related warnings
# that would otherwise flag real problems in the cells below - confirm that a
# blanket filter is intended.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Identifier of the rating model configuration used throughout this notebook
# (selects the variable-index column and the saved model artefacts).
model_name = "exclude_previous_rating_model_3"

## Load Data

In [3]:
# Read the combined feature/class table and display it as the cell output
feature_and_class_path = '../../../Data/Learnable Network/feature_and_class_df.xlsx'
df = pd.read_excel(feature_and_class_path)
df

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,...,Ovrst,Undrst,PN,SW,AP,OU,tone,num_q_by_len,train_test_80_20,node
0,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,364,131,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,train,1142
1,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,465,152,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,train,2685
2,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,AA,2015-05-28,AA,2014-05-27,2015-05-28,...,468,151,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,train,3985
3,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,AA,2015-08-25,AA,2015-05-28,2015-08-25,...,415,135,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,train,4681
4,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,AA,2016-05-20,AA,2015-06-02,2016-05-20,...,449,148,4.209877,10.442857,2.579909,3.033784,1.815531,0.003915,train,2690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4824,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,BBB,2015-11-03,BBB,2014-01-31,2015-11-03,...,298,148,3.611650,15.634615,2.911215,2.013514,1.744657,0.001458,train,4150
4825,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,BBB,2016-01-22,BBB,2015-01-30,2016-01-22,...,395,222,3.766917,15.848101,2.791667,1.779279,1.596294,0.003859,train,833
4826,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,469,217,3.565517,17.506849,2.926829,2.161290,2.287146,0.003928,train,3236
4827,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,449,215,3.572650,15.235294,3.023715,2.088372,1.739992,0.003182,train,871


## Get Column Names

In [4]:
# Load the variable index, which flags the role of each column for each model
variable_index = pd.read_excel('../../../Variable Index.xlsx')

# Model name column
# The variable-index column is 'Rating Model ' plus the model number. Take the
# whole trailing run of digits of model_name rather than only its last
# character, so that model numbers of 10 and above select the right column.
model_number = model_name[len(model_name.rstrip('0123456789')):]
clean_model_name = 'Rating Model ' + model_number

# Rows marked 'X' in the model's column are its features
is_feature = variable_index[clean_model_name] == 'X'

# Numeric features
# Values of column_name where clean_model_name is X, and Data Type is Numeric
numeric_feature_columns = variable_index.loc[
    is_feature & (variable_index['Data Type'] == 'Numeric'), 'column_name'
].tolist()
# Categorical features
# Values of column_name where clean_model_name is X, and Data Type is not Numeric
cat_feature_columns = variable_index.loc[
    is_feature & (variable_index['Data Type'] != 'Numeric'), 'column_name'
].tolist()

# Target column
# First value of column_name where the clean_model_name column is Y
# (raises IndexError if no row is marked Y for this model)
target_column = variable_index.loc[variable_index[clean_model_name] == 'Y', 'column_name'].values[0]

# Mapping for target column
rating_to_int = pd.read_excel('../../../Data/Learnable Network/rating_to_int.xlsx')
# Create dictionary to encode the target variable: rating label -> integer code
custom_mapping = dict(zip(rating_to_int['Rating'], rating_to_int['Rating_int']))
print(custom_mapping)

{'AAA': 0, 'AA': 1, 'A': 2, 'BBB': 3, 'BB': 4, 'B': 5, 'CCC': 6, 'C': 7, 'D': 8}


## Prepare Matrices

In [5]:
# Split the data by the precomputed 80/20 label; each partition is ordered by
# ticker and quarter
partitions = {
    split: df[df['train_test_80_20'] == split].sort_values(by=['ticker', 'fixed_quarter_date'])
    for split in ('train', 'test')
}
train_df = partitions['train']
test_df = partitions['test']

# Feature frames: numeric columns first, then categorical columns
train_numeric_X, train_cat_X = train_df[numeric_feature_columns], train_df[cat_feature_columns]
test_numeric_X, test_cat_X = test_df[numeric_feature_columns], test_df[cat_feature_columns]
X_train = pd.concat([train_numeric_X, train_cat_X], axis=1)
X_test = pd.concat([test_numeric_X, test_cat_X], axis=1)

# Targets encoded as integers through the rating mapping
y_train = train_df[target_column].map(custom_mapping)
y_test = test_df[target_column].map(custom_mapping)

# Preprocessing: standardise numeric columns and one-hot encode categorical
# columns; the transformers are fitted on the training partition only and then
# applied to the test partition
numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')
column_steps = [
    ('num', numeric_transformer, numeric_feature_columns),
    ('cat', cat_transformer, cat_feature_columns),
]
preprocessor = ColumnTransformer(transformers=column_steps)
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Names of the transformed feature columns
feature_names = preprocessor.get_feature_names_out()
print('feature names: ')
print(feature_names)

feature names: 
['num__EBIT' 'num__common_plus_preferred_stock' 'num__workingCapital'
 'num__Ratio_A' 'num__Ratio_B' 'num__Ratio_C' 'num__Ratio_D'
 'num__Ratio_E' 'num__Positiv' 'num__Negativ' 'num__Strong' 'num__Weak'
 'num__Active' 'num__Passive' 'num__Ovrst' 'num__Undrst'
 'num__cashAndCashEquivalents' 'num__shortTermInvestments'
 'num__cashAndShortTermInvestments' 'num__netReceivables'
 'num__inventory_balance_sheet' 'num__otherCurrentAssets'
 'num__totalCurrentAssets' 'num__propertyPlantEquipmentNet'
 'num__goodwill' 'num__intangibleAssets'
 'num__goodwillAndIntangibleAssets' 'num__longTermInvestments'
 'num__taxAssets' 'num__otherNonCurrentAssets'
 'num__totalNonCurrentAssets' 'num__otherAssets' 'num__totalAssets'
 'num__accountPayables' 'num__shortTermDebt' 'num__taxPayables'
 'num__deferredRevenue' 'num__otherCurrentLiabilities'
 'num__totalCurrentLiabilities' 'num__longTermDebt'
 'num__deferredRevenueNonCurrent' 'num__deferredTaxLiabilitiesNonCurrent'
 'num__otherNonCurrentLia

## Retrain Model

### Logistic Regression

In [6]:
# Load hyperparameters
# The path is derived from model_name so that it always matches the model
# selected at the top of the notebook.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code - only load parameter files that come from a trusted source.
lr_hyperparameters = joblib.load(
    f'../../../Output/Modelling/Logistic Regression/{model_name}/{model_name}_best_params.pkl'
)
print('Hyperparameters: ')
print(lr_hyperparameters)

# Define logistic regression model
# A fixed random_state makes stochastic solvers such as 'saga' reproducible
# between runs; a random_state stored in the loaded hyperparameters, if any,
# takes precedence because set_params is applied afterwards.
lr_model = LogisticRegression(random_state=0)

# Fit model
lr_model.set_params(**lr_hyperparameters)
lr_model.fit(X_train_scaled, y_train)

Hyperparameters: 
{'C': 0.1, 'class_weight': 'balanced', 'l1_ratio': 1.0, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}


In [7]:
# Predict integer class codes for the test partition
y_pred = lr_model.predict(X_test_scaled)
print(len(y_pred))

# Translate the integer codes back to rating labels and export them next to
# the identifying columns and the true target
int_to_rating = {code: rating for rating, code in custom_mapping.items()}
predicted_ratings = pd.Series(y_pred).map(int_to_rating)
identifier_columns = ['ticker', 'fixed_quarter_date', target_column]
output_df = test_df[identifier_columns].copy().reset_index(drop=True)
output_df['prediction'] = predicted_ratings
output_df.to_excel(
    '../../../Data/Predictions/Graph Neural Network/Other Classifiers on GNN Data/lr_retrain_predictions.xlsx',
    index=False,
)
print(output_df)

# Test accuracy (rounded to four decimals), also written to the LaTeX stats file
accuracy = round(accuracy_score(y_test, y_pred), 4)
print('Accuracy: ', accuracy)
stattotex(
    accuracy,
    'retrainLRAccuracy',
    '../../../Output/Modelling/Graph Neural Network/other_classifiers_on_gnn_data.tex',
)

# Baseline for comparison: share of the most frequent class among the test targets
class_shares = y_test.value_counts(normalize=True)
majority_class_share_baseline = class_shares.max()
print('Majority class share baseline: ', majority_class_share_baseline)

980
    ticker fixed_quarter_date Rating prediction
0     AAPL         2016-07-01     AA         AA
1     ABBV         2015-04-01      A         AA
2     ABBV         2016-04-01      A         AA
3      ABC         2012-04-01      A          A
4      ABC         2013-01-01      A          A
..     ...                ...    ...        ...
975    XOM         2016-01-01    AAA        AAA
976    YUM         2015-04-01    BBB         AA
977   ZBRA         2016-10-01      B        CCC
978    ZTS         2013-10-01    BBB        BBB
979    ZTS         2014-04-01    BBB        BBB

[980 rows x 4 columns]
Accuracy:  0.6398
Majority class share baseline:  0.32755102040816325


### XGBoost

In [8]:
# Set hyperparameters
xgb_hyperparameters = {'booster': 'gbtree', 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 20, 'min_child_weight': 5, 'n_estimators': 1000, 'objective': 'multi:softprob'}

# Define the XGBoost model
# The number of classes is taken from the training labels the model is fitted
# on, not from the test labels: a class absent from the test partition would
# otherwise under-count the classes.
xgb_model = xgb.XGBClassifier(num_class=y_train.nunique(), n_jobs=-1)

# Fit model
xgb_model.set_params(**xgb_hyperparameters)
xgb_model.fit(X_train_scaled, y_train)

In [9]:
# Predict integer class codes for the test partition
y_pred = xgb_model.predict(X_test_scaled)
print(len(y_pred))

# Build the export frame: identifying columns, true target, and the predicted
# rating label recovered by inverting the rating-to-integer mapping
code_to_label = dict((code, label) for label, code in custom_mapping.items())
output_df = (
    test_df[['ticker', 'fixed_quarter_date', target_column]]
    .copy()
    .reset_index(drop=True)
)
output_df['prediction'] = pd.Series(y_pred).map(code_to_label)
xgb_retrain_path = '../../../Data/Predictions/Graph Neural Network/Other Classifiers on GNN Data/xgb_retrain_predictions.xlsx'
output_df.to_excel(xgb_retrain_path, index=False)
print(output_df)

# Test accuracy (rounded to four decimals), also written to the LaTeX stats file
raw_accuracy = accuracy_score(y_test, y_pred)
accuracy = round(raw_accuracy, 4)
print('Accuracy: ', accuracy)
stats_tex_path = '../../../Output/Modelling/Graph Neural Network/other_classifiers_on_gnn_data.tex'
stattotex(accuracy, 'retrainXGBAccuracy', stats_tex_path)

# Baseline for comparison: share of the most frequent class among the test targets
majority_class_share_baseline = y_test.value_counts(normalize=True).max()
print('Majority class share baseline: ', majority_class_share_baseline)

980
    ticker fixed_quarter_date Rating prediction
0     AAPL         2016-07-01     AA         AA
1     ABBV         2015-04-01      A          A
2     ABBV         2016-04-01      A          A
3      ABC         2012-04-01      A          A
4      ABC         2013-01-01      A          A
..     ...                ...    ...        ...
975    XOM         2016-01-01    AAA        AAA
976    YUM         2015-04-01    BBB        BBB
977   ZBRA         2016-10-01      B          B
978    ZTS         2013-10-01    BBB        BBB
979    ZTS         2014-04-01    BBB        BBB

[980 rows x 4 columns]
Accuracy:  0.9031
Majority class share baseline:  0.32755102040816325


## Pre-Trained Model

### Logistic Regression

In [10]:
# Load predictions and target file of the pre-trained logistic regression.
# Path and prediction column are derived from model_name so they always match
# the model selected at the top of the notebook.
prediction_column = f'{model_name}_predictions'
pred_and_targets = pd.read_excel(
    f'../../../Data/Predictions/Logistic Regression/{model_name}/{model_name}_predictions.xlsx'
)

# Inner join with the test-set keys (ticker and fixed_quarter_date) so that
# only rows belonging to this notebook's test partition are kept.
# Both date columns are cast to string so the join keys compare equal.
# The keys are built in a separate frame instead of overwriting
# test_df['fixed_quarter_date'], so test_df keeps its original dtype for any
# cell that is (re-)run afterwards.
pred_and_targets['fixed_quarter_date'] = pred_and_targets['fixed_quarter_date'].astype(str)
test_keys = test_df[['ticker', 'fixed_quarter_date']].astype({'fixed_quarter_date': str})
pred_and_targets = pd.merge(pred_and_targets, test_keys, on=['ticker', 'fixed_quarter_date'], how='inner')
print(pred_and_targets)

# Save to '../../../Data/Predictions/Graph Neural Network/Other Classifiers on GNN Data/lr_pretrained_predictions.xlsx'
pred_and_targets.to_excel('../../../Data/Predictions/Graph Neural Network/Other Classifiers on GNN Data/lr_pretrained_predictions.xlsx', index=False)

# Accuracy of the pre-trained predictions on the retained rows (rounded to four
# decimals), also written to the LaTeX stats file
accuracy = round(accuracy_score(pred_and_targets['Rating'], pred_and_targets[prediction_column]), 4)
print('Accuracy: ', accuracy)
stattotex(accuracy, 'pretrainedLRAccuracy', '../../../Output/Modelling/Graph Neural Network/other_classifiers_on_gnn_data.tex')

    ticker fixed_quarter_date Rating  \
0     AAPL         2016-07-01     AA   
1     ABBV         2015-04-01      A   
2     ABBV         2016-04-01      A   
3      ABC         2012-04-01      A   
4      ABC         2013-01-01      A   
..     ...                ...    ...   
975    XOM         2016-01-01    AAA   
976    YUM         2015-04-01    BBB   
977   ZBRA         2016-10-01      B   
978    ZTS         2013-10-01    BBB   
979    ZTS         2014-04-01    BBB   

    exclude_previous_rating_model_3_predictions  
0                                            AA  
1                                            AA  
2                                           AAA  
3                                             A  
4                                             A  
..                                          ...  
975                                         AAA  
976                                          AA  
977                                           B  
978                

### XGBoost

In [11]:
# Load predictions and target file of the pre-trained XGBoost model.
# Path and prediction column are derived from model_name so they always match
# the model selected at the top of the notebook.
prediction_column = f'{model_name}_predictions'
pred_and_targets = pd.read_excel(
    f'../../../Data/Predictions/XGBoost/{model_name}/{model_name}_predictions.xlsx'
)

# Inner join with the test-set keys (ticker and fixed_quarter_date) so that
# only rows belonging to this notebook's test partition are kept.
# Both date columns are cast to string so the join keys compare equal.
# The keys are built in a separate frame instead of overwriting
# test_df['fixed_quarter_date'], so test_df keeps its original dtype for any
# cell that is (re-)run afterwards.
pred_and_targets['fixed_quarter_date'] = pred_and_targets['fixed_quarter_date'].astype(str)
test_keys = test_df[['ticker', 'fixed_quarter_date']].astype({'fixed_quarter_date': str})
pred_and_targets = pd.merge(pred_and_targets, test_keys, on=['ticker', 'fixed_quarter_date'], how='inner')
print(pred_and_targets)

# Save to '../../../Data/Predictions/Graph Neural Network/Other Classifiers on GNN Data/xgb_pretrained_predictions.xlsx'
pred_and_targets.to_excel('../../../Data/Predictions/Graph Neural Network/Other Classifiers on GNN Data/xgb_pretrained_predictions.xlsx', index=False)

# Accuracy of the pre-trained predictions on the retained rows (rounded to four
# decimals), also written to the LaTeX stats file
accuracy = round(accuracy_score(pred_and_targets['Rating'], pred_and_targets[prediction_column]), 4)
print('Accuracy: ', accuracy)
stattotex(accuracy, 'pretrainedXGBAccuracy', '../../../Output/Modelling/Graph Neural Network/other_classifiers_on_gnn_data.tex')

    ticker fixed_quarter_date Rating  \
0     AAPL         2016-07-01     AA   
1     ABBV         2015-04-01      A   
2     ABBV         2016-04-01      A   
3      ABC         2012-04-01      A   
4      ABC         2013-01-01      A   
..     ...                ...    ...   
975    XOM         2016-01-01    AAA   
976    YUM         2015-04-01    BBB   
977   ZBRA         2016-10-01      B   
978    ZTS         2013-10-01    BBB   
979    ZTS         2014-04-01    BBB   

    exclude_previous_rating_model_3_predictions  
0                                            AA  
1                                             A  
2                                             A  
3                                             A  
4                                             A  
..                                          ...  
975                                         AAA  
976                                          BB  
977                                           B  
978                