# Other Classifiers on GNN Data

In [1]:
# Packages
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib
from stattotex import *
# Kill warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Identifier of the rating model evaluated in this notebook.
# Its trailing number selects the matching 'Rating Model <n>' column of the variable index.
model_name = "exclude_previous_rating_model_3"

## Load Data

In [3]:
# Read the feature-and-class table used for the learnable network into a DataFrame
feature_and_class_path = '../../../Data/Learnable Network/feature_and_class_df.xlsx'
df = pd.read_excel(feature_and_class_path)
# Leave the frame as the cell's last expression so the notebook renders a preview
df

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,...,Ovrst,Undrst,PN,SW,AP,OU,tone,num_q_by_len,train_test_80_20,node
0,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,364,131,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,train,272
1,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,465,152,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,train,1499
2,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,AA,2015-05-28,AA,2014-05-27,2015-05-28,...,468,151,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,train,814
3,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,AA,2015-08-25,AA,2015-05-28,2015-08-25,...,415,135,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,train,1888
4,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,AA,2016-05-20,AA,2015-06-02,2016-05-20,...,449,148,4.209877,10.442857,2.579909,3.033784,1.815531,0.003915,train,1224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1959,ZBRA,2016-01-01,2015-11-10,B,2015-09-03,B,2016-09-26,,,2016-09-26,...,431,202,2.662338,13.586667,2.768293,2.133663,0.982796,0.005321,train,369
1960,ZBRA,2016-04-01,2016-02-25,B,2015-09-03,B,2016-09-26,,,2016-09-26,...,449,173,3.991071,13.574713,2.382550,2.595376,1.925784,0.002704,train,839
1961,ZBRA,2016-07-01,2016-05-10,B,2015-09-03,B,2016-09-26,,,2016-09-26,...,394,197,2.671053,8.080357,2.446970,2.000000,-0.423271,0.002775,train,448
1962,ZBRA,2016-10-01,2016-08-09,B,2016-09-26,,,B,2015-09-03,2016-12-31,...,403,183,3.625954,12.151163,3.066116,2.202186,1.210391,0.003215,test,1286


## Get Column Names

In [4]:
# Load the variable index workbook; it marks, per rating model, which columns are
# features ('X') and which column is the target ('Y')
variable_index = pd.read_excel('../../../Variable Index.xlsx')

# Model name column
# Clean model name is 'Rating Model ' plus the model number. Take the whole trailing
# run of digits (not just the last character) so multi-digit model numbers resolve
# correctly; fall back to the last character when there are no trailing digits.
model_number = model_name[len(model_name.rstrip('0123456789')):] or model_name[-1]
clean_model_name = 'Rating Model ' + model_number

# Row masks reused for both feature lists
is_feature = variable_index[clean_model_name] == 'X'
is_numeric = variable_index['Data Type'] == 'Numeric'

# Numeric features
# Values of column_name where clean_model_name is X, and Data Type is Numeric
numeric_feature_columns = variable_index.loc[is_feature & is_numeric, 'column_name'].tolist()
# Categorical features
# Values of column_name where clean_model_name is X, and Data Type is not Numeric
cat_feature_columns = variable_index.loc[is_feature & ~is_numeric, 'column_name'].tolist()

# Target column
# First value of column_name where clean_model_name is Y
target_candidates = variable_index.loc[variable_index[clean_model_name] == 'Y', 'column_name']
if target_candidates.empty:
    raise ValueError(f"No target ('Y') column is flagged for '{clean_model_name}' in the variable index")
target_column = target_candidates.iloc[0]

# Mapping for target column
# Ratings listed in the order used for integer encoding (AAA first, D last)
rating_order = ['AAA', 'AA', 'A', 'BBB', 'BB', 'B', 'CCC', 'CC', 'C', 'D']
# Compute the ratings present in the data once instead of on every loop iteration
observed_ratings = set(df[target_column].dropna().unique())
# Keep only observed ratings and number them consecutively from 0
custom_mapping = {
    rating: code
    for code, rating in enumerate(r for r in rating_order if r in observed_ratings)
}
custom_mapping

{'AAA': 0,
 'AA': 1,
 'A': 2,
 'BBB': 3,
 'BB': 4,
 'B': 5,
 'CCC': 6,
 'C': 7,
 'D': 8}

## Prepare Matrices

In [5]:
# Split rows by the precomputed 'train_test_80_20' flag, sorted by ticker and quarter
split_sort_keys = ['ticker', 'fixed_quarter_date']
train_df = df[df['train_test_80_20'] == 'train'].sort_values(by=split_sort_keys)
test_df = df[df['train_test_80_20'] == 'test'].sort_values(by=split_sort_keys)

# Feature matrices: numeric columns first, then categorical columns
train_numeric_X = train_df[numeric_feature_columns]
train_cat_X = train_df[cat_feature_columns]
test_numeric_X = test_df[numeric_feature_columns]
test_cat_X = test_df[cat_feature_columns]
X_train = pd.concat([train_numeric_X, train_cat_X], axis=1)
X_test = pd.concat([test_numeric_X, test_cat_X], axis=1)

# Encode the target ratings as integers via custom_mapping
y_train = train_df[target_column].map(custom_mapping)
y_test = test_df[target_column].map(custom_mapping)
# .map() silently yields NaN for any rating that is missing or absent from
# custom_mapping; fail here with a clear message instead of deep inside model fitting
assert y_train.notna().all(), 'Training target contains missing or unmapped ratings'
assert y_test.notna().all(), 'Test target contains missing or unmapped ratings'

# Preprocessing: standardize numeric features and one-hot encode categorical ones.
# handle_unknown='ignore' lets categories unseen during fit pass through transform.
numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_feature_columns),
        ('cat', cat_transformer, cat_feature_columns)
    ]
)
# Fit on the training partition only, then apply the fitted transform to the test partition
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Retrieve the transformed feature names once and reuse them for display
feature_names = preprocessor.get_feature_names_out()
print('feature names: ')
print(feature_names)

feature names: 
['num__EBIT' 'num__common_plus_preferred_stock' 'num__workingCapital'
 'num__Ratio_A' 'num__Ratio_B' 'num__Ratio_C' 'num__Ratio_D'
 'num__Ratio_E' 'num__Positiv' 'num__Negativ' 'num__Strong' 'num__Weak'
 'num__Active' 'num__Passive' 'num__Ovrst' 'num__Undrst'
 'num__cashAndCashEquivalents' 'num__shortTermInvestments'
 'num__cashAndShortTermInvestments' 'num__netReceivables'
 'num__inventory_balance_sheet' 'num__otherCurrentAssets'
 'num__totalCurrentAssets' 'num__propertyPlantEquipmentNet'
 'num__goodwill' 'num__intangibleAssets'
 'num__goodwillAndIntangibleAssets' 'num__longTermInvestments'
 'num__taxAssets' 'num__otherNonCurrentAssets'
 'num__totalNonCurrentAssets' 'num__otherAssets' 'num__totalAssets'
 'num__accountPayables' 'num__shortTermDebt' 'num__taxPayables'
 'num__deferredRevenue' 'num__otherCurrentLiabilities'
 'num__totalCurrentLiabilities' 'num__longTermDebt'
 'num__deferredRevenueNonCurrent' 'num__deferredTaxLiabilitiesNonCurrent'
 'num__otherNonCurrentLia

## Retrain Model

### Logistic Regression

In [6]:
# Load hyperparameters
# Path is derived from model_name so it stays in sync with the configuration cell.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files produced by this project.
hyperparameters_path = f'../../../Output/Modelling/Logistic Regression/{model_name}/{model_name}_best_params.pkl'
hyperparameters = joblib.load(hyperparameters_path)
print('Hyperparameters: ')
print(hyperparameters)

# Define logistic regression model
# Fix the seed: stochastic solvers such as 'saga' shuffle the data, so without a
# random_state the reported accuracy can differ between runs. A random_state stored
# in the loaded hyperparameters would override this default in set_params below.
model = LogisticRegression(random_state=42)

# Fit model
model.set_params(**hyperparameters)
model.fit(X_train_scaled, y_train)

# Model prediction
y_pred = model.predict(X_test_scaled)

# Accuracy (also exported to the LaTeX stats file) and majority class share baseline
accuracy = round(accuracy_score(y_test, y_pred), 4)
print('Accuracy: ', accuracy)
stattotex(accuracy, 'retrainLRAccuracy', '../../../Output/Modelling/Graph Neural Network/other_classifiers_on_gnn_data.tex')
# Share of the most frequent class in the test labels, i.e. the accuracy of always
# predicting that class
majority_class_share_baseline = y_test.value_counts(normalize=True).max()
print('Majority class share baseline: ', majority_class_share_baseline)

Hyperparameters: 
{'C': 0.1, 'class_weight': 'balanced', 'l1_ratio': 1.0, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Accuracy:  0.6244
Majority class share baseline:  0.29850746268656714


### XGBoost

In [7]:
# Set hyperparameters
hyperparameters = {'booster': 'gbtree', 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 20, 'min_child_weight': 5, 'n_estimators': 1000, 'objective': 'multi:softprob'}

# Define the XGBoost model
# The number of classes comes from the label encoding (custom_mapping), not from the
# test labels: a class missing from the test split must not shrink the model's
# output space, and evaluation data should not determine model configuration.
model = xgb.XGBClassifier(num_class=len(custom_mapping), n_jobs=-1)

# Fit model
model.set_params(**hyperparameters)
model.fit(X_train_scaled, y_train)

# Model prediction
y_pred = model.predict(X_test_scaled)

# Accuracy (also exported to the LaTeX stats file) and majority class share baseline
accuracy = round(accuracy_score(y_test, y_pred), 4)
print('Accuracy: ', accuracy)
stattotex(accuracy, 'retrainXGBAccuracy', '../../../Output/Modelling/Graph Neural Network/other_classifiers_on_gnn_data.tex')
# Share of the most frequent class in the test labels, i.e. the accuracy of always
# predicting that class
majority_class_share_baseline = y_test.value_counts(normalize=True).max()
print('Majority class share baseline: ', majority_class_share_baseline)

Accuracy:  0.8209
Majority class share baseline:  0.29850746268656714


## Pre-Trained Model

### Logistic Regression

In [8]:
# Load predictions and target file
# Path and prediction column are derived from model_name so they stay in sync with
# the configuration cell
predictions_path = f'../../../Data/Predictions/Logistic Regression/{model_name}/{model_name}_predictions.xlsx'
prediction_column = f'{model_name}_predictions'
pred_and_targets = pd.read_excel(predictions_path)

# Inner join with the test rows on ticker and fixed_quarter_date
# Cast the date key to string on both sides so the join keys share a dtype.
# The test-side keys are cast on a separate frame so test_df itself is not mutated
# and re-running this cell does not depend on earlier in-place changes.
pred_and_targets['fixed_quarter_date'] = pred_and_targets['fixed_quarter_date'].astype(str)
test_keys = test_df[['ticker', 'fixed_quarter_date']].astype({'fixed_quarter_date': str})
pred_and_targets = pd.merge(pred_and_targets, test_keys, on=['ticker', 'fixed_quarter_date'], how='inner')
print(pred_and_targets)

# Accuracy of the stored predictions on the test rows (also exported to the LaTeX stats file)
accuracy = round(accuracy_score(pred_and_targets['Rating'], pred_and_targets[prediction_column]), 4)
print('Accuracy: ', accuracy)
stattotex(accuracy, 'pretrainedLRAccuracy', '../../../Output/Modelling/Graph Neural Network/other_classifiers_on_gnn_data.tex')

    ticker fixed_quarter_date Rating  \
0     AAPL         2016-07-01     AA   
1     ABBV         2015-04-01      A   
2     ABBV         2016-04-01      A   
3      ABC         2013-04-01      A   
4      ABC         2013-07-01      A   
..     ...                ...    ...   
397    XEL         2016-07-01      A   
398    XEL         2016-10-01      A   
399    XOM         2016-01-01    AAA   
400    YUM         2015-04-01    BBB   
401   ZBRA         2016-10-01      B   

    exclude_previous_rating_model_3_predictions  
0                                            AA  
1                                            AA  
2                                           AAA  
3                                             A  
4                                             A  
..                                          ...  
397                                         BBB  
398                                           A  
399                                         AAA  
400                

### XGBoost

In [9]:
# Load predictions and target file
# Path and prediction column are derived from model_name so they stay in sync with
# the configuration cell
predictions_path = f'../../../Data/Predictions/XGBoost/{model_name}/{model_name}_predictions.xlsx'
prediction_column = f'{model_name}_predictions'
pred_and_targets = pd.read_excel(predictions_path)

# Inner join with the test rows on ticker and fixed_quarter_date
# Cast the date key to string on both sides so the join keys share a dtype.
# The test-side keys are cast on a separate frame so test_df itself is not mutated
# and re-running this cell does not depend on earlier in-place changes.
pred_and_targets['fixed_quarter_date'] = pred_and_targets['fixed_quarter_date'].astype(str)
test_keys = test_df[['ticker', 'fixed_quarter_date']].astype({'fixed_quarter_date': str})
pred_and_targets = pd.merge(pred_and_targets, test_keys, on=['ticker', 'fixed_quarter_date'], how='inner')
print(pred_and_targets)

# Accuracy of the stored predictions on the test rows (also exported to the LaTeX stats file)
accuracy = round(accuracy_score(pred_and_targets['Rating'], pred_and_targets[prediction_column]), 4)
print('Accuracy: ', accuracy)
stattotex(accuracy, 'pretrainedXGBAccuracy', '../../../Output/Modelling/Graph Neural Network/other_classifiers_on_gnn_data.tex')

    ticker fixed_quarter_date Rating  \
0     AAPL         2016-07-01     AA   
1     ABBV         2015-04-01      A   
2     ABBV         2016-04-01      A   
3      ABC         2013-04-01      A   
4      ABC         2013-07-01      A   
..     ...                ...    ...   
397    XEL         2016-07-01      A   
398    XEL         2016-10-01      A   
399    XOM         2016-01-01    AAA   
400    YUM         2015-04-01    BBB   
401   ZBRA         2016-10-01      B   

    exclude_previous_rating_model_3_predictions  
0                                            AA  
1                                             A  
2                                             A  
3                                             A  
4                                             A  
..                                          ...  
397                                           A  
398                                           A  
399                                         AAA  
400                