# Evaluate Autogluon Multimodal SCF

Use the Autogluon AutoML library to predict ratings using both tabular data and earnings call transcripts.

In [1]:
# Set model name to evaluate
model_name =

In [2]:
# Packages
import pandas as pd
from autogluon.multimodal import MultiModalPredictor
import os
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Set seed
np.random.seed(222)

## Load Data

In [3]:
# list of files in '../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'
file_list = [f for f in os.listdir(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates') if f.endswith('.parquet')]
# read in all parquet files
df = pd.concat([pd.read_parquet(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates/' + f) for f in file_list])
print('dataframe')
print(df)

dataframe
    ticker fixed_quarter_date earnings_call_date Rating  \
0     AAPL         2014-07-01         2014-04-23     AA   
1     AAPL         2014-10-01         2014-07-22     AA   
2     AAPL         2015-01-01         2014-10-20     AA   
3     AAPL         2015-04-01         2015-01-27     AA   
4     AAPL         2015-07-01         2015-04-27     AA   
..     ...                ...                ...    ...   
912    ZTS         2015-10-01         2015-08-04    BBB   
913    ZTS         2016-01-01         2015-11-03    BBB   
914    ZTS         2016-04-01         2016-02-16    BBB   
915    ZTS         2016-07-01         2016-05-04    BBB   
916    ZTS         2016-10-01         2016-08-03    BBB   

                     Rating Agency Name rating_date      CR_source  \
0    Standard & Poor's Ratings Services  2014-05-27  Supplementary   
1    Standard & Poor's Ratings Services  2014-05-27  Supplementary   
2    Standard & Poor's Ratings Services  2014-05-27  Supplementary   
3

In [4]:
# Print out column names
print('column names')
for col in df.columns:
    print(col)

column names
ticker
fixed_quarter_date
earnings_call_date
Rating
Rating Agency Name
rating_date
CR_source
Rating Rank AAA is 10
Next Rating
Next Rating Date
Previous Rating
Previous Rating Date
next_rating_date_or_end_of_data
credit_rating_year
previous_fixed_quarter_date
days_since_call_on_fixed_quarter
quarter
calls_year
transcript
Calls_source
date
symbol
reportedCurrency
cik
fillingDate
acceptedDate
calendarYear
period
cashAndCashEquivalents
shortTermInvestments
cashAndShortTermInvestments
netReceivables
inventory
otherCurrentAssets
totalCurrentAssets
propertyPlantEquipmentNet
goodwill
intangibleAssets
goodwillAndIntangibleAssets
longTermInvestments
taxAssets
otherNonCurrentAssets
totalNonCurrentAssets
otherAssets
totalAssets
accountPayables
shortTermDebt
taxPayables
deferredRevenue
otherCurrentLiabilities
totalCurrentLiabilities
longTermDebt
deferredRevenueNonCurrent
deferredTaxLiabilitiesNonCurrent
otherNonCurrentLiabilities
totalNonCurrentLiabilities
otherLiabilities
capitalLeas

In [5]:
# Removing columns: 'Rating Rank AAA is 10', 'Investment_Grade', 'Change Direction Since Last Fixed Quarter Date', 'Change Since Last Fixed Quarter Date', 'Next Rating', 'Next Rating Date', 'next_rating_date_or_end_of_data'
df = df.drop(columns=['Rating Rank AAA is 10', 
                      'Investment_Grade', 
                      'Change Direction Since Last Fixed Quarter Date', 
                      'Change Since Last Fixed Quarter Date', 
                      'Next Rating', 
                      'Next Rating Date', 
                      'next_rating_date_or_end_of_data'])

In [6]:
# Get test df
test_df = df[df['train_test_80_20'] == 'test'].reset_index(drop=True)

## Load Model

In [7]:
# Load model
predictor = MultiModalPredictor.load('AutogluonModels/' + model_name)
predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x1f4a8c2ca50>

## Make Predictions

In [9]:
# Apply test
predictions = predictor.predict(test_df)
# Concatenate with test data values of 'ticker' and 'fixed_quarter_date'
# Use index values to line up
predictions = pd.concat([test_df[['ticker', 'fixed_quarter_date']], predictions], axis=1)
# Save to Excel
predictions.to_excel('../../../Data/Predictions/Autogluon/' + model_name + '_predictions.xlsx', index=False)
predictions

Unnamed: 0,ticker,fixed_quarter_date,Rating
0,AAPL,2016-10-01,AA
1,ABB,2014-10-01,A
2,ABB,2015-01-01,A
3,ABBV,2015-07-01,A
4,ABBV,2016-04-01,A
...,...,...,...
1396,XRAY,2015-10-01,A
1397,XRAY,2016-07-01,A
1398,XRAY,2016-10-01,A
1399,YUM,2016-07-01,BB


## Evaluation and Leaderboard

In [10]:
# Evaluation
predictor.evaluate(test_df, silent=True)

{'accuracy': 0.9343326195574589,
 'balanced_accuracy': 0.7448524291356307,
 'mcc': 0.915772066074118}

In [11]:
# Leaderboard of models
# Not sure this will work
try:
    leaderboard = predictor.leaderboard(test_df)
    # Save to Excel
    leaderboard.to_excel('../../../Output/Modelling/Autogluon/' + model_name + '_leaderboard.xlsx', index=False)
    leaderboard
except:
    print('Leaderboard not available')
    pass

  df = df.fillna(column_fills, inplace=False, downcast=False)


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.940757,0.927365,accuracy,0.07284,0.051668,325.680288,0.07284,0.051668,325.680288,1,True,8
1,RandomForestEntr,0.937188,0.925676,accuracy,0.148539,0.177409,11.334002,0.148539,0.177409,11.334002,1,True,7
2,ExtraTreesGini,0.936474,0.923986,accuracy,0.141466,0.131354,3.045713,0.141466,0.131354,3.045713,1,True,9
3,RandomForestGini,0.936474,0.925676,accuracy,0.173774,0.108412,6.938527,0.173774,0.108412,6.938527,1,True,6
4,ExtraTreesEntr,0.934333,0.927365,accuracy,0.14849,0.129039,2.834588,0.14849,0.129039,2.834588,1,True,10
5,WeightedEnsemble_L2,0.934333,0.9375,accuracy,0.498446,0.222644,32.820059,0.008834,0.0,0.615799,2,True,14
6,LightGBMXT,0.93005,0.929054,accuracy,0.348146,0.09129,29.158547,0.348146,0.09129,29.158547,1,True,4
7,LightGBM,0.927909,0.912162,accuracy,0.183751,0.048954,29.52114,0.183751,0.048954,29.52114,1,True,5
8,NeuralNetFastAI,0.927909,0.927365,accuracy,2.368931,0.057967,30.040244,2.368931,0.057967,30.040244,1,True,3
9,XGBoost,0.92434,0.912162,accuracy,0.205361,0.050456,44.95817,0.205361,0.050456,44.95817,1,True,11


In [None]:
# Keep columns model, score_test and output to LaTeX
# Rename to 'Model' and 'Test Accuracy'
try:
    leaderboard[['model', 'score_test']].rename(columns={'model': 'Model', 'score_test': 'Test Accuracy'}).to_latex('../../../Output/Modelling/Autogluon/' + model_name + '_leaderboard.tex', index=False)
except:
    print('Leaderboard not available')
    pass

## Hyperparameters

In [12]:
# Model info including hyperparameters
# Not sure this will work
try:
    pred_info = predictor.info()
    # Get model hyperparameters
    list_of_models = pred_info['model_info'].keys()
    # List of dataframes to fill
    list_of_dfs = []
    # Iterate over models
    for model in list_of_models:
        # Get hyperparameters
        hyperparameters = pred_info['model_info'][model]['hyperparameters']
        # Convert to dataframe
        df = pd.DataFrame.from_dict(hyperparameters, orient='index')
        # Add model name
        df['model'] = model
        # Append to list
        list_of_dfs.append(df)
    # Concatenate all dataframes
    hyperparameters_df = pd.concat(list_of_dfs).reset_index().rename(columns={'index': 'hyperparameter', 0: 'value'})[['model', 'hyperparameter', 'value']]
    # Save to Excel
    hyperparameters_df.to_excel('../../../Output/Modelling/Autogluon/' + model_name + '_hyperparameters.xlsx', index=False)
    hyperparameters_df
except:
    print('Hyperparameters not available')
    pass


Unnamed: 0,model,hyperparameter,value
0,KNeighborsUnif,weights,uniform
1,KNeighborsDist,weights,distance
2,NeuralNetFastAI,layers,
3,NeuralNetFastAI,emb_drop,0.1
4,NeuralNetFastAI,ps,0.1
...,...,...,...
75,LightGBMLarge,min_data_in_leaf,3.0
76,WeightedEnsemble_L2,use_orig_features,False
77,WeightedEnsemble_L2,max_base_models,25
78,WeightedEnsemble_L2,max_base_models_per_type,5


## Feature Importance via Permutation

In [None]:
# Feature importance
# Not sure this will work
try:
    # Feature importance
    fi = predictor.feature_importance(test_df)
    # Save to Excel
    fi.to_excel('../../../Output/Modelling/Autogluon/' + model_name + '_feature_importance.xlsx', index=False)
    # Output 10 most important items to LaTeX
    # Rename importance to 'Average Drop in Accuracy'
    # Rename stddev to 'Standard Deviation'
    # Rename pvalue to 'P-Value'
    fi.reset_index().rename(columns={'index': 'feature'})[['feature', 'importance', 'stddev', 'p_value']].rename(columns={'feature': 'Feature', 'importance': 'Average Drop in Accuracy', 'stddev': 'Standard Deviation', 'p_value': 'P-Value'}).head(10).to_latex('../../../Output/Modelling/Autogluon/' + model_name + '_feature_importance.tex', index=False)
    # Print entire df
    pd.set_option('display.max_rows', None)
    fi
except:
    print('Feature importance not available')
    pass