# Evaluate AutoGluon Tabular-Only SCF

Use the AutoGluon AutoML library to predict credit ratings locally from tabular data (mostly financial statement variables). This notebook loads a previously trained predictor and evaluates it on the held-out test split.

In [13]:
# Name of the saved AutoGluon run to evaluate.
# Used below as the model's sub-directory under 'AutogluonModels/' and as the
# filename prefix of every exported Excel / LaTeX file.
model_name = 'ag-20240331_233723'

In [14]:
# Packages
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
import os

## Load Data

In [15]:
# Directory containing the input parquet files
data_dir = r'../../../Data/All_Data/All_Data_with_NLP_Features'
# os.listdir returns entries in arbitrary order; sort the names so the rows are
# concatenated in the same order on every run and on every machine
file_list = sorted(f for f in os.listdir(data_dir) if f.endswith('.parquet'))
# Read every parquet file and stack them into a single dataframe
# (each file keeps its own row index; the test split resets the index later)
df = pd.concat([pd.read_parquet(os.path.join(data_dir, f)) for f in file_list])
print('dataframe')
print(df)

dataframe
    ticker fixed_quarter_date earnings_call_date Rating rating_date  \
0     AAPL         2014-10-01         2014-07-22     AA  2014-05-27   
1     AAPL         2015-01-01         2014-10-20     AA  2014-05-27   
2     AAPL         2015-04-01         2015-01-27     AA  2015-02-18   
3     AAPL         2015-07-01         2015-04-27     AA  2015-06-02   
4     AAPL         2015-10-01         2015-07-21     AA  2015-08-25   
..     ...                ...                ...    ...         ...   
468    ZTS         2015-10-01         2015-08-04    BBB  2015-01-30   
469    ZTS         2016-01-01         2015-11-03    BBB  2015-11-03   
470    ZTS         2016-04-01         2016-02-16    BBB  2016-01-22   
471    ZTS         2016-07-01         2016-05-04    BBB  2016-01-22   
472    ZTS         2016-10-01         2016-08-03    BBB  2016-01-22   

     Rating Rank AAA is 10 Next Rating Next Rating Date Previous Rating  \
0                        9          AA       2015-02-18       

In [16]:
# List every column of the loaded dataframe, one name per line
print('column names')
for column_name in df.columns.tolist():
    print(column_name)

column names
ticker
fixed_quarter_date
earnings_call_date
Rating
rating_date
Rating Rank AAA is 10
Next Rating
Next Rating Date
Previous Rating
Previous Rating Date
next_rating_date_or_end_of_data
credit_rating_year
previous_fixed_quarter_date
days_since_call_on_fixed_quarter
days_since_rating
for_quarter
for_year
transcript
reportedCurrency
acceptedDate_balance_sheet
cashAndCashEquivalents
shortTermInvestments
cashAndShortTermInvestments
netReceivables
inventory_balance_sheet
otherCurrentAssets
totalCurrentAssets
propertyPlantEquipmentNet
goodwill
intangibleAssets
goodwillAndIntangibleAssets
longTermInvestments
taxAssets
otherNonCurrentAssets
totalNonCurrentAssets
otherAssets
totalAssets
accountPayables
shortTermDebt
taxPayables
deferredRevenue
otherCurrentLiabilities
totalCurrentLiabilities
longTermDebt
deferredRevenueNonCurrent
deferredTaxLiabilitiesNonCurrent
otherNonCurrentLiabilities
totalNonCurrentLiabilities
otherLiabilities
capitalLeaseObligations
totalLiabilities
preferredSto

In [17]:
# Columns removed from the dataframe before the test split is taken.
# NOTE(review): presumably these leak the target / future information or were
# not part of the tabular-only feature set — confirm against the training notebook.
columns_to_drop = [
    'Rating Rank AAA is 10',
    'transcript',
    'Investment_Grade',
    'Change Direction Since Last Fixed Quarter Date',
    'Change Since Last Fixed Quarter Date',
    'Next Rating',
    'Next Rating Date',
    'next_rating_date_or_end_of_data',
]
df = df.drop(columns_to_drop, axis='columns')

In [18]:
# Keep only the rows flagged 'test' in the split column, renumbered from 0
is_test_row = df['train_test_80_20'].eq('test')
test_df = df.loc[is_test_row].reset_index(drop=True)

## Load Model

In [19]:
# Restore the trained predictor from its directory under AutogluonModels/
model_dir = f'AutogluonModels/{model_name}'
predictor = TabularPredictor.load(model_dir)
predictor

Found 1 mismatches between original and current metadata:


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x1b1f71273d0>

## Make Predictions

In [20]:
# Convert from pandas to autogluon: wrap the test dataframe in a TabularDataset,
# which is the object passed to the predictor in the cells below
test_data = TabularDataset(test_df)

In [21]:
# Predict a label for every row of the test set
predicted_labels = predictor.predict(test_data)
# Identifier columns that make each prediction traceable to a company and quarter
id_columns = test_df[['ticker', 'fixed_quarter_date']]
# Place identifiers and predictions side by side; rows are matched on index values
predictions = pd.concat([id_columns, predicted_labels], axis=1)
# Save to Excel
predictions_path = f'../../../Data/Predictions/Autogluon/{model_name}_predictions.xlsx'
predictions.to_excel(predictions_path, index=False)
predictions

Unnamed: 0,ticker,fixed_quarter_date,Rating
0,AAPL,2016-10-01,AA
1,ABC,2013-01-01,A
2,ABC,2013-10-01,A
3,ABG,2015-10-01,BB
4,ABG,2016-07-01,BB
...,...,...,...
893,YUM,2014-10-01,BBB
894,YUM,2015-04-01,BBB
895,ZTS,2013-10-01,BBB
896,ZTS,2014-10-01,BBB


## Evaluation and Leaderboard

In [22]:
# Windows path fix
# NOTE(review): presumably needed because the saved predictor contains pickled
# pathlib.PosixPath objects, which cannot be instantiated on Windows — confirm.
import pathlib
# Keep a handle on the real class so the alias can be undone later
temp = pathlib.PosixPath
# Only alias on Windows. On any other OS the swap is unnecessary and would make
# every later pathlib.Path(...) call fail, because Path would resolve to WindowsPath.
if os.name == 'nt':
    pathlib.PosixPath = pathlib.WindowsPath

In [23]:
# Evaluation: score the loaded predictor on the test set.
# The recorded output is a dict with 'accuracy', 'balanced_accuracy' and 'mcc'.
# NOTE(review): silent=True presumably suppresses the metric log lines — confirm.
predictor.evaluate(test_data, silent=True)

{'accuracy': 0.9276169265033407,
 'balanced_accuracy': 0.8125894790057449,
 'mcc': 0.9067760733084685}

In [24]:
# Per-model leaderboard computed on the test set
leaderboard = predictor.leaderboard(test_data)
# Save to Excel
leaderboard_path = f'../../../Output/Modelling/Autogluon/{model_name}_leaderboard.xlsx'
leaderboard.to_excel(leaderboard_path, index=False)
leaderboard

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\ijyli\anaconda3\envs\computervision\Lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\ijyli\AppData\Local\Temp\ipykernel_17340\3309450358.py", line 2, in <module>
    leaderboard = predictor.leaderboard(test_data)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ijyli\anaconda3\envs\computervision\Lib\site-packages\autogluon\tabular\predictor\predictor.py", line 2324, in leaderboard
    return self._learner.leaderboard(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ijyli\anaconda3\envs\computervision\Lib\site-packages\autogluon\tabular\learner\abstract_learner.py", line 849, in leaderboard
    leaderboard = self.score_debug(
                  ^^^^^^^^^^^^^^^^^
  File "c:\Users\ijyli\anaconda3\envs\computervision\Lib\site-packages\autogluon\tabular\learner\abstract_learner.py", line 551, in score_debug
    scores

In [None]:
# Export a two-column LaTeX table: the model name and its test score,
# with the headers renamed to 'Model' and 'Test Accuracy'
leaderboard_latex_path = f'../../../Output/Modelling/Autogluon/{model_name}_leaderboard.tex'
(
    leaderboard.loc[:, ['model', 'score_test']]
    .rename(columns={'model': 'Model', 'score_test': 'Test Accuracy'})
    .to_latex(leaderboard_latex_path, index=False)
)

## Hyperparameters

In [None]:
# Model info including hyperparameters
pred_info = predictor.info()
# Names of all models held by the predictor
list_of_models = pred_info['model_info'].keys()
# One small dataframe per model, collected here and concatenated afterwards
list_of_dfs = []
# Iterate over models
for model in list_of_models:
    # Get hyperparameters
    hyperparameters = pred_info['model_info'][model]['hyperparameters']
    # One row per hyperparameter: the name becomes the index, the value lands in column 0.
    # A dedicated name is used on purpose: calling this `df` would overwrite the
    # dataset loaded at the top of the notebook and break any re-run of earlier cells.
    model_hyperparameters_df = pd.DataFrame.from_dict(hyperparameters, orient='index')
    # Add model name
    model_hyperparameters_df['model'] = model
    # Append to list
    list_of_dfs.append(model_hyperparameters_df)
# Concatenate all dataframes, turn the hyperparameter-name index into a column
# and keep a tidy (model, hyperparameter, value) layout
hyperparameters_df = pd.concat(list_of_dfs).reset_index().rename(columns={'index': 'hyperparameter', 0: 'value'})[['model', 'hyperparameter', 'value']]
# Save to Excel
hyperparameters_df.to_excel('../../../Output/Modelling/Autogluon/' + model_name + '_hyperparameters.xlsx', index=False)
hyperparameters_df


Unnamed: 0,model,hyperparameter,value
0,KNeighborsUnif,weights,uniform
1,KNeighborsDist,weights,distance
2,NeuralNetFastAI,layers,
3,NeuralNetFastAI,emb_drop,0.1
4,NeuralNetFastAI,ps,0.1
...,...,...,...
75,LightGBMLarge,min_data_in_leaf,3.0
76,WeightedEnsemble_L2,use_orig_features,False
77,WeightedEnsemble_L2,max_base_models,25
78,WeightedEnsemble_L2,max_base_models_per_type,5


## Feature Importance via Permutation

In [None]:
# Feature importance
# Feature importance on the test set (permutation-based, per the section heading).
# The recorded output is a table indexed by feature name with importance,
# stddev, p_value, n, p99_high and p99_low columns.
fi = predictor.feature_importance(test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['Rating Agency Name', 'Calls_source', 'symbol', 'reportedCurrency', 'calendarYear', 'period', 'totalLiabilitiesAndTotalEquity', 'cik_cash_flow_statement', 'operatingCashFlow', 'cik_income_statement', 'Company Name', 'train_test_80_20']


In [None]:
# Save to Excel
# The feature names live in the index of `fi`, so the index must be written out;
# with index=False the sheet would hold importance scores with no feature attached.
fi.to_excel('../../../Output/Modelling/Autogluon/' + model_name + '_feature_importance.xlsx', index=True, index_label='feature')

In [None]:
# Export the first 10 rows of the feature-importance table to LaTeX.
# The feature names are moved out of the index into a 'feature' column, then the
# kept columns get presentation headers:
#   importance -> 'Average Drop in Accuracy', stddev -> 'Standard Deviation', p_value -> 'P-Value'
fi_latex_path = f'../../../Output/Modelling/Autogluon/{model_name}_feature_importance.tex'
(
    fi.reset_index()
    .rename(columns={'index': 'feature'})
    .loc[:, ['feature', 'importance', 'stddev', 'p_value']]
    .rename(columns={
        'feature': 'Feature',
        'importance': 'Average Drop in Accuracy',
        'stddev': 'Standard Deviation',
        'p_value': 'P-Value',
    })
    .head(10)
    .to_latex(fi_latex_path, index=False)
)

In [None]:
# Lift the row-display limit so the whole feature-importance table is shown
# (this setting stays in effect for every frame displayed afterwards)
pd.options.display.max_rows = None
fi

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
rating_on_previous_fixed_quarter_date AAA is 10,0.02207885,0.005204,0.000344,5,0.032794,0.011364
Description,0.01204301,0.002174,0.000122,5,0.01652,0.007566
rating_on_previous_fixed_quarter_date,0.009318996,0.003324,0.001652,5,0.016163,0.002475
rating_date,0.006451613,0.001963,0.000913,5,0.010494,0.002409
Comment,0.005878136,0.001711,0.000773,5,0.009402,0.002354
Previous Rating Date,0.005017921,0.002912,0.009125,5,0.011013,-0.000978
Previous Rating,0.003584229,0.000717,0.000182,5,0.00506,0.002108
interestExpense,0.001863799,0.000393,0.000223,5,0.002672,0.001055
interestIncome,0.00172043,0.000641,0.001941,5,0.003041,0.0004
taxPayables,0.00172043,0.000393,0.000304,5,0.002529,0.000912
