# Autogluon Tabular

Use the Autogluon AutoML library to predict ratings using tabular data locally (mostly just financial statement variables).

In [3]:
# Packages
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
import os

## Load Data

In [4]:
# list of files in '../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'
file_list = [f for f in os.listdir(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates') if f.endswith('.parquet')]
# read in all parquet files
df = pd.concat([pd.read_parquet(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates/' + f) for f in file_list])
print('dataframe')
print(df)

dataframe
    ticker fixed_quarter_date earnings_call_date Rating  \
0     AAPL         2014-07-01         2014-04-23     AA   
1     AAPL         2014-10-01         2014-07-22     AA   
2     AAPL         2015-01-01         2014-10-20     AA   
3     AAPL         2015-04-01         2015-01-27     AA   
4     AAPL         2015-07-01         2015-04-27     AA   
..     ...                ...                ...    ...   
912    ZTS         2015-10-01         2015-08-04    BBB   
913    ZTS         2016-01-01         2015-11-03    BBB   
914    ZTS         2016-04-01         2016-02-16    BBB   
915    ZTS         2016-07-01         2016-05-04    BBB   
916    ZTS         2016-10-01         2016-08-03    BBB   

                     Rating Agency Name rating_date      CR_source  \
0    Standard & Poor's Ratings Services  2014-05-27  Supplementary   
1    Standard & Poor's Ratings Services  2014-05-27  Supplementary   
2    Standard & Poor's Ratings Services  2014-05-27  Supplementary   
3

In [5]:
# Print out column names
print('column names')
for col in df.columns:
    print(col)

column names
ticker
fixed_quarter_date
earnings_call_date
Rating
Rating Agency Name
rating_date
CR_source
Rating Rank AAA is 10
Next Rating
Next Rating Date
Previous Rating
Previous Rating Date
next_rating_date_or_end_of_data
credit_rating_year
previous_fixed_quarter_date
days_since_call_on_fixed_quarter
quarter
calls_year
transcript
Calls_source
date
symbol
reportedCurrency
cik
fillingDate
acceptedDate
calendarYear
period
cashAndCashEquivalents
shortTermInvestments
cashAndShortTermInvestments
netReceivables
inventory
otherCurrentAssets
totalCurrentAssets
propertyPlantEquipmentNet
goodwill
intangibleAssets
goodwillAndIntangibleAssets
longTermInvestments
taxAssets
otherNonCurrentAssets
totalNonCurrentAssets
otherAssets
totalAssets
accountPayables
shortTermDebt
taxPayables
deferredRevenue
otherCurrentLiabilities
totalCurrentLiabilities
longTermDebt
deferredRevenueNonCurrent
deferredTaxLiabilitiesNonCurrent
otherNonCurrentLiabilities
totalNonCurrentLiabilities
otherLiabilities
capitalLeas

In [6]:
# Removing columns: 'Rating Rank AAA is 10', 'transcript', 'Investment_Grade', 'Change Direction Since Last Fixed Quarter Date', 'Change Since Last Fixed Quarter Date', 'Next Rating', 'Next Rating Date', 'next_rating_date_or_end_of_data'
df = df.drop(columns=['Rating Rank AAA is 10', 
                      'transcript', 
                      'Investment_Grade', 
                      'Change Direction Since Last Fixed Quarter Date', 
                      'Change Since Last Fixed Quarter Date', 
                      'Next Rating', 
                      'Next Rating Date', 
                      'next_rating_date_or_end_of_data'])

## Deploy AutoGluon

In [7]:
# Split into train and test datasets
train_df = df[df['train_test_80_20'] == 'train'].reset_index(drop=True)
test_df = df[df['train_test_80_20'] == 'test'].reset_index(drop=True)

In [8]:
# Convert from pandas to autogluon
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

In [9]:
# Train and Test
predictor = TabularPredictor(label='Rating').fit(train_data=train_data)
predictions = predictor.predict(test_data)

No path specified. Models will be saved in: "AutogluonModels\ag-20240321_064453"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240321_064453"
AutoGluon Version:  1.0.0
Python Version:     3.11.8
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          8
Memory Avail:       7.38 GB / 15.68 GB (47.1%)
Disk Space Avail:   18.61

In [10]:
# Evaluation
predictor.evaluate(test_data, silent=True)

{'accuracy': 0.9343326195574589,
 'balanced_accuracy': 0.7448524291356307,
 'mcc': 0.915772066074118}

In [11]:
# Leaderboard of models
predictor.leaderboard(test_data)

  df = df.fillna(column_fills, inplace=False, downcast=False)


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.940757,0.927365,accuracy,0.06901,0.051668,325.680288,0.06901,0.051668,325.680288,1,True,8
1,RandomForestEntr,0.937188,0.925676,accuracy,0.152995,0.177409,11.334002,0.152995,0.177409,11.334002,1,True,7
2,RandomForestGini,0.936474,0.925676,accuracy,0.187886,0.108412,6.938527,0.187886,0.108412,6.938527,1,True,6
3,ExtraTreesGini,0.936474,0.923986,accuracy,0.187901,0.131354,3.045713,0.187901,0.131354,3.045713,1,True,9
4,ExtraTreesEntr,0.934333,0.927365,accuracy,0.181964,0.129039,2.834588,0.181964,0.129039,2.834588,1,True,10
5,WeightedEnsemble_L2,0.934333,0.9375,accuracy,0.577101,0.222644,32.820059,0.010105,0.0,0.615799,2,True,14
6,LightGBMXT,0.93005,0.929054,accuracy,0.379095,0.09129,29.158547,0.379095,0.09129,29.158547,1,True,4
7,NeuralNetFastAI,0.927909,0.927365,accuracy,0.17948,0.057967,30.040244,0.17948,0.057967,30.040244,1,True,3
8,LightGBM,0.927909,0.912162,accuracy,0.194926,0.048954,29.52114,0.194926,0.048954,29.52114,1,True,5
9,XGBoost,0.92434,0.912162,accuracy,0.178447,0.050456,44.95817,0.178447,0.050456,44.95817,1,True,11


In [12]:
# Feature importance
fi = predictor.feature_importance(test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['Rating Agency Name', 'Calls_source', 'symbol', 'reportedCurrency', 'calendarYear', 'period', 'totalLiabilitiesAndTotalEquity', 'cik_cash_flow_statement', 'operatingCashFlow', 'cik_income_statement', 'Company Name', 'train_test_80_20']
Computing feature importance via permutation shuffling for 146 features using 1395 rows with 5 shuffle sets...
	836.82s	= Expected runtime (167.36s per shuffle set)
	593.79s	= Actual runtime (Completed 5 of 5 shuffle sets)


In [14]:
# Print entire df
pd.set_option('display.max_rows', None)
fi

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
rating_on_previous_fixed_quarter_date AAA is 10,0.02207885,0.005204,0.000344,5,0.032794,0.011364
Description,0.01204301,0.002174,0.000122,5,0.01652,0.007566
rating_on_previous_fixed_quarter_date,0.009318996,0.003324,0.001652,5,0.016163,0.002475
rating_date,0.006451613,0.001963,0.000913,5,0.010494,0.002409
Comment,0.005878136,0.001711,0.000773,5,0.009402,0.002354
Previous Rating Date,0.005017921,0.002912,0.009125,5,0.011013,-0.000978
Previous Rating,0.003584229,0.000717,0.000182,5,0.00506,0.002108
interestExpense,0.001863799,0.000393,0.000223,5,0.002672,0.001055
interestIncome,0.00172043,0.000641,0.001941,5,0.003041,0.0004
taxPayables,0.00172043,0.000393,0.000304,5,0.002529,0.000912
