## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import sqlite3




In [2]:
connection = sqlite3.connect('../data/db.sqlite')

# Import DB into pandas dataframe
df = pd.read_sql_query("SELECT * FROM ratings", connection)

connection.close()

df.head()

Unnamed: 0,Rating Agency,Corporation,Rating,Rating Date,CIK,Binary Rating,SIC Code,Sector,Ticker,Current Ratio,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share
0,Standard & Poor's Ratings Services,American States Water Co.,A-,2010-07-30,1056903,1,4941.0,Utils,AWR,1.1507,...,28.9834,13.6093,8.3224,0.3173,8.1724,8.1978,2.6385,4.453,1.9957,-0.1333
1,Standard & Poor's Ratings Services,Automatic Data Processing Inc.,AAA,2010-09-16,8670,1,7374.0,BusEq,ADP,1.1129,...,23.9379,20.8699,13.569,0.3324,22.0354,47.2858,4.4944,21.8765,0.2501,0.3132
2,Standard & Poor's Ratings Services,Avnet Inc.,BBB-,2010-11-23,8858,1,5065.0,Shops,AVT,1.9276,...,3.6338,3.0536,2.1418,2.462,13.6376,16.7991,5.2731,9.6494,-7.6079,-7.3231
3,Standard & Poor's Ratings Services,California Water Service Co.,AA-,2010-06-29,1035201,1,4941.0,Utils,CWT,0.8358,...,27.9377,15.1135,9.0246,0.2946,9.6412,9.7015,2.6583,5.1018,1.7438,-0.8999
4,Standard & Poor's Ratings Services,Cardinal Health Inc.,A,2010-07-14,721371,1,5122.0,Shops,CAH,1.2931,...,1.5847,1.2304,0.6518,4.9276,11.1256,19.4184,2.9364,8.1844,1.9725,2.4174


In [3]:
# Drop columns that won't be needed for the model
# More columns may be dropped later
reduced_df = df.drop(columns=['Corporation', 'CIK', 'Ticker'])
reduced_df.head()

Unnamed: 0,Rating Agency,Rating,Rating Date,Binary Rating,SIC Code,Sector,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share
0,Standard & Poor's Ratings Services,A-,2010-07-30,1,4941.0,Utils,1.1507,0.4551,0.8847,77.623,...,28.9834,13.6093,8.3224,0.3173,8.1724,8.1978,2.6385,4.453,1.9957,-0.1333
1,Standard & Poor's Ratings Services,AAA,2010-09-16,1,7374.0,BusEq,1.1129,0.0072,0.0073,43.6619,...,23.9379,20.8699,13.569,0.3324,22.0354,47.2858,4.4944,21.8765,0.2501,0.3132
2,Standard & Poor's Ratings Services,BBB-,2010-11-23,1,5065.0,Shops,1.9276,0.2924,0.4255,11.9008,...,3.6338,3.0536,2.1418,2.462,13.6376,16.7991,5.2731,9.6494,-7.6079,-7.3231
3,Standard & Poor's Ratings Services,AA-,2010-06-29,1,4941.0,Utils,0.8358,0.4708,0.9491,64.5096,...,27.9377,15.1135,9.0246,0.2946,9.6412,9.7015,2.6583,5.1018,1.7438,-0.8999
4,Standard & Poor's Ratings Services,A,2010-07-14,1,5122.0,Shops,1.2931,0.2644,0.4036,3.8385,...,1.5847,1.2304,0.6518,4.9276,11.1256,19.4184,2.9364,8.1844,1.9725,2.4174


In [4]:
# Used code from https://stackoverflow.com/questions/14247586/how-to-select-rows-with-one-or-more-nulls-from-a-pandas-dataframe-without-listin
# Look for nulls
def nans(df): return df[df.isnull().any(axis=1)]
nans(reduced_df)


Unnamed: 0,Rating Agency,Rating,Rating Date,Binary Rating,SIC Code,Sector,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share


In [5]:
# Checking for NA's
def nans2(df): return df[df.isna().any(axis=1)]
nans2(reduced_df)

Unnamed: 0,Rating Agency,Rating,Rating Date,Binary Rating,SIC Code,Sector,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share


In [6]:
# Checking
reduced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7805 entries, 0 to 7804
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Rating Agency                  7805 non-null   object 
 1   Rating                         7805 non-null   object 
 2   Rating Date                    7805 non-null   object 
 3   Binary Rating                  7805 non-null   int64  
 4   SIC Code                       7805 non-null   float64
 5   Sector                         7805 non-null   object 
 6   Current Ratio                  7805 non-null   float64
 7   Long-term Debt / Capital       7805 non-null   float64
 8   Debt/Equity Ratio              7805 non-null   float64
 9   Gross Margin                   7805 non-null   float64
 10  Operating Margin               7805 non-null   float64
 11  EBIT Margin                    7805 non-null   float64
 12  EBITDA Margin                  7805 non-null   f

In [7]:
# Using Rating might be a form of over fitting but trying to see if there's a way to bin it
# What are the distribution of Ratings?
reduced_df['Rating'].value_counts()

Rating
BBB     910
BBB+    846
A       836
A-      722
BBB-    705
BB+     545
A+      478
BB      453
BB-     427
B+      422
B       310
B-      288
AA-     254
AA      185
CCC+    151
AAA      90
AA+      73
CCC      65
CC       18
C        11
CCC-     10
D         5
CC+       1
Name: count, dtype: int64

In [8]:
# Dictionary mapping ratings to numbers
rating_mapping = {'AAA': 1, 'AA+': 1, 'AA': 2, 'AA-': 2, 'A+': 3, 'A': 3, 'A-': 4, 'BBB+': 5, 'BBB': 6, 'BBB-': 7, 'BB+': 8, 'BB': 8, 'BB-': 9, 'B+': 8, 'B': 10, 'B-': 10, \
                  'C': 11, 'CCC': 11, 'CC': 11, 'CC+': 11, 'CCC-': 11, 'CCC+': 11, 'D': 12}

# Replace ratings with numbers in reduced_df
reduced_df['Rating'] = reduced_df['Rating'].replace(rating_mapping)

# Double Check
reduced_df['Rating'].value_counts()

Rating
8     1420
3     1314
6      910
5      846
4      722
7      705
10     598
2      439
9      427
11     256
1      163
12       5
Name: count, dtype: int64

In [9]:
reduced_df['Rating'].head()

0    4
1    1
2    7
3    2
4    3
Name: Rating, dtype: int64

In [10]:
# Convert categorical data to numeric with `pd.get_dummies`
reduced_numeric = pd.get_dummies(reduced_df)

In [11]:
reduced_numeric.head()

Unnamed: 0,Rating,Binary Rating,SIC Code,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,Operating Margin,EBIT Margin,EBITDA Margin,...,Sector_Durbl,Sector_Enrgy,Sector_Hlth,Sector_Manuf,Sector_Money,Sector_NoDur,Sector_Other,Sector_Shops,Sector_Telcm,Sector_Utils
0,4,1,4941.0,1.1507,0.4551,0.8847,77.623,19.4839,19.4839,28.9834,...,False,False,False,False,False,False,False,False,False,True
1,1,1,7374.0,1.1129,0.0072,0.0073,43.6619,19.8327,19.8327,23.9379,...,False,False,False,False,False,False,False,False,False,False
2,7,1,5065.0,1.9276,0.2924,0.4255,11.9008,3.3173,3.3173,3.6338,...,False,False,False,False,False,False,False,True,False,False
3,2,1,4941.0,0.8358,0.4708,0.9491,64.5096,18.4549,18.4549,27.9377,...,False,False,False,False,False,False,False,False,False,True
4,3,1,5122.0,1.2931,0.2644,0.4036,3.8385,1.3269,1.3269,1.5847,...,False,False,False,False,False,False,False,True,False,False
