## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import sqlite3




In [2]:
connection = sqlite3.connect('../data/db.sqlite')

# Import DB into pandas dataframe
df = pd.read_sql_query("SELECT * FROM ratings", connection)

connection.close()

df.head()

Unnamed: 0,Rating Agency,Corporation,Rating,Rating Date,CIK,Binary Rating,SIC Code,Sector,Ticker,Current Ratio,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share
0,Standard & Poor's Ratings Services,American States Water Co.,A-,2010-07-30,1056903,1,4941.0,Utils,AWR,1.1507,...,28.9834,13.6093,8.3224,0.3173,8.1724,8.1978,2.6385,4.453,1.9957,-0.1333
1,Standard & Poor's Ratings Services,Automatic Data Processing Inc.,AAA,2010-09-16,8670,1,7374.0,BusEq,ADP,1.1129,...,23.9379,20.8699,13.569,0.3324,22.0354,47.2858,4.4944,21.8765,0.2501,0.3132
2,Standard & Poor's Ratings Services,Avnet Inc.,BBB-,2010-11-23,8858,1,5065.0,Shops,AVT,1.9276,...,3.6338,3.0536,2.1418,2.462,13.6376,16.7991,5.2731,9.6494,-7.6079,-7.3231
3,Standard & Poor's Ratings Services,California Water Service Co.,AA-,2010-06-29,1035201,1,4941.0,Utils,CWT,0.8358,...,27.9377,15.1135,9.0246,0.2946,9.6412,9.7015,2.6583,5.1018,1.7438,-0.8999
4,Standard & Poor's Ratings Services,Cardinal Health Inc.,A,2010-07-14,721371,1,5122.0,Shops,CAH,1.2931,...,1.5847,1.2304,0.6518,4.9276,11.1256,19.4184,2.9364,8.1844,1.9725,2.4174


In [3]:
df.columns

Index(['Rating Agency', 'Corporation', 'Rating', 'Rating Date', 'CIK',
       'Binary Rating', 'SIC Code', 'Sector', 'Ticker', 'Current Ratio',
       'Long-term Debt / Capital', 'Debt/Equity Ratio', 'Gross Margin',
       'Operating Margin', 'EBIT Margin', 'EBITDA Margin',
       'Pre-Tax Profit Margin', 'Net Profit Margin', 'Asset Turnover',
       'ROE - Return On Equity', 'Return On Tangible Equity',
       'ROA - Return On Assets', 'ROI - Return On Investment',
       'Operating Cash Flow Per Share', 'Free Cash Flow Per Share'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7805 entries, 0 to 7804
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Rating Agency                  7805 non-null   object 
 1   Corporation                    7805 non-null   object 
 2   Rating                         7805 non-null   object 
 3   Rating Date                    7805 non-null   object 
 4   CIK                            7805 non-null   int64  
 5   Binary Rating                  7805 non-null   int64  
 6   SIC Code                       7805 non-null   float64
 7   Sector                         7805 non-null   object 
 8   Ticker                         7805 non-null   object 
 9   Current Ratio                  7805 non-null   float64
 10  Long-term Debt / Capital       7805 non-null   float64
 11  Debt/Equity Ratio              7805 non-null   float64
 12  Gross Margin                   7805 non-null   f

In [5]:
df['Rating'].value_counts()

Rating
BBB     910
BBB+    846
A       836
A-      722
BBB-    705
BB+     545
A+      478
BB      453
BB-     427
B+      422
B       310
B-      288
AA-     254
AA      185
CCC+    151
AAA      90
AA+      73
CCC      65
CC       18
C        11
CCC-     10
D         5
CC+       1
Name: count, dtype: int64

In [6]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
reduced_df = df.drop(columns=['Corporation', 'Rating', 'CIK', 'Ticker'])
reduced_df.head()

Unnamed: 0,Rating Agency,Rating Date,Binary Rating,SIC Code,Sector,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,Operating Margin,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share
0,Standard & Poor's Ratings Services,2010-07-30,1,4941.0,Utils,1.1507,0.4551,0.8847,77.623,19.4839,...,28.9834,13.6093,8.3224,0.3173,8.1724,8.1978,2.6385,4.453,1.9957,-0.1333
1,Standard & Poor's Ratings Services,2010-09-16,1,7374.0,BusEq,1.1129,0.0072,0.0073,43.6619,19.8327,...,23.9379,20.8699,13.569,0.3324,22.0354,47.2858,4.4944,21.8765,0.2501,0.3132
2,Standard & Poor's Ratings Services,2010-11-23,1,5065.0,Shops,1.9276,0.2924,0.4255,11.9008,3.3173,...,3.6338,3.0536,2.1418,2.462,13.6376,16.7991,5.2731,9.6494,-7.6079,-7.3231
3,Standard & Poor's Ratings Services,2010-06-29,1,4941.0,Utils,0.8358,0.4708,0.9491,64.5096,18.4549,...,27.9377,15.1135,9.0246,0.2946,9.6412,9.7015,2.6583,5.1018,1.7438,-0.8999
4,Standard & Poor's Ratings Services,2010-07-14,1,5122.0,Shops,1.2931,0.2644,0.4036,3.8385,1.3269,...,1.5847,1.2304,0.6518,4.9276,11.1256,19.4184,2.9364,8.1844,1.9725,2.4174


In [7]:
# Determine the number of unique values in each column.
reduced_df.nunique()


Rating Agency                       7
Rating Date                      1414
Binary Rating                       2
SIC Code                          240
Sector                             12
Current Ratio                    2521
Long-term Debt / Capital         2241
Debt/Equity Ratio                2484
Gross Margin                     2601
Operating Margin                 2648
EBIT Margin                      2648
EBITDA Margin                    2649
Pre-Tax Profit Margin            2649
Net Profit Margin                2642
Asset Turnover                   2424
ROE - Return On Equity           2651
Return On Tangible Equity        2648
ROA - Return On Assets           2632
ROI - Return On Investment       2641
Operating Cash Flow Per Share    2590
Free Cash Flow Per Share         2585
dtype: int64

In [8]:
reduced_df.columns

Index(['Rating Agency', 'Rating Date', 'Binary Rating', 'SIC Code', 'Sector',
       'Current Ratio', 'Long-term Debt / Capital', 'Debt/Equity Ratio',
       'Gross Margin', 'Operating Margin', 'EBIT Margin', 'EBITDA Margin',
       'Pre-Tax Profit Margin', 'Net Profit Margin', 'Asset Turnover',
       'ROE - Return On Equity', 'Return On Tangible Equity',
       'ROA - Return On Assets', 'ROI - Return On Investment',
       'Operating Cash Flow Per Share', 'Free Cash Flow Per Share'],
      dtype='object')

In [9]:
# Used code from https://stackoverflow.com/questions/14247586/how-to-select-rows-with-one-or-more-nulls-from-a-pandas-dataframe-without-listin
# Look for nulls
def nans(df): return df[df.isnull().any(axis=1)]
nans(reduced_df)


Unnamed: 0,Rating Agency,Rating Date,Binary Rating,SIC Code,Sector,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,Operating Margin,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share


In [10]:
# Checking for NA's
def nans2(df): return df[df.isna().any(axis=1)]
nans2(reduced_df)

Unnamed: 0,Rating Agency,Rating Date,Binary Rating,SIC Code,Sector,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,Operating Margin,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share


In [11]:
# # Look at APPLICATION_TYPE value counts for binning
# application_counts = reduced_df['APPLICATION_TYPE'].value_counts()
# application_counts

In [12]:
# # Choose a cutoff value and create a list of application types to be replaced
# # use the variable name `application_types_to_replace`
# application_types_to_replace = list(application_counts[application_counts < 528].index)

# # Replace in dataframe
# for app in application_types_to_replace:
#     reduced_df['APPLICATION_TYPE'] = reduced_df['APPLICATION_TYPE'].replace(app,"Other")

# # Check to make sure binning was successful
# reduced_df['APPLICATION_TYPE'].value_counts()

In [13]:
# # Look at CLASSIFICATION value counts for binning
# classification_counts = reduced_df['CLASSIFICATION'].value_counts()
# classification_counts

In [14]:
# # You may find it helpful to look at CLASSIFICATION value counts >1
# class_counts_over1 = classification_counts.loc[classification_counts > 1]
# class_counts_over1

In [15]:
# # Choose a cutoff value and create a list of classifications to be replaced
# # use the variable name `classifications_to_replace`
# classifications_to_replace = list(classification_counts[classification_counts < 1000].index)

# # Replace in dataframe
# for cls in classifications_to_replace:
#     reduced_df['CLASSIFICATION'] = reduced_df['CLASSIFICATION'].replace(cls,"Other")

# # Check to make sure binning was successful
# reduced_df['CLASSIFICATION'].value_counts()

In [16]:
# Convert categorical data to numeric with `pd.get_dummies`
reduced_numeric = pd.get_dummies(reduced_df)

In [17]:
reduced_numeric.head()

Unnamed: 0,Binary Rating,SIC Code,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,Operating Margin,EBIT Margin,EBITDA Margin,Pre-Tax Profit Margin,...,Sector_Durbl,Sector_Enrgy,Sector_Hlth,Sector_Manuf,Sector_Money,Sector_NoDur,Sector_Other,Sector_Shops,Sector_Telcm,Sector_Utils
0,1,4941.0,1.1507,0.4551,0.8847,77.623,19.4839,19.4839,28.9834,13.6093,...,False,False,False,False,False,False,False,False,False,True
1,1,7374.0,1.1129,0.0072,0.0073,43.6619,19.8327,19.8327,23.9379,20.8699,...,False,False,False,False,False,False,False,False,False,False
2,1,5065.0,1.9276,0.2924,0.4255,11.9008,3.3173,3.3173,3.6338,3.0536,...,False,False,False,False,False,False,False,True,False,False
3,1,4941.0,0.8358,0.4708,0.9491,64.5096,18.4549,18.4549,27.9377,15.1135,...,False,False,False,False,False,False,False,False,False,True
4,1,5122.0,1.2931,0.2644,0.4036,3.8385,1.3269,1.3269,1.5847,1.2304,...,False,False,False,False,False,False,False,True,False,False


In [18]:
reduced_df.head()

Unnamed: 0,Rating Agency,Rating Date,Binary Rating,SIC Code,Sector,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,Operating Margin,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share
0,Standard & Poor's Ratings Services,2010-07-30,1,4941.0,Utils,1.1507,0.4551,0.8847,77.623,19.4839,...,28.9834,13.6093,8.3224,0.3173,8.1724,8.1978,2.6385,4.453,1.9957,-0.1333
1,Standard & Poor's Ratings Services,2010-09-16,1,7374.0,BusEq,1.1129,0.0072,0.0073,43.6619,19.8327,...,23.9379,20.8699,13.569,0.3324,22.0354,47.2858,4.4944,21.8765,0.2501,0.3132
2,Standard & Poor's Ratings Services,2010-11-23,1,5065.0,Shops,1.9276,0.2924,0.4255,11.9008,3.3173,...,3.6338,3.0536,2.1418,2.462,13.6376,16.7991,5.2731,9.6494,-7.6079,-7.3231
3,Standard & Poor's Ratings Services,2010-06-29,1,4941.0,Utils,0.8358,0.4708,0.9491,64.5096,18.4549,...,27.9377,15.1135,9.0246,0.2946,9.6412,9.7015,2.6583,5.1018,1.7438,-0.8999
4,Standard & Poor's Ratings Services,2010-07-14,1,5122.0,Shops,1.2931,0.2644,0.4036,3.8385,1.3269,...,1.5847,1.2304,0.6518,4.9276,11.1256,19.4184,2.9364,8.1844,1.9725,2.4174


In [19]:
# Split our preprocessed data into our features and target arrays
X = reduced_numeric.drop(['Binary Rating'], axis=1)
y = reduced_numeric['Binary Rating']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
reduced_numeric.head()

Unnamed: 0,Binary Rating,SIC Code,Current Ratio,Long-term Debt / Capital,Debt/Equity Ratio,Gross Margin,Operating Margin,EBIT Margin,EBITDA Margin,Pre-Tax Profit Margin,...,Sector_Durbl,Sector_Enrgy,Sector_Hlth,Sector_Manuf,Sector_Money,Sector_NoDur,Sector_Other,Sector_Shops,Sector_Telcm,Sector_Utils
0,1,4941.0,1.1507,0.4551,0.8847,77.623,19.4839,19.4839,28.9834,13.6093,...,False,False,False,False,False,False,False,False,False,True
1,1,7374.0,1.1129,0.0072,0.0073,43.6619,19.8327,19.8327,23.9379,20.8699,...,False,False,False,False,False,False,False,False,False,False
2,1,5065.0,1.9276,0.2924,0.4255,11.9008,3.3173,3.3173,3.6338,3.0536,...,False,False,False,False,False,False,False,True,False,False
3,1,4941.0,0.8358,0.4708,0.9491,64.5096,18.4549,18.4549,27.9377,15.1135,...,False,False,False,False,False,False,False,False,False,True
4,1,5122.0,1.2931,0.2644,0.4036,3.8385,1.3269,1.3269,1.5847,1.2304,...,False,False,False,False,False,False,False,True,False,False


In [21]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Compile, Train and Evaluate the Model

In [22]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 3
hidden_nodes_layer2 = 3
hidden_nodes_layer3 = 3
hidden_nodes_layer4 = 2

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,
             input_dim=number_input_features, activation="sigmoid"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(
    units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(
    units=hidden_nodes_layer3, activation="sigmoid"))

# nn.add(tf.keras.layers.Dense(
#     units=hidden_nodes_layer4, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 3)                 4353      
                                                                 
 dense_1 (Dense)             (None, 3)                 12        
                                                                 
 dense_2 (Dense)             (None, 3)                 12        
                                                                 
 dense_3 (Dense)             (None, 1)                 4         
                                                                 
Total params: 4,381
Trainable params: 4,381
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
# Train the model
# Used code from https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint
# and from https://stackoverflow.com/questions/44886509/keras-save-checkpoints

EPOCHS = 75
checkpoint_filepath = '../models/model3/checkpoints/weights.epoch_{epoch:02d}.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_freq=5,
    verbose=True)

# Model weights are saved at the end of every epoch, if it's the best seen
# so far.
fit_model = nn.fit(X_train_scaled, y_train, epochs=EPOCHS, callbacks=[model_checkpoint_callback])



Epoch 1/75


2024-03-16 10:35:44.709185: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


  1/183 [..............................] - ETA: 1:02 - loss: 0.8661 - accuracy: 0.2812
Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5
 20/183 [==>...........................] - ETA: 0s - loss: 0.8199 - accuracy: 0.3438  
Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5
Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model3/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model3/checkpoints/weig

In [25]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

61/61 - 0s - loss: 0.7262 - accuracy: 0.7182 - 175ms/epoch - 3ms/step
Loss: 0.7262084484100342, Accuracy: 0.7182376980781555


In [26]:
# Export our model to HDF5 file
nn.save('../models/model3/model.h5')
