## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import sqlite3




In [2]:
connection = sqlite3.connect('../data/db.sqlite')

# Import DB into pandas dataframe
df = pd.read_sql_query("SELECT * FROM ratings", connection)

connection.close()

df.head()

Unnamed: 0,Rating Agency,Corporation,Rating,Rating Date,CIK,Binary Rating,SIC Code,Sector,Ticker,Current Ratio,...,EBITDA Margin,Pre-Tax Profit Margin,Net Profit Margin,Asset Turnover,ROE - Return On Equity,Return On Tangible Equity,ROA - Return On Assets,ROI - Return On Investment,Operating Cash Flow Per Share,Free Cash Flow Per Share
0,Standard & Poor's Ratings Services,American States Water Co.,A-,2010-07-30,1056903,1,4941.0,Utils,AWR,1.1507,...,28.9834,13.6093,8.3224,0.3173,8.1724,8.1978,2.6385,4.453,1.9957,-0.1333
1,Standard & Poor's Ratings Services,Automatic Data Processing Inc.,AAA,2010-09-16,8670,1,7374.0,BusEq,ADP,1.1129,...,23.9379,20.8699,13.569,0.3324,22.0354,47.2858,4.4944,21.8765,0.2501,0.3132
2,Standard & Poor's Ratings Services,Avnet Inc.,BBB-,2010-11-23,8858,1,5065.0,Shops,AVT,1.9276,...,3.6338,3.0536,2.1418,2.462,13.6376,16.7991,5.2731,9.6494,-7.6079,-7.3231
3,Standard & Poor's Ratings Services,California Water Service Co.,AA-,2010-06-29,1035201,1,4941.0,Utils,CWT,0.8358,...,27.9377,15.1135,9.0246,0.2946,9.6412,9.7015,2.6583,5.1018,1.7438,-0.8999
4,Standard & Poor's Ratings Services,Cardinal Health Inc.,A,2010-07-14,721371,1,5122.0,Shops,CAH,1.2931,...,1.5847,1.2304,0.6518,4.9276,11.1256,19.4184,2.9364,8.1844,1.9725,2.4174


In [3]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
reduced_df = df.drop(columns=['Corporation', 'CIK', 'Ticker', 'Rating Agency', 'Rating Date', 'SIC Code',
       'Sector', 'Current Ratio', 'Long-term Debt / Capital',
        'Gross Margin', 'Operating Margin', 'EBIT Margin',
       'EBITDA Margin', 'Pre-Tax Profit Margin', 
       'Asset Turnover', 'Return On Tangible Equity',
       'ROA - Return On Assets', 
       'Operating Cash Flow Per Share', 'Free Cash Flow Per Share'])
reduced_df.head()

Unnamed: 0,Rating,Binary Rating,Debt/Equity Ratio,Net Profit Margin,ROE - Return On Equity,ROI - Return On Investment
0,A-,1,0.8847,8.3224,8.1724,4.453
1,AAA,1,0.0073,13.569,22.0354,21.8765
2,BBB-,1,0.4255,2.1418,13.6376,9.6494
3,AA-,1,0.9491,9.0246,9.6412,5.1018
4,A,1,0.4036,0.6518,11.1256,8.1844


In [4]:
reduced_df['ROE - Return On Equity'].describe()


count     7805.000000
mean        15.950507
std        230.592526
min     -11258.210000
25%          6.022800
50%         12.500000
75%         20.515700
max       7038.461000
Name: ROE - Return On Equity, dtype: float64

In [25]:
reduced_df['Rating'].value_counts()

Rating
BBB     910
BBB+    846
A       836
A-      722
BBB-    705
BB+     545
A+      478
BB      453
BB-     427
B+      422
B       310
B-      288
AA-     254
AA      185
CCC+    151
AAA      90
AA+      73
CCC      65
CC       18
C        11
CCC-     10
D         5
CC+       1
Name: count, dtype: int64

In [5]:
# Determine the number of unique values in each column.
reduced_df.nunique()


Rating                          23
Binary Rating                    2
Debt/Equity Ratio             2484
Net Profit Margin             2642
ROE - Return On Equity        2651
ROI - Return On Investment    2641
dtype: int64

In [6]:
reduced_df.columns

Index(['Rating', 'Binary Rating', 'Debt/Equity Ratio', 'Net Profit Margin',
       'ROE - Return On Equity', 'ROI - Return On Investment'],
      dtype='object')

In [7]:
# Used code from https://stackoverflow.com/questions/14247586/how-to-select-rows-with-one-or-more-nulls-from-a-pandas-dataframe-without-listin
# Look for nulls
def nans(df): return df[df.isnull().any(axis=1)]
nans(reduced_df)


Unnamed: 0,Rating,Binary Rating,Debt/Equity Ratio,Net Profit Margin,ROE - Return On Equity,ROI - Return On Investment


In [8]:
# Checking for NA's
def nans2(df): return df[df.isna().any(axis=1)]
nans2(reduced_df)

Unnamed: 0,Rating,Binary Rating,Debt/Equity Ratio,Net Profit Margin,ROE - Return On Equity,ROI - Return On Investment


In [9]:
# # Look at APPLICATION_TYPE value counts for binning
# application_counts = reduced_df['APPLICATION_TYPE'].value_counts()
# application_counts

In [10]:
# # Choose a cutoff value and create a list of application types to be replaced
# # use the variable name `application_types_to_replace`
# application_types_to_replace = list(application_counts[application_counts < 528].index)

# # Replace in dataframe
# for app in application_types_to_replace:
#     reduced_df['APPLICATION_TYPE'] = reduced_df['APPLICATION_TYPE'].replace(app,"Other")

# # Check to make sure binning was successful
# reduced_df['APPLICATION_TYPE'].value_counts()

In [11]:
# # Look at CLASSIFICATION value counts for binning
# classification_counts = reduced_df['CLASSIFICATION'].value_counts()
# classification_counts

In [12]:
# # You may find it helpful to look at CLASSIFICATION value counts >1
# class_counts_over1 = classification_counts.loc[classification_counts > 1]
# class_counts_over1

In [13]:
# # Choose a cutoff value and create a list of classifications to be replaced
# # use the variable name `classifications_to_replace`
# classifications_to_replace = list(classification_counts[classification_counts < 1000].index)

# # Replace in dataframe
# for cls in classifications_to_replace:
#     reduced_df['CLASSIFICATION'] = reduced_df['CLASSIFICATION'].replace(cls,"Other")

# # Check to make sure binning was successful
# reduced_df['CLASSIFICATION'].value_counts()

In [14]:
# Convert categorical data to numeric with `pd.get_dummies`
reduced_numeric = pd.get_dummies(reduced_df)

In [15]:
reduced_numeric.head()

Unnamed: 0,Binary Rating,Debt/Equity Ratio,Net Profit Margin,ROE - Return On Equity,ROI - Return On Investment,Rating_A,Rating_A+,Rating_A-,Rating_AA,Rating_AA+,...,Rating_BBB,Rating_BBB+,Rating_BBB-,Rating_C,Rating_CC,Rating_CC+,Rating_CCC,Rating_CCC+,Rating_CCC-,Rating_D
0,1,0.8847,8.3224,8.1724,4.453,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,0.0073,13.569,22.0354,21.8765,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,0.4255,2.1418,13.6376,9.6494,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,1,0.9491,9.0246,9.6412,5.1018,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,0.4036,0.6518,11.1256,8.1844,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
reduced_df.head()

Unnamed: 0,Rating,Binary Rating,Debt/Equity Ratio,Net Profit Margin,ROE - Return On Equity,ROI - Return On Investment
0,A-,1,0.8847,8.3224,8.1724,4.453
1,AAA,1,0.0073,13.569,22.0354,21.8765
2,BBB-,1,0.4255,2.1418,13.6376,9.6494
3,AA-,1,0.9491,9.0246,9.6412,5.1018
4,A,1,0.4036,0.6518,11.1256,8.1844


In [17]:
# Split our preprocessed data into our features and target arrays
X = reduced_numeric.drop(['Binary Rating'], axis=1)
y = reduced_numeric['Binary Rating']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [24]:
reduced_numeric.head()

Unnamed: 0,Binary Rating,Debt/Equity Ratio,Net Profit Margin,ROE - Return On Equity,ROI - Return On Investment,Rating_A,Rating_A+,Rating_A-,Rating_AA,Rating_AA+,...,Rating_BBB,Rating_BBB+,Rating_BBB-,Rating_C,Rating_CC,Rating_CC+,Rating_CCC,Rating_CCC+,Rating_CCC-,Rating_D
0,1,0.8847,8.3224,8.1724,4.453,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,0.0073,13.569,22.0354,21.8765,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,0.4255,2.1418,13.6376,9.6494,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,1,0.9491,9.0246,9.6412,5.1018,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,0.4036,0.6518,11.1256,8.1844,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Compile, Train and Evaluate the Model

In [19]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 4
hidden_nodes_layer2 = 3
hidden_nodes_layer3 = 2

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,
             input_dim=number_input_features, activation="relu"))

# # Second hidden layer
# nn.add(tf.keras.layers.Dense(
#     units=hidden_nodes_layer2, activation="relu"))

# # Third hidden layer
# nn.add(tf.keras.layers.Dense(
#     units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 112       
                                                                 
 dense_1 (Dense)             (None, 1)                 5         
                                                                 
Total params: 117
Trainable params: 117
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train the model
# Used code from https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint
# and from https://stackoverflow.com/questions/44886509/keras-save-checkpoints

EPOCHS = 20
checkpoint_filepath = '../models/model2/checkpoints/weights.epoch_{epoch:02d}.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_freq=5,
    verbose=True)

# Model weights are saved at the end of every epoch, if it's the best seen
# so far.
fit_model = nn.fit(X_train_scaled, y_train, epochs=EPOCHS, callbacks=[model_checkpoint_callback])



Epoch 1/20


2024-03-14 21:08:37.653479: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


  1/183 [..............................] - ETA: 56s - loss: 0.5841 - accuracy: 0.6875
Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5
 37/183 [=====>........................] - ETA: 0s - loss: 0.6866 - accuracy: 0.6436 
Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model2/checkpoints/weigh

In [22]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

61/61 - 0s - loss: 0.0192 - accuracy: 0.9964 - 89ms/epoch - 1ms/step
Loss: 0.019173936918377876, Accuracy: 0.9964139461517334


In [23]:
# Export our model to HDF5 file
nn.save('../models/model2/model.h5')
