## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import sqlite3




In [2]:
connection = sqlite3.connect('../data/db.sqlite')

# Import DB into pandas dataframe
df = pd.read_sql_query("SELECT * FROM pca", connection)

connection.close()

df.head()

Unnamed: 0,Corporation,PC1,Binary Rating,PC2,Cluster
0,American States Water Co.,319741.641714,1,801.328232,0
1,Automatic Data Processing Inc.,-728490.657302,1,3519.620476,1
2,Avnet Inc.,-728303.285091,1,1210.233135,1
3,California Water Service Co.,298039.642473,1,807.233946,0
4,Cardinal Health Inc.,-15790.29668,1,1073.498944,0


In [3]:
df.columns

Index(['Corporation', 'PC1', 'Binary Rating', 'PC2', 'Cluster'], dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7805 entries, 0 to 7804
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Corporation    7805 non-null   object 
 1   PC1            7805 non-null   float64
 2   Binary Rating  7805 non-null   int64  
 3   PC2            7805 non-null   float64
 4   Cluster        7805 non-null   int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 305.0+ KB


In [5]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
reduced_df = df.drop(columns=['Corporation'])
reduced_df.head()

Unnamed: 0,PC1,Binary Rating,PC2,Cluster
0,319741.641714,1,801.328232,0
1,-728490.657302,1,3519.620476,1
2,-728303.285091,1,1210.233135,1
3,298039.642473,1,807.233946,0
4,-15790.29668,1,1073.498944,0


In [6]:
# Determine the number of unique values in each column.
reduced_df.nunique()


PC1              4552
Binary Rating       2
PC2              4552
Cluster             3
dtype: int64

In [7]:
reduced_df.columns

Index(['PC1', 'Binary Rating', 'PC2', 'Cluster'], dtype='object')

In [8]:
# Used code from https://stackoverflow.com/questions/14247586/how-to-select-rows-with-one-or-more-nulls-from-a-pandas-dataframe-without-listin
# Look for nulls
def nans(df): return df[df.isnull().any(axis=1)]
nans(reduced_df)


Unnamed: 0,PC1,Binary Rating,PC2,Cluster


In [9]:
# Checking for NA's
def nans2(df): return df[df.isna().any(axis=1)]
nans2(reduced_df)

Unnamed: 0,PC1,Binary Rating,PC2,Cluster


In [10]:
# # Look at APPLICATION_TYPE value counts for binning
# application_counts = reduced_df['APPLICATION_TYPE'].value_counts()
# application_counts

In [11]:
# # Choose a cutoff value and create a list of application types to be replaced
# # use the variable name `application_types_to_replace`
# application_types_to_replace = list(application_counts[application_counts < 528].index)

# # Replace in dataframe
# for app in application_types_to_replace:
#     reduced_df['APPLICATION_TYPE'] = reduced_df['APPLICATION_TYPE'].replace(app,"Other")

# # Check to make sure binning was successful
# reduced_df['APPLICATION_TYPE'].value_counts()

In [12]:
# # Look at CLASSIFICATION value counts for binning
# classification_counts = reduced_df['CLASSIFICATION'].value_counts()
# classification_counts

In [13]:
# # You may find it helpful to look at CLASSIFICATION value counts >1
# class_counts_over1 = classification_counts.loc[classification_counts > 1]
# class_counts_over1

In [14]:
# # Choose a cutoff value and create a list of classifications to be replaced
# # use the variable name `classifications_to_replace`
# classifications_to_replace = list(classification_counts[classification_counts < 1000].index)

# # Replace in dataframe
# for cls in classifications_to_replace:
#     reduced_df['CLASSIFICATION'] = reduced_df['CLASSIFICATION'].replace(cls,"Other")

# # Check to make sure binning was successful
# reduced_df['CLASSIFICATION'].value_counts()

In [15]:
# Convert categorical data to numeric with `pd.get_dummies`
reduced_numeric = pd.get_dummies(reduced_df)

In [16]:
reduced_numeric.head()

Unnamed: 0,PC1,Binary Rating,PC2,Cluster
0,319741.641714,1,801.328232,0
1,-728490.657302,1,3519.620476,1
2,-728303.285091,1,1210.233135,1
3,298039.642473,1,807.233946,0
4,-15790.29668,1,1073.498944,0


In [17]:
reduced_df.head()

Unnamed: 0,PC1,Binary Rating,PC2,Cluster
0,319741.641714,1,801.328232,0
1,-728490.657302,1,3519.620476,1
2,-728303.285091,1,1210.233135,1
3,298039.642473,1,807.233946,0
4,-15790.29668,1,1073.498944,0


In [18]:
# Split our preprocessed data into our features and target arrays
X = reduced_numeric.drop(['Binary Rating'], axis=1)
y = reduced_numeric['Binary Rating']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [19]:
reduced_numeric.head()

Unnamed: 0,PC1,Binary Rating,PC2,Cluster
0,319741.641714,1,801.328232,0
1,-728490.657302,1,3519.620476,1
2,-728303.285091,1,1210.233135,1
3,298039.642473,1,807.233946,0
4,-15790.29668,1,1073.498944,0


In [20]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Compile, Train and Evaluate the Model

In [21]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
# hidden_nodes_layer1 = 3
# hidden_nodes_layer2 = 3
# hidden_nodes_layer3 = 3
# hidden_nodes_layer4 = 2

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=3,
             input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(
    units=3, activation="relu"))

# # Third hidden layer
# nn.add(tf.keras.layers.Dense(
#     units=3, activation="relu"))

# nn.add(tf.keras.layers.Dense(
#     units=hidden_nodes_layer4, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 3)                 12        
                                                                 
 dense_1 (Dense)             (None, 3)                 12        
                                                                 
 dense_2 (Dense)             (None, 1)                 4         
                                                                 
Total params: 28
Trainable params: 28
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
# Train the model
# Used code from https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint
# and from https://stackoverflow.com/questions/44886509/keras-save-checkpoints

EPOCHS = 75
checkpoint_filepath = '../models/model6/checkpoints/weights.epoch_{epoch:02d}.hdf5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_freq=5,
    verbose=True)

# Model weights are saved at the end of every epoch, if it's the best seen
# so far.
fit_model = nn.fit(X_train_scaled, y_train, epochs=EPOCHS, callbacks=[model_checkpoint_callback])



Epoch 1/75


2024-03-16 16:12:30.826562: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


  1/183 [..............................] - ETA: 1:03 - loss: 0.7326 - accuracy: 0.5625
Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5
 18/183 [=>............................] - ETA: 0s - loss: 0.7745 - accuracy: 0.3264  
Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/weights.epoch_01.hdf5

Epoch 1: saving model to ../models/model6/checkpoints/wei

In [24]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

61/61 - 0s - loss: 0.5882 - accuracy: 0.6993 - 95ms/epoch - 2ms/step
Loss: 0.5881734490394592, Accuracy: 0.6992827653884888


In [25]:
# Export our model to HDF5 file
nn.save('../models/model6/model.h5')
