<a href="https://colab.research.google.com/github/emilybguo/CS229/blob/main/NNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook holds code for data preprocessing and the various versions of NN models explored.

# Data Preprocessing

In this file, we preprocess the data to prepare it for testing with machine learning algorithms. This includes controlling for variables, removing those we do not want to consider, and splitting categorical data into one-hot vectors so that they can be included in the algorithms.

In [None]:
from google.colab import files
import numpy as np
import pandas as pd 
import sklearn 

# For standardizing dataset
from sklearn import preprocessing

# library for multiclass model metrics
import disarray

import matplotlib.pyplot as plt

 # For splitting of data into train and test set
from sklearn.model_selection import train_test_split
 
# For metrics and confusion matrix
from sklearn.metrics  import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics

# library for neural network feature importance
import shap


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.utils import to_categorical


In [None]:
# Attach Google Drive to the Colab filesystem at /content/gdrive so that
# files stored there can be opened with ordinary paths.
from google.colab import drive
drive.mount(mountpoint='/content/gdrive')

In [None]:
# Read the cleaned CSV from the mounted drive and discard the `loan_amount`
# column in a single chained expression, then preview the first rows.
df = pd.read_csv(
    '/content/gdrive/MyDrive/2022-2023/Fall/CS229/CS229 project/cleaned_data_12_09.csv'
).drop(columns=['loan_amount'])
df.head()

In [None]:
# Target is the `action_taken` column; the features are every other column.
y = df['action_taken']
x = df.drop(columns=['action_taken']).copy()

# Stage 1: keep 80% of the rows for training; the other 20% is set aside.
x_train, x_rem, y_train, y_rem = train_test_split(
    x, y,
    train_size=0.8,
    shuffle=True,
    random_state=0,
)

# Stage 2: cut the set-aside rows in half, giving validation and test sets
# that are each 10% of the original data.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_rem, y_rem,
    test_size=0.5,
    shuffle=True,
    random_state=0,
)

In [None]:
# Class-frequency check for each split, to see how balanced the labels are.
# `value_counts` replaces the hand-written counting loops; wrapping in
# `pd.Series` lets this cell work whether the labels are still a Series or
# have already been converted to a numpy array.
# NOTE: the previous loops used `x` as the loop variable, which overwrote the
# feature DataFrame `x`; no loop variable is needed here.
d = pd.Series(y_train).value_counts().to_dict()
print(f"The list frequency of elements in y_train is : {d}" )

e = pd.Series(y_valid).value_counts().to_dict()
print(f"The list frequency of elements in y_valid is : {e}" )

f = pd.Series(y_test).value_counts().to_dict()
# Fixed message: this line reports the test split, not the validation split.
print(f"The list frequency of elements in y_test is : {f}" )

In [None]:
# Fit the standardizer on the training features only, then apply that same
# fitted transform to all three splits (validation/test never influence the
# fitted statistics).
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)

x_train, x_valid, x_test = (
    scaler.transform(split) for split in (x_train, x_valid, x_test)
)

In [None]:
# Convert the label containers to plain numpy arrays so they can be indexed
# positionally. `np.asarray` is used instead of `.to_numpy()` so the cell is
# idempotent: re-running it on values that are already ndarrays is a no-op
# rather than an AttributeError.
y_train = np.asarray(y_train)
y_valid = np.asarray(y_valid)
y_test = np.asarray(y_test)

In [None]:
def make_categorical(labels, classes=(2, 3, 6)):
  """One-hot encode class labels.

  Args:
    labels: 1-D array-like of label codes (list, numpy array, or pandas
      Series; a Series is read by position, not by index label).
    classes: label codes in column order. Column ``j`` of the result is 1
      where the label equals ``classes[j]``. Defaults to ``(2, 3, 6)``,
      the codes this notebook uses.

  Returns:
    Float array of shape ``(len(labels), len(classes))``. A label that is
    not in ``classes`` yields an all-zero row (same as the original loop).
  """
  # Flatten to a positional 1-D array so Series with a shuffled index work.
  labels = np.asarray(labels).reshape(-1)
  classes = np.asarray(classes)
  # Broadcast (n, 1) against (1, k) to compare every label with every class.
  return (labels[:, None] == classes[None, :]).astype(float)

In [None]:
def single_predictions(softmax_predictions):
  """Reduce each row of per-class scores to the index of its largest entry.

  Args:
    softmax_predictions: sequence of rows, each row a sequence of scores.

  Returns:
    A list with one argmax index per input row.
  """
  return [np.argmax(row) for row in softmax_predictions]

In [None]:
# One-hot encode the labels of every split with the same helper.
y_train_categorical, y_valid_categorical, y_test_categorical = (
    make_categorical(split_labels)
    for split_labels in (y_train, y_valid, y_test)
)

# Machine learning algorithms

## Neural Networks

In [None]:
# nn1: two sigmoid hidden layers (12 then 8 units) over 60 input features,
# followed by a 3-way softmax output.
nn1 = Sequential([
    Dense(12, input_shape=(60,), activation='sigmoid'),
    Dense(8, activation='sigmoid'),
    Dense(3, activation='softmax'),
])

nn1.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train nn1 on the standardized training split (60 epochs, mini-batches of 10).
nn1.fit(
    x=x_train,
    y=y_train_categorical,
    batch_size=10,
    epochs=60,
)

In [None]:
# Values returned by evaluate() for nn1 on the validation split
# (loss followed by the compiled metrics).
print(
    nn1.evaluate(x=x_valid, y=y_valid_categorical)
)

In [None]:
# Validation confusion matrix for nn1.
# Rows are true classes and columns are predicted classes. Both are expressed
# as one-hot column indices 0/1/2 and displayed with the label codes 2/3/6.
# (The leftover debug print of the whole one-hot label array was removed.)
cm = confusion_matrix(
    single_predictions(y_valid_categorical),
    single_predictions(nn1.predict(x_valid)),
    labels=[0, 1, 2],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=[2, 3, 6])
disp.plot()
# Title the figure so it stands alone when the notebook is skimmed.
plt.title("NN Two Small Sigmoid Layers Confusion Matrix (Validation)")
plt.show()

In [None]:
# nn2: a single wide hidden layer (300 ReLU units) over 60 input features,
# followed by a 3-way softmax output.
nn2 = Sequential()
# Use the documented lowercase identifier 'relu'. The capitalised 'ReLU' is
# not a documented activation name and is not guaranteed to resolve on every
# Keras version.
nn2.add(Dense(300, input_shape=(60,), activation='relu'))
nn2.add(Dense(3, activation='softmax'))

nn2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train nn2 on the standardized training split (30 epochs, mini-batches of 10).
nn2.fit(
    x=x_train,
    y=y_train_categorical,
    batch_size=10,
    epochs=30,
)

In [None]:
# Values returned by evaluate() for nn2 on the validation split
# (loss followed by the compiled metrics).
print(
    nn2.evaluate(x=x_valid, y=y_valid_categorical)
)

In [None]:
# Validation confusion matrix for nn2.
# Rows are true classes and columns are predicted classes. Both are expressed
# as one-hot column indices 0/1/2 and displayed with the label codes 2/3/6.
# (The leftover debug print of the whole one-hot label array was removed.)
cm = confusion_matrix(
    single_predictions(y_valid_categorical),
    single_predictions(nn2.predict(x_valid)),
    labels=[0, 1, 2],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=[2, 3, 6])
disp.plot()
# Title the figure so it stands alone when the notebook is skimmed.
plt.title("NN One Large ReLU Layer Confusion Matrix (Validation)")
plt.show()

In [None]:
# nn3: a deep, narrow network with twelve sigmoid hidden layers of 20 units
# each (the first one declares the 60-feature input), followed by a 3-way
# softmax output. Layers are created in the same order as before.
nn3 = Sequential(
    [Dense(20, input_shape=(60,), activation='sigmoid')]
    + [Dense(20, activation='sigmoid') for _ in range(11)]
    + [Dense(3, activation='softmax')]
)

nn3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train nn3 on the standardized training split (60 epochs, mini-batches of 10).
nn3.fit(
    x=x_train,
    y=y_train_categorical,
    batch_size=10,
    epochs=60,
)

In [None]:
# Values returned by evaluate() for nn3 on the validation split
# (loss followed by the compiled metrics).
print(
    nn3.evaluate(x=x_valid, y=y_valid_categorical)
)

In [None]:
# Validation confusion matrix for nn3.
# Rows are true classes and columns are predicted classes. Both are expressed
# as one-hot column indices 0/1/2 and displayed with the label codes 2/3/6.
# (The leftover debug print of the whole one-hot label array was removed.)
cm = confusion_matrix(
    single_predictions(y_valid_categorical),
    single_predictions(nn3.predict(x_valid)),
    labels=[0, 1, 2],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=[2, 3, 6])
disp.plot()
# Title the figure so it stands alone when the notebook is skimmed.
plt.title("NN Thirteen Sigmoid Layers Confusion Matrix (Validation)")
plt.show()

## Testing the Models

In [None]:
# Held-out test evaluation of nn1: the values returned by evaluate(), a
# confusion matrix, and per-class precision / recall.
print(nn1.evaluate(x_test, y_test_categorical))

# Rows are true classes, columns are predicted classes (one-hot indices 0/1/2).
cm = confusion_matrix(
    single_predictions(y_test_categorical),
    single_predictions(nn1.predict(x_test)),
    labels=[0, 1, 2],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=[2, 3, 6])
disp.plot()
plt.title("NN Two Small Sigmoid Layers Confusion Matrix")
plt.show()

# Named `cm_table` rather than `metrics` so the `sklearn.metrics` module
# imported at the top of the notebook is not shadowed. Rows/columns carry the
# class codes so the precision/recall output is keyed by 2/3/6, not 0/1/2.
cm_table = pd.DataFrame(cm, index=[2, 3, 6], columns=[2, 3, 6], dtype=int)
# `.da` is the DataFrame accessor registered by the `disarray` import.
cm_table.da.precision, cm_table.da.recall

In [None]:
# Held-out test evaluation of nn2: the values returned by evaluate(), a
# confusion matrix, and per-class precision / recall.
print(nn2.evaluate(x_test, y_test_categorical))

# Rows are true classes, columns are predicted classes (one-hot indices 0/1/2).
cm = confusion_matrix(
    single_predictions(y_test_categorical),
    single_predictions(nn2.predict(x_test)),
    labels=[0, 1, 2],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=[2, 3, 6])
disp.plot()
plt.title("NN One Large ReLU Layer Confusion Matrix")
plt.show()

# Named `cm_table` rather than `metrics` so the `sklearn.metrics` module
# imported at the top of the notebook is not shadowed. Rows/columns carry the
# class codes so the precision/recall output is keyed by 2/3/6, not 0/1/2.
cm_table = pd.DataFrame(cm, index=[2, 3, 6], columns=[2, 3, 6], dtype=int)
# `.da` is the DataFrame accessor registered by the `disarray` import.
cm_table.da.precision, cm_table.da.recall

In [None]:
# Held-out test evaluation of nn3: the values returned by evaluate(), a
# confusion matrix, and per-class precision / recall.
print(nn3.evaluate(x_test, y_test_categorical))

# Rows are true classes, columns are predicted classes (one-hot indices 0/1/2).
cm = confusion_matrix(
    single_predictions(y_test_categorical),
    single_predictions(nn3.predict(x_test)),
    labels=[0, 1, 2],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=[2, 3, 6])
disp.plot()
plt.title("NN Thirteen Sigmoid Layers Confusion Matrix")
plt.show()

# Named `cm_table` rather than `metrics` so the `sklearn.metrics` module
# imported at the top of the notebook is not shadowed. Rows/columns carry the
# class codes so the precision/recall output is keyed by 2/3/6, not 0/1/2.
cm_table = pd.DataFrame(cm, index=[2, 3, 6], columns=[2, 3, 6], dtype=int)
# `.da` is the DataFrame accessor registered by the `disarray` import.
cm_table.da.precision, cm_table.da.recall