In [None]:
# Goal 1: build a random forest (RF) classifier on the diabetes data.
# Goal 2: build feed-forward neural network (FFNN) classifiers on the same data.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score, roc_curve, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Column labels applied to the CSV. Passing header=0 together with `names`
# makes pandas replace the file's own header row with these labels.
names = [
    "times_pregnant",
    "glucose_tolerance_test",
    "blood_pressure",
    "skin_thickness",
    "insulin",
    "bmi",
    "pedigree_function",
    "age",
    "has_diabetes",
]
diabetes_df = pd.read_csv('data/diabetes.csv', header=0, names=names)

In [None]:
# Preview the first rows of the loaded frame.
diabetes_df.head()

In [None]:
# Report (rows, columns), then show five randomly drawn rows. The fixed
# random_state keeps the displayed sample identical across re-runs.
print(diabetes_df.shape)
diabetes_df.sample(5, random_state=11111)

In [None]:
# Feature matrix: every column except the last one.
X = diabetes_df.iloc[:, :-1].to_numpy()
# Target vector: the has_diabetes column.
y = diabetes_df["has_diabetes"].to_numpy()

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=11111,
)

In [None]:
# Mean of y and mean of (1 - y). Assuming the labels are 0/1, these are the
# fractions of positive and negative rows — TODO confirm label encoding.
np.mean(y), np.mean(1-y)

In [None]:
# Random forest of 200 trees. The fixed random_state makes the bootstrap
# samples and feature choices, and therefore the reported metrics, repeatable.
rf_model = RandomForestClassifier(n_estimators=200, random_state=11111)
rf_model.fit(X_train, y_train)

In [None]:
# Make predictions on the test set - both "hard" class predictions and the
# per-class scores (one column per class).
# NOTE(review): scikit-learn documents a forest's predict_proba as the mean of
# the trees' predicted class probabilities, which is not strictly the share of
# trees voting yes.
y_pred_class_rf = rf_model.predict(X_test)
y_pred_prob_rf = rf_model.predict_proba(X_test)

In [None]:
# Score the random forest on the held-out split; column 1 of the probability
# matrix is used as the score for ROC-AUC.
print(f'accuracy is {accuracy_score(y_test, y_pred_class_rf):.3f}')
print(f'roc-auc is {roc_auc_score(y_test, y_pred_prob_rf[:, 1]):.3f}')

In [None]:
# ROC curve of the fitted forest on the test split. metrics.plot_roc_curve was
# deprecated in scikit-learn 1.0 and removed in 1.2; RocCurveDisplay.from_estimator
# is its replacement and takes the same (estimator, X, y) arguments.
metrics.RocCurveDisplay.from_estimator(rf_model, X_test, y_test)

In [None]:
## Import Keras objects for Deep Learning
from tensorflow.keras.models  import Sequential
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD, RMSprop

In [None]:
## Standardise the features: the scaler is fitted on the training split only
## and then applied unchanged to both splits.
normalizer = StandardScaler().fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [None]:
# Network 1: one hidden layer of 12 sigmoid units on the 8 input features,
# followed by a single sigmoid output unit.
model_1 = Sequential([
    Dense(12, input_shape=(8,), activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of model_1.
model_1.summary()

In [None]:
# Plain SGD with binary cross-entropy loss. `learning_rate` is the supported
# keyword; the old `lr` alias is deprecated and rejected by newer Keras releases.
model_1.compile(SGD(learning_rate=0.003), "binary_crossentropy", metrics=["accuracy"])
# The test split is passed as validation data so that validation loss and
# accuracy are recorded in the returned history alongside the training values.
run_hist_1 = model_1.fit(X_train_norm, y_train, validation_data=(X_test_norm, y_test), epochs=200)

In [None]:
# Raw sigmoid outputs of model_1 on the normalised test split.
y_pred_prob_nn_1 = model_1.predict(X_test_norm)
# Hard class labels are derived by hand, following the Keras guidance:
#   * multi-class model (softmax last layer): np.argmax(model.predict(x), axis=-1)
#   * binary model (sigmoid last layer):      (model.predict(x) > 0.5).astype("int32")
# model_1 ends in a sigmoid unit, so the 0.5 threshold form applies.
y_pred_class_nn_1 = (y_pred_prob_nn_1>0.5).astype("int32")

In [None]:
# First ten thresholded class predictions of model_1.
y_pred_class_nn_1[:10]

In [None]:
# First ten raw model_1 outputs, i.e. the values the 0.5 threshold was applied to.
y_pred_prob_nn_1[:10]

In [None]:
## Get accuracy and ROC-AUC of model_1 on the test split
# '{:.3f}' rounds to three decimals. The previous '{:3f}' only set a minimum
# field width of 3 and printed the default six decimals.
print('accuracy is {:.3f}'.format(accuracy_score(y_test, y_pred_class_nn_1)))
print('roc-auc is {:.3f}'.format(roc_auc_score(y_test, y_pred_prob_nn_1)))

In [None]:
## Plot ROC curve

def plot_roc(y_test, y_pred, model_name):
    """Draw the ROC curve for one model's scores on a new 8x8 inch figure.

    Parameters
    ----------
    y_test : array-like
        True labels, forwarded unchanged to ``roc_curve``.
    y_pred : array-like
        Predicted scores, forwarded unchanged to ``roc_curve``.
    model_name : str
        Name inserted into the plot title.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # roc_curve also returns the thresholds, which the plot does not need.
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    _, ax = plt.subplots(figsize=(8, 8))
    ax.plot(fpr, tpr, 'k-', label='auc-roc')
    # Diagonal reference line: the ROC curve of a random model.
    ax.plot([0, 1], [0, 1], 'k--', linewidth=.5, label='random')
    ax.grid(True)
    # Axis limits are padded slightly so the curve is not drawn on the frame.
    ax.set(title='ROC Curve for {} on PIMA diabetes problem'.format(model_name),
           xlabel='False positive rate', ylabel='True positive rate',
           xlim=[-0.01, 1.01], ylim=[-0.01, 1.01])
    ax.legend()
    
# ROC curve of model_1 from its raw test-set outputs.
plot_roc(y_test, y_pred_prob_nn_1, 'NN1')

In [None]:
## Plot losses
# List the names under which the training history of model_1 was recorded.
run_hist_1.history.keys()

In [None]:
# Training and validation loss of the first model_1 run, one point per epoch.
fig, ax = plt.subplots()
for history_key, colour, caption in (
    ('loss', 'r', 'Train Loss'),
    ('val_loss', 'b', 'Validation Loss'),
):
    ax.plot(run_hist_1.history[history_key], colour, marker='.', label=caption)
ax.legend()

In [None]:
# Call fit again on the same model_1 object for 1000 more epochs.
# NOTE(review): this presumably resumes from the weights reached by the first
# run (the combined plot further down treats it as a continuation) — confirm.
run_hist_1b = model_1.fit(X_train_norm, y_train, validation_data=(X_test_norm, y_test), epochs=1000)

In [None]:
# Put both model_1 training runs on one shared epoch axis: run 1 occupies
# epochs [0, n) and run 2 is shifted to [n, n + m).
n = len(run_hist_1.history["loss"])
m = len(run_hist_1b.history['loss'])
first_run_epochs = range(n)
second_run_epochs = range(n, n + m)

fig, ax = plt.subplots(figsize=(16, 8))

# Training loss of both runs.
ax.plot(first_run_epochs, run_hist_1.history["loss"],
        'r', marker='.', label="Train Loss - Run 1")
ax.plot(second_run_epochs, run_hist_1b.history["loss"],
        'hotpink', marker='.', label="Train Loss - Run 2")

# Validation loss of both runs.
ax.plot(first_run_epochs, run_hist_1.history["val_loss"],
        'b', marker='.', label="Validation Loss - Run 1")
ax.plot(second_run_epochs, run_hist_1b.history["val_loss"],
        'LightSkyBlue', marker='.', label="Validation Loss - Run 2")

ax.legend()

### Exercise 2

In [None]:
# Network 2: two hidden ReLU layers of 6 units each on the 8 input features,
# followed by a single sigmoid output unit.
model_2 = Sequential([
    Dense(6, input_shape=(8,), activation='relu'),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of model_2.
model_2.summary()

In [None]:
# SGD at learning rate 0.003 with binary cross-entropy, tracking accuracy.
model_2.compile(
    optimizer=SGD(learning_rate=0.003),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train for 1500 epochs, using the test split as validation data.
run_hist_2 = model_2.fit(
    X_train_norm,
    y_train,
    epochs=1500,
    validation_data=(X_test_norm, y_test),
)

In [None]:
# Raw sigmoid outputs of model_2, then hard labels via the 0.5 threshold.
y_pred_prob_nn_2 = model_2.predict(X_test_norm)
y_pred_class_nn_2 = (y_pred_prob_nn_2 > 0.5).astype("int32")

# An empty line separates the metrics from the output printed by predict().
print('')
print(f'accuracy is {accuracy_score(y_test, y_pred_class_nn_2):.3f}')
print(f'roc-auc is {roc_auc_score(y_test, y_pred_prob_nn_2):.3f}')

In [None]:
# ROC curve of model_2. The title label now names this model; it previously
# read 'NN-3' although the scores come from model_2.
plot_roc(y_test, y_pred_prob_nn_2, 'NN2')

In [None]:
# List the names under which the training history of model_2 was recorded.
run_hist_2.history.keys()

In [None]:
n = len(run_hist_2.history["loss"])
# Start from an empty figure. plt.subplots() would also create a full-size
# axes that is never drawn on and stays visible underneath the two panels.
fig = plt.figure(figsize=(16, 8))

# Left panel: training vs. validation loss per epoch.
ax = fig.add_subplot(1, 2, 1)
ax.plot(run_hist_2.history['loss'], 'r', marker='.', label='Train Loss')
ax.plot(run_hist_2.history['val_loss'], 'b', marker='.', label='Val Loss')
ax.legend()
ax.set_title('Loss over iterations')

# Right panel: training vs. validation accuracy per epoch.
ax = fig.add_subplot(1, 2, 2)
ax.plot(run_hist_2.history["accuracy"], 'r.', label="Train Acc")
ax.plot(run_hist_2.history["val_accuracy"], 'b.', label="Validation Acc")
ax.legend(loc='lower right')
ax.set_title('Accuracy over iterations')

### Exercise 3

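- Three hidden layers with 4, 5, and 6 nodes, respectively
- ReLU activation for the hidden layers
- Sigmoid activation for the output layer
- Learning rate = 0.003 and 500 epochs
- Plot the training and validation losses
- Plot the ROC curve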

In [None]:
# Network 3: three hidden ReLU layers of 4, 5 and 6 units on the 8 input
# features, followed by a single sigmoid output unit.
model_3 = Sequential([
    Dense(4, input_shape=(8,), activation='relu'),
    Dense(5, activation='relu'),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid'),
])
                  

In [None]:
# Print the layer-by-layer summary of model_3.
model_3.summary()

In [None]:
# Adam at learning rate 0.003 with binary cross-entropy, tracking accuracy.
model_3.compile(Adam(learning_rate=0.003), 'binary_crossentropy', metrics=["accuracy"])
# Train for the 500 epochs the exercise statement asks for (the call previously
# used 1000); the test split is used as validation data.
run_hist_3 = model_3.fit(X_train_norm, y_train, validation_data=(X_test_norm, y_test), epochs=500)

In [None]:
# Raw sigmoid outputs of model_3 on the normalised test split.
y_pred_prob_nn_3 = model_3.predict(X_test_norm)
# Hard class labels: 1 where the output exceeds 0.5, otherwise 0.
y_pred_class_nn_3 = (y_pred_prob_nn_3>0.5).astype('int32')

In [None]:
# Test-set accuracy and ROC-AUC of model_3, printed at full precision.
print(f'accuracy: {accuracy_score(y_test, y_pred_class_nn_3)}')
print(f'auc_roc: {roc_auc_score(y_test, y_pred_prob_nn_3)}')


In [None]:
# List the names under which the training history of model_3 was recorded.
run_hist_3.history.keys()



In [None]:
# Two side-by-side panels for model_3: loss on the left, accuracy on the right.
fig = plt.figure(figsize=(16,8))

# Left panel: training vs. validation loss per epoch.
ax = fig.add_subplot(1,2,1)
ax.plot(run_hist_3.history['loss'], 'r', marker= '.', label ='Train loss')
ax.plot(run_hist_3.history['val_loss'], 'b', marker= '.', label ='Val loss')
ax.set_title('Loss over iterations')
ax.legend()

# Right panel: training vs. validation accuracy per epoch.
ax = fig.add_subplot(1,2,2)
ax.plot(run_hist_3.history['accuracy'], 'r', marker= '.', label ='Train acc')
ax.plot(run_hist_3.history['val_accuracy'], 'b', marker= '.', label ='Val acc')
ax.set_title('Accuracy over iterations')
ax.legend()

In [None]:
# ROC curve of model_3. This calls the notebook's plot_roc helper instead of
# repeating its body, so the figure is drawn the same way as for the other networks.
plot_roc(y_test, y_pred_prob_nn_3, 'NN3')
