In [None]:
# Midterm homework: optimizing Pima Indians diabetes prediction
import tensorflow as tf
import numpy as np

In [None]:
import os

# Process-level settings: TensorFlow C++ log level, CUDA device enumeration
# order, and which CUDA device index is exposed to this process.
# NOTE(review): tensorflow is already imported in the first cell; the log-level
# variable may need to be set before that import to take effect - confirm.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": '0',  # Set to -1 if CPU should be used: CPU = -1, GPU = 0
})

gpus = tf.config.experimental.list_physical_devices('GPU')
cpus = tf.config.experimental.list_physical_devices('CPU')

# Report the devices TensorFlow can see. GPUs take precedence; the CPU count is
# printed only when no GPU was found. A RuntimeError from either path is
# printed instead of aborting the cell.
try:
    if gpus:
        # Enable memory growth on every GPU (the setting needs to be the same
        # across GPUs, and must be applied before the GPUs are initialized).
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    elif cpus:
        logical_cpus = tf.config.experimental.list_logical_devices('CPU')
        print(len(cpus), "Physical CPU,", len(logical_cpus), "Logical CPU")
except RuntimeError as e:
    print(e)

In [None]:
from pandas import read_csv
from numpy import set_printoptions

# <Option 1> change scaler/normalizer using scikit-learn module

from sklearn.preprocessing import StandardScaler

# Read the Pima Indians diabetes CSV. Column names are passed explicitly via
# `names`, so they are not taken from the file itself.
pimafilename = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(pimafilename, names=names)
xy = dataframe.values

# All columns except the last are features; the last column ('class') is the
# label. Indexing with the list [-1] keeps y_data two-dimensional.
x_original = xy[:, 0:-1]
y_data = xy[:, [-1]]

# Fit the scaler once and reuse the fitted statistics for the transform
# (previously the scaler was fitted a second time by fit_transform on the
# same data, which was redundant).
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook, so test rows influence the
# scaling statistics. Fitting on the training rows only would avoid that leak.
scaler = StandardScaler().fit(x_original)
x_data = scaler.transform(x_original)

In [None]:
import pprint

# Preview the first five rows of the raw features, the scaled features and
# the labels, in that order.
for preview in (x_original, x_data, y_data):
    pprint.pprint(preview[:5])

In [None]:
# Shapes of the scaled feature matrix and the label column, on one line.
print(*(arr.shape for arr in (x_data, y_data)))

In [None]:
# split into train test sets
from sklearn.model_selection import train_test_split

# <Caveat> Do not change train:test ratios

# Fixed seed so that a fresh "Restart & Run All" puts the same rows in the
# train and test sets every time; change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.33,
    random_state=SPLIT_SEED,
)

In [None]:
# Shapes of the train features/labels followed by the test features/labels.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

In [None]:
# optimizing: start over at this point after changing your DNN model
# Creates an empty Sequential model and stores it as the `model` attribute on
# the imported `tf` module object; the later cells refer to it as `tf.model`.
tf.model = tf.keras.Sequential()

In [None]:
# Multi-variable input: x_data.shape[1] is the feature count (8 in this case).

# <Option 2> change DNN layers, activation functions, dimensionality of output space (units), initializers, regularizers
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense

# Two ReLU hidden layers with 12 and 8 units.
tf.model.add(
    tf.keras.layers.Dense(units=12, activation='relu')
)
tf.model.add(
    tf.keras.layers.Dense(units=8, activation='relu')
)

# Output layer - do not change it: a single sigmoid unit, Dense(1, activation='sigmoid').
tf.model.add(
    tf.keras.layers.Dense(units=1, activation='sigmoid')
)

In [None]:
# <Option 3> change optimizer, metrics, loss_weights, learning rate
# https://www.tensorflow.org/api_docs/python/tf/keras/Model#compile

# Use the `learning_rate` keyword: the old `lr` alias is deprecated and is
# rejected by newer Keras releases, so the optimizer would either fail to
# construct or not receive the intended 0.02 step size.
tf.model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.02),
    metrics=['accuracy'],
)

In [None]:
# <Option 4> change epochs
# batch_size: use a smaller batch_size on a machine with little memory
# https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit

# Train silently (verbose=0) on the training split; the returned object keeps
# the per-epoch metrics that the following cells read from `history.history`.
history = tf.model.fit(
    x=x_train,
    y=y_train,
    epochs=500,
    batch_size=32,
    verbose=0,
)

# Optional alternative with a progress bar (left disabled):
#from tqdm.keras import TqdmCallback
#history = tf.model.fit(x_train, y_train, batch_size=32, epochs=500, verbose=0, callbacks=[TqdmCallback(verbose=0)])

In [None]:
# Print the layer-by-layer summary of the model.
# NOTE(review): no input shape is declared where the layers are added, so this
# presumably has to run after fit() has built the model - confirm before
# moving this cell earlier.
tf.model.summary()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(history.history['loss'])
plt.plot(history.history['accuracy'])
plt.title('Model loss & accuracy')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['loss', 'accuracy'], loc='upper left')
plt.show()

pprint.pprint(history.history['accuracy'][-5:])
pprint.pprint(history.history['loss'][-5:])

In [None]:
# Accuracy recorded on the training data for the final epoch.
print("Accuracy: {}".format(history.history['accuracy'][-1]))

In [None]:
# <Caveat> Training accuracy can reach 100%, but the goal is the best test accuracy. Do not focus on reaching 100% accuracy on the training set.
# Instead, consider two factors: (1) maximizing accuracy and (2) minimizing loss.
# <Caveat> Deep-learning results depend on random numbers, so running train_test_split more than once can give different results.

# Evaluate on the held-out test split; the result is indexed as
# [0] = loss and [1] = accuracy.
evaluate = tf.model.evaluate(x_test, y_test)
print("loss: {0[0]}, accuracy: {0[1]}".format(evaluate))