## Download Dataset/Import Dependencies


In [1]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
!kaggle datasets download iammustafatz/diabetes-prediction-dataset

Dataset URL: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset
License(s): copyright-authors
Downloading diabetes-prediction-dataset.zip to /content
  0% 0.00/734k [00:00<?, ?B/s]
100% 734k/734k [00:00<00:00, 34.5MB/s]


In [3]:
!unzip diabetes-prediction-dataset.zip

Archive:  diabetes-prediction-dataset.zip
  inflating: diabetes_prediction_dataset.csv  


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input

import numpy as np
import pandas as pd

## Data Exploration/Transformation

In [5]:
df = pd.read_csv("diabetes_prediction_dataset.csv")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [7]:
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Encode categorical features
X = pd.get_dummies(X, columns=['gender', 'smoking_history'])

# Split data into training, validation, and testing sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

## Initial baseline models

As a baseline to compare the Neural Network to, we will train a RandomForest model on the data and compare how well it does to the neural network.

In [10]:
# Train a RandomForest Classifier
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_val)

# Compute metrics
rf_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rf_r2 = r2_score(y_val, y_pred)

# Test predictions
y_test_pred = rf_model.predict(X_test)
rf_test_accuracy = accuracy_score(y_test, y_test_pred)

# Print results
print(f"RandomForest RMSE: {rf_rmse:.4f}")
print(f"RandomForest R2 Score: {rf_r2:.4f}")
print(f"RandomForest Classifier Accuracy (test set): {rf_test_accuracy:.4f}")


RandomForest RMSE: 0.1715
RandomForest R2 Score: 0.6176
RandomForest Classifier Accuracy (test set): 0.9696


## Training the Feed-forward Neural Network

In [12]:
# Define Model
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/20
[1m1993/1993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9096 - loss: 0.5428 - val_accuracy: 0.9160 - val_loss: 0.2336
Epoch 2/20
[1m1993/1993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9141 - loss: 0.2353 - val_accuracy: 0.9160 - val_loss: 0.1952
Epoch 3/20
[1m1993/1993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.9129 - loss: 0.1946 - val_accuracy: 0.9160 - val_loss: 0.1887
Epoch 4/20
[1m1993/1993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.9383 - loss: 0.1671 - val_accuracy: 0.9392 - val_loss: 0.1452
Epoch 5/20
[1m1993/1993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9475 - loss: 0.1476 - val_accuracy: 0.9578 - val_loss: 0.1295
Epoch 6/20
[1m1993/1993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9504 - loss: 0.1411 - val_accuracy: 0.9368 - val_loss: 0.1565
Epoch 7/20
[

<keras.src.callbacks.history.History at 0x7b015c3bd490>

In [8]:
model.summary()

### Tune the Model

In [14]:
num_nodes_layer_1 = [16, 32, 64]
num_nodes_layer_2 = [16, 32, 64]
num_epochs = [5,10,20]

for n1 in num_nodes_layer_1:
    for n2 in num_nodes_layer_2:
        for e in num_epochs:
            model = Sequential([
                Input(shape=(X_train.shape[1],)),
                Dense(n1, activation = 'relu'),
                Dense(n2, activation = 'relu'),
                Dense(1, activation = 'sigmoid')
            ])
            model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics= ['accuracy'])
            history = model.fit(x = X_train, y = y_train, validation_data = (X_val, y_val), epochs = e, batch_size = 16, verbose=0)
            print(f'Nodes in layer 1: {n1}, Nodes in layer 2: {n2}, Epochs: {e} Accuracy: {model.evaluate(X_val, y_val)[1]}')

[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9592 - loss: 0.1088
Nodes in layer 1: 16, Nodes in layer 2: 16, Epochs: 5 Accuracy: 0.9583058953285217
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9640 - loss: 0.1025
Nodes in layer 1: 16, Nodes in layer 2: 16, Epochs: 10 Accuracy: 0.9644705653190613
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9594 - loss: 0.1038
Nodes in layer 1: 16, Nodes in layer 2: 16, Epochs: 20 Accuracy: 0.9587293863296509
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9565 - loss: 0.1221
Nodes in layer 1: 16, Nodes in layer 2: 32, Epochs: 5 Accuracy: 0.9555293917655945
[1m665/665[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9624 - loss: 0.1008
Nodes in layer 1: 16, Nodes in layer 2: 32, Epochs: 10 Accuracy: 0.9626352787017822
[1m665/665[0m [32m━━━━━━━━━━━━━━━━

Constrained by time, I tested a feed-forward neural network with 2 hidden layers and tuned the follwoing parameters: number of nodes in the hidden layers of either 16, 32, or 64 and the number epochs at values 5, 10, or 20.

Ultimately the best of the 27 variations tried was with 32 nodes in the first hidden layer, 64 in the second, and 20 epochs. This variation was able to achieve an accuracy of 0.966.

Overall, the variations did not make a huge difference with only worst accuracy reaching 0.951.


## Accuracy in the Test Set

In [13]:
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_val, y_val))

Epoch 1/20
[1m3985/3985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 5ms/step - accuracy: 0.9103 - loss: 0.3022 - val_accuracy: 0.9357 - val_loss: 0.1888
Epoch 2/20
[1m3985/3985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.9372 - loss: 0.1811 - val_accuracy: 0.9532 - val_loss: 0.1306
Epoch 3/20
[1m3985/3985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step - accuracy: 0.9460 - loss: 0.1494 - val_accuracy: 0.9530 - val_loss: 0.1263
Epoch 4/20
[1m3985/3985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - accuracy: 0.9520 - loss: 0.1326 - val_accuracy: 0.9554 - val_loss: 0.1259
Epoch 5/20
[1m3985/3985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9557 - loss: 0.1242 - val_accuracy: 0.9583 - val_loss: 0.1202
Epoch 6/20
[1m3985/3985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9573 - loss: 0.1188 - val_accuracy: 0.9573 - val_loss: 0.1137
Epoch 7/20

[0.09756916761398315, 0.9639333486557007]

In [14]:
model.summary()

In [18]:
# Evaluate Accuracy with the test set
evaluation = model.evaluate(X_test, y_test)
print(f"Feed-forward Neural Network Test Set Accuracy: {evaluation[1]}")

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9639 - loss: 0.0960
Feed-forward Neural Network Test Set Accuracy: 0.9639333486557007


In [19]:
print(f"RandomForest Classifier Test Set Accuracy: {rf_test_accuracy:.4f}")

RandomForest Classifier Test Set Accuracy: 0.9696


## Conclusion

The RandomForest Classifier performed slightly better on the test set, but ultimately both were very close.