# DATA RETRIEVAL

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
stroke_df = pd.read_csv('cleaned_data.csv')
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Data Cleaning

In [5]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
#Drop null values
stroke_df = stroke_df.dropna()
#Drop duplicates
stroke_df = stroke_df.drop_duplicates()
stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [7]:
X = stroke_df.drop(columns = ['id','stroke'])
y = stroke_df['stroke']

In [8]:
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked


In [9]:
y.value_counts()

0    4700
1     209
Name: stroke, dtype: int64

# Data Transformation

In [10]:
X = pd.get_dummies(X)

In [11]:
X.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,0,1,0,0,1,...,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,0,1,0,0,1,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,0,1,0,0,1,...,0,1,0,0,0,1,0,1,0,0


In [12]:
# Split the dataset using train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [13]:
# Instantiate a StandardScaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Create a Keras Sequential model and add more than one Dense hidden layer
import tensorflow as tf
nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=6, activation="relu", input_dim=21))

nn_model.add(tf.keras.layers.Dense(units=6, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 6)                 132       
                                                                 
 dense_1 (Dense)             (None, 6)                 42        
                                                                 
 dense_2 (Dense)             (None, 1)                 7         
                                                                 
Total params: 181 (724.00 Byte)
Trainable params: 181 (724.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [16]:
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.1850 - accuracy: 0.9471 - 194ms/epoch - 5ms/step
Loss: 0.18496212363243103, Accuracy: 0.9470683932304382


In [17]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [18]:
predictions = nn_model.predict(X_test_scaled)
predictions = np.round(predictions, decimals = 0)



In [19]:
y_test.value_counts()

0    1170
1      58
Name: stroke, dtype: int64

In [20]:
#accuracy_score
accuracy_score(y_test,predictions)

0.9470684039087948

In [21]:
#confusion_matrix and classification report
confusion_matrix(y_test,predictions)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame( cm, index=['Actual No Stroke', "Actual Stroke"], columns = ['Predicted No Stroke', 'Predicted Stroke'] )
rf_acc_score = accuracy_score(y_test,predictions)
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,1163,7
Actual Stroke,58,0


Accuracy Score : 0.9470684039087948
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1170
           1       0.00      0.00      0.00        58

    accuracy                           0.95      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.91      0.95      0.93      1228



# OPTIMISE DEEP LEARNING MODEL

In [24]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.2-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m955.8 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core (from keras-tuner)
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Collecting namex (from keras-core->keras-tuner)
  Downloading namex-0.0.7-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, kt-legacy, keras-core, keras-tuner
Successfully installed keras-core-0.1.7 keras-tuner-1.4.2 kt-legacy-1.0.5 namex-0.0.7


In [25]:
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=5), activation=activation, input_dim=21))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 5)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=30,
            step=5),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [26]:
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=100,
    hyperband_iterations=2)

Using TensorFlow backend


In [None]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 501 Complete [00h 00m 43s]
val_accuracy: 0.9527687430381775

Best val_accuracy So Far: 0.9552116990089417
Total elapsed time: 00h 58m 17s

Search: Running Trial #502

Value             |Best Value So Far |Hyperparameter
relu              |tanh              |activation
16                |26                |first_units
2                 |1                 |num_layers
1                 |21                |units_0
6                 |26                |units_1
6                 |16                |units_2
26                |16                |units_3
16                |11                |units_4
100               |34                |tuner/epochs
34                |12                |tuner/initial_epoch
1                 |3                 |tuner/bracket
1                 |2                 |tuner/round
0491              |0190              |tuner/trial_id

Epoch 35/100


In [29]:
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'tanh', 'first_units': 26, 'num_layers': 1, 'units_0': 21, 'units_1': 26, 'units_2': 16, 'units_3': 16, 'units_4': 11, 'tuner/epochs': 34, 'tuner/initial_epoch': 12, 'tuner/bracket': 3, 'tuner/round': 2, 'tuner/trial_id': '0190'}
{'activation': 'tanh', 'first_units': 26, 'num_layers': 1, 'units_0': 21, 'units_1': 26, 'units_2': 16, 'units_3': 16, 'units_4': 11, 'tuner/epochs': 100, 'tuner/initial_epoch': 34, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0204'}
{'activation': 'tanh', 'first_units': 26, 'num_layers': 4, 'units_0': 11, 'units_1': 1, 'units_2': 6, 'units_3': 21, 'units_4': 21, 'tuner/epochs': 34, 'tuner/initial_epoch': 12, 'tuner/bracket': 4, 'tuner/round': 3, 'tuner/trial_id': '0385'}


In [30]:
top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.1579 - accuracy: 0.9552 - 284ms/epoch - 7ms/step
Loss: 0.15790800750255585, Accuracy: 0.9552116990089417
39/39 - 0s - loss: 0.1623 - accuracy: 0.9552 - 292ms/epoch - 7ms/step
Loss: 0.16231179237365723, Accuracy: 0.9552116990089417
39/39 - 0s - loss: 0.1603 - accuracy: 0.9552 - 317ms/epoch - 8ms/step
Loss: 0.16028860211372375, Accuracy: 0.9552116990089417


In [None]:
first_hyper = tuner.get_best_hyperparameters(2)[0]
first_hyper.values

In [None]:
second_model = tuner.get_best_models(2)[0]
model_loss, model_accuracy = second_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Using RandomOverSampler


In [22]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
# Instantiate the random oversampler model
ros = RandomOverSampler(random_state=1)
# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [23]:
y_resampled.value_counts()

0    3530
1    3530
Name: stroke, dtype: int64

In [24]:
ros_model = tf.keras.models.Sequential()

ros_model.add(tf.keras.layers.Dense(units=21, activation="relu", input_dim=21))

ros_model.add(tf.keras.layers.Dense(units=21, activation="relu"))

ros_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
ros_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 21)                462       
                                                                 
 dense_4 (Dense)             (None, 21)                462       
                                                                 
 dense_5 (Dense)             (None, 1)                 22        
                                                                 
Total params: 946 (3.70 KB)
Trainable params: 946 (3.70 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
ros_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
ros_fit_model = ros_model.fit(X_resampled, y_resampled, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [26]:
model_loss, model_accuracy = ros_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 4.1142 - accuracy: 0.8021 - 113ms/epoch - 3ms/step
Loss: 4.114222049713135, Accuracy: 0.8021172881126404


In [29]:
ros_predictions = ros_model.predict(X_test_scaled)
ros_predictions = np.round(ros_predictions, decimals = 0)
accuracy_score(y_test,ros_predictions)



0.8021172638436482

In [30]:
#confusion_matrix and classification report
confusion_matrix(y_test,ros_predictions)
cm = confusion_matrix(y_test, ros_predictions)
cm_df = pd.DataFrame( cm, index=['Actual No Stroke', "Actual Stroke"], columns = ['Predicted No Stroke', 'Predicted Stroke'] )
rf_acc_score = accuracy_score(y_test,ros_predictions)
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
print(classification_report(y_test, ros_predictions))

Confusion Matrix


Unnamed: 0,Predicted No Stroke,Predicted Stroke
Actual No Stroke,966,204
Actual Stroke,39,19


Accuracy Score : 0.8021172638436482
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.83      0.89      1170
           1       0.09      0.33      0.14        58

    accuracy                           0.80      1228
   macro avg       0.52      0.58      0.51      1228
weighted avg       0.92      0.80      0.85      1228

