## Preprocessing

In [1]:
# Import findspark and initialize it before the pyspark imports in the next cell.
import findspark
findspark.init()

In [2]:
# Create the Spark session for this notebook (getOrCreate reuses one if it
# already exists in this kernel).
import pyspark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("DataFrame Basics")
    .getOrCreate()
)

In [3]:
from pyspark import SparkFiles

# Register the CSV with the Spark context, then read the registered copy into
# a PySpark DataFrame. No schema is supplied, so the columns come back as
# strings (see the dtype check further down).
path = "healthcare-dataset-stroke-data.csv"
spark.sparkContext.addFile(path)
df = spark.read.csv(
    SparkFiles.get(path),
    sep=",",
    header=True,
)

# Show DataFrame
df.show()

+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender|age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male| 67|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female| 61|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male| 80|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female| 49|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female| 79|           1|            0|         Yes|Self

In [4]:
# Use PySpark filtering to drop the rows whose bmi is the placeholder string 'N/A'
df = df.filter(df["bmi"] != "N/A")

df.show()

+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender|age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male| 67|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|31112|  Male| 80|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female| 49|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female| 79|           1|            0|         Yes|Self-employed|         Rural|           174.12|  24|   never smoked|     1|
|56669|  Male| 81|           0|            0|         Yes|    

In [5]:
# Original model used 'ever_married' column

#Use pyspark to drop id column
# df = df.select(['gender','age','hypertension','heart_disease','ever_married','work_type','Residence_type','avg_glucose_level','bmi','smoking_status','stroke'])

# df.show()

In [6]:
# Keep every column except id and ever_married
selected_columns = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
    'stroke',
]
df = df.select(selected_columns)

df.show()

+------+---+------------+-------------+-------------+--------------+-----------------+----+---------------+------+
|gender|age|hypertension|heart_disease|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+------+---+------------+-------------+-------------+--------------+-----------------+----+---------------+------+
|  Male| 67|           0|            1|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|  Male| 80|           0|            1|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|Female| 49|           0|            0|      Private|         Urban|           171.23|34.4|         smokes|     1|
|Female| 79|           1|            0|Self-employed|         Rural|           174.12|  24|   never smoked|     1|
|  Male| 81|           0|            0|      Private|         Urban|           186.21|  29|formerly smoked|     1|
|  Male| 74|           1|            1|      Private|         Rural|            

In [7]:
# Import our dependencies (pandas is imported once here; the duplicate
# import that used to follow has been removed).
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Convert the PySpark DataFrame to a pandas DataFrame for the machine
# learning steps that follow.
risk_factors_df = df.toPandas()
risk_factors_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67,0,1,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80,0,1,Private,Rural,105.92,32.5,never smoked,1
2,Female,49,0,0,Private,Urban,171.23,34.4,smokes,1
3,Female,79,1,0,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81,0,0,Private,Urban,186.21,29.0,formerly smoked,1


In [8]:
# Determine the number of unique values in each column of the pandas frame.
risk_factors_df.nunique()

gender                  3
age                   104
hypertension            2
heart_disease           2
work_type               5
Residence_type          2
avg_glucose_level    3852
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

In [9]:
#ever_married column included in original dummies

# # Convert categorical data to numeric with `pd.get_dummies`
# factor_dummies = pd.get_dummies(risk_factors_df[['gender','work_type', 'ever_married','Residence_type','smoking_status']])
# #concat the dummies data onto the original dataframe
# risk_factors_df = pd.concat([risk_factors_df, factor_dummies], axis=1)
# #Drop original columns that have been converted to binary classification columns
# risk_factors_df = risk_factors_df.drop(columns=['gender','work_type','ever_married','Residence_type','smoking_status'])

# risk_factors_df.head()

In [10]:
# One-hot encode the categorical columns with `pd.get_dummies`
# (ever_married is intentionally not part of this feature set).
categorical_columns = ['gender', 'work_type', 'Residence_type', 'smoking_status']
factor_dummies = pd.get_dummies(risk_factors_df[categorical_columns])

# Replace the original categorical columns with their indicator columns:
# the remaining columns come first, followed by the dummies.
risk_factors_df = pd.concat(
    [risk_factors_df.drop(columns=categorical_columns), factor_dummies],
    axis=1,
)

risk_factors_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67,0,1,228.69,36.6,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0
1,80,0,1,105.92,32.5,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0
2,49,0,0,171.23,34.4,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1
3,79,1,0,174.12,24.0,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0
4,81,0,0,186.21,29.0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0


In [11]:
# Check the column data types of the converted (one-hot encoded) DataFrame
risk_factors_df.dtypes

age                               object
hypertension                      object
heart_disease                     object
avg_glucose_level                 object
bmi                               object
stroke                            object
gender_Female                      uint8
gender_Male                        uint8
gender_Other                       uint8
work_type_Govt_job                 uint8
work_type_Never_worked             uint8
work_type_Private                  uint8
work_type_Self-employed            uint8
work_type_children                 uint8
Residence_type_Rural               uint8
Residence_type_Urban               uint8
smoking_status_Unknown             uint8
smoking_status_formerly smoked     uint8
smoking_status_never smoked        uint8
smoking_status_smokes              uint8
dtype: object

In [12]:
# Cast the columns that still have object dtype to float
numeric_columns = [
    'age',
    'hypertension',
    'heart_disease',
    'avg_glucose_level',
    'bmi',
    'stroke',
]
risk_factors_df[numeric_columns] = risk_factors_df[numeric_columns].astype('float')

In [13]:
# Verify the data types after the float conversion
risk_factors_df.dtypes

age                               float64
hypertension                      float64
heart_disease                     float64
avg_glucose_level                 float64
bmi                               float64
stroke                            float64
gender_Female                       uint8
gender_Male                         uint8
gender_Other                        uint8
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

In [14]:
# Target array: the stroke column; feature array: every other column
y = risk_factors_df["stroke"].values
X = risk_factors_df.drop(columns=["stroke"]).values

# Stratified train/test split with a fixed random_state so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [15]:
# Fit a StandardScaler on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [16]:
#Initial model structure.

# stroke_model = tf.keras.models.Sequential()
# # First hidden layer
# stroke_model.add(tf.keras.layers.Dense(units=32, activation="relu", input_dim=21))

# # Second hidden layer
# stroke_model.add(tf.keras.layers.Dense(units=32, activation="relu"))

# # Third hidden layer
# stroke_model.add(tf.keras.layers.Dense(units=32, activation="relu"))

# # Output layer - SIGMOID as the output layer, as we are classifying into a binary classification type
# stroke_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

#Model metrics: 39/39 - 0s - loss: 0.4496 - accuracy: 0.9251 - 114ms/epoch - 3ms/step
#               Loss: 0.4496151804924011, Accuracy: 0.9250814318656921

In [17]:
#Secondary model structure not including ever_married column.

# stroke_model = tf.keras.models.Sequential()
# # First hidden layer
# stroke_model.add(tf.keras.layers.Dense(units=32, activation="relu", input_dim=19))

# # Second hidden layer
# stroke_model.add(tf.keras.layers.Dense(units=32, activation="relu"))

# # Third hidden layer
# stroke_model.add(tf.keras.layers.Dense(units=32, activation="relu"))

# # Output layer - SIGMOID as the output layer, as we are classifying into a binary classification type
# stroke_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

#Model Metrics
# 39/39 - 0s - loss: 0.3882 - accuracy: 0.9259 - 113ms/epoch - 3ms/step
# Loss: 0.38824722170829773, Accuracy: 0.9258957505226135

In [18]:
# New model structure. Loss and accuracy improved with one fewer hidden layer
# and fewer neurons per layer.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Derive the input width from the scaled training data instead of hard-coding
# it, so the model stays in sync if the feature columns change.
n_features = X_train_scaled.shape[1]

stroke_model = tf.keras.models.Sequential()

# First hidden layer
stroke_model.add(tf.keras.layers.Dense(units=16, activation="relu", input_dim=n_features))

# Second hidden layer
stroke_model.add(tf.keras.layers.Dense(units=16, activation="relu"))

# Output layer - SIGMOID as the output layer, as we are classifying into a binary classification type
stroke_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
stroke_model.summary()

# Model metrics recorded from an earlier, unseeded run (values from a seeded
# run may differ slightly):
# 39/39 - 0s - loss: 0.1767 - accuracy: 0.9495 - 108ms/epoch - 3ms/step
# Loss: 0.1767115592956543, Accuracy: 0.9495114088058472

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                320       
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 609 (2.38 KB)
Trainable params: 609 (2.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric
stroke_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [20]:
#Original fit had the model tested at 100 epochs.
# # Train the model
# fit_model = stroke_model.fit(X_train_scaled, y_train, epochs=100)

In [21]:
# # Train the model
# fit_model = stroke_model.fit(X_train_scaled, y_train, epochs=250)

In [22]:
# Train the model for 100 epochs - accuracy was observed to regress beyond that
fit_model = stroke_model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [23]:
# Score the trained model on the held-out test split and report both metrics
model_loss, model_accuracy = stroke_model.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.1824 - accuracy: 0.9511 - 114ms/epoch - 3ms/step
Loss: 0.18236640095710754, Accuracy: 0.9511400461196899


In [24]:
# # Export our model to HDF5 file
# stroke_model.save("stroke_status_trained.h5")