## Best Model for Predicting Stroke -  

XGBoost for Classification is the best model for predicting Stroke. Below is the final code for this model which would be used hereafter for putting as a web service using Flask and local deployment using Docker.

In [1]:
# loading all the basic libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import xgboost as xgb

Loading the Dataset - 

In [2]:
# importing the data from csv file into Pandas DataFrame:
data = pd.read_csv('healthcare-dataset-stroke-data.csv')
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


Data Cleaning and Formatting -  

In [3]:
## Data Cleaning and Formatting -  
# formatting column names and row values to lower case:
data.columns = data.columns.str.lower()
data['smoking_status'] = data['smoking_status'].str.lower()
data['work_type'] = data['work_type'].str.lower()

# imputing missing values in BMI column with mean BMI values:
data['bmi'] = data['bmi'].fillna(np.mean(data['bmi']))

# dropping id column from dataset:
data.drop(columns=['id'], inplace=True)

# getting the mode of gender column:
gender_mode = list(data.gender.mode().values)[0]

# replacing the 'Other' gender category row to mode of gender column:
data['gender'] = data['gender'].replace('Other', gender_mode)

In [4]:
# Separating numerical variable columns and categorical variable columns:
numerical = ['age', 'avg_glucose_level', 'bmi']

# remaining columns are categorical variable columns:
categorical = ['gender','hypertension','heart_disease', 'ever_married', 'work_type', 
                        'residence_type','smoking_status']

Splitting the Data and getting the Feature Matrix & Target variables - 

In [5]:
## Splitting the Data &  getting the Feature Matrix & Target variables - 
# splitting the dataset using sklearn into 60-20-20:
# Step 1 - splitting dataset into full train and test subsets first:
df_full_train, df_test = train_test_split(data, test_size=0.2,random_state=1)

# Step 2 - splitting full train subset again into training set and validation set:
df_train, df_val = train_test_split(df_full_train, test_size=0.25,random_state = 1)

# Resetting indices for each of the subset: 
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Getting our target variable column ('stroke') subsets as respective Numpy arrays:
y_train = df_train.stroke.values
y_val = df_val.stroke.values
y_test = df_test.stroke.values

# deleting 'stroke' column from feature matrix subsets:
del df_train['stroke']
del df_val['stroke']
del df_test['stroke']

df_train.shape, df_val.shape, df_test.shape

((3066, 10), (1022, 10), (1022, 10))

Predicting on Test data using our Final Model (XGBoost for Classification) - 

In [6]:
## Predicting on Test data using our Final Model - 
# resetting indices of full_train DataFrame:
df_full_train = df_full_train.reset_index(drop=True)

# slicing the target variable column for full_train dataset:
y_full_train = (df_full_train.stroke).astype(int).values

# turning the full train df into dictionaries:
dicts_full_train = df_full_train.to_dict(orient='records')

# instantiating the vectorizer instance:
dv = DictVectorizer(sparse=False)

# turning list of dictionaries into full train feature matrix
X_full_train = dv.fit_transform(dicts_full_train)

# turning the test df into dictionaries:
dicts_test = df_test.to_dict(orient='records')

# turning list of dictionaries into testing feature matrix
X_test = dv.transform(dicts_test)

# converting full train and test matrices into DMatrix datastructure for using in XGBoost model:
dfulltrain = xgb.DMatrix(X_full_train, label = y_full_train, feature_names = dv.get_feature_names())
dtest = xgb.DMatrix(X_test, feature_names = dv.get_feature_names())

In [7]:
xgb_params = {'eta': 0.1, 
              'max_depth': 3, 
              'min_child_weight': 20,
             'objective': 'binary:logistic',
              'eval_metric':'auc',
              
             'nthread': 8,
             'seed': 1,
             'verbosity': 1}


# training our best model XGBoost on our full train set:
model = xgb.train(xgb_params, dfulltrain, num_boost_round=200) 

In [8]:
# predicting the XGBoost model on the testing set:
y_pred = model.predict(dtest)

# computing the AUC score on testing set:
print('AUC on test set: %.3f' % roc_auc_score(y_test, y_pred))

AUC on test set: 0.852


#### Using KFold Cross-Validation on our Final Model for making Predictions - 

(making 5-fold cross-validation)

In [9]:
# Step 1 -
# Function 1 - Creating a function to train our DataFrame:
def train(df_train, y_train):
    dicts = df_train[categorical + numerical].to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
    
    # converting full train and test matrices into DMatrix datastructure for using in XGBoost model:
    dtrain = xgb.DMatrix(X_train, label = y_train, feature_names = dv.get_feature_names())
    model = xgb.train(xgb_params,dtrain, num_boost_round=200)     
    
    return dv, model

In [10]:
# Step 2 - 
# Function 2 - Creating another function to predict:
def predict(df, dv, model):
    dicts = df[categorical + numerical].to_dict(orient='records')  # converts df to list of dictionaries
    
    X = dv.transform(dicts)  # creates a feature matrix using the vectorizer
    
    X_Dmat = xgb.DMatrix(X, feature_names = dv.get_feature_names())
    y_pred = model.predict(X_Dmat)  # uses the model
    
    return y_pred

In [11]:
# specifying the number of folds to be used:
n_splits = 5

In [12]:
# Performing K-fold Cross validation and evaluating the AUC scores after each iteration:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []
        
for train_idx, val_idx in kfold.split(df_full_train):
        
    # Selecting part of dataset as 3 subsets for model:
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.stroke.values   # our target variable values as Numpy array for train and validation sets
    y_val = df_val.stroke.values

    dv, model = train(df_train, y_train)   # using train function created
    y_pred = predict(df_val, dv, model)   # using predict function created

    # compute auc scores for each iteration or fold in KFold:
    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)
        
# Computing mean of AUC scores and spread of AUC score:
print('%.3f +- %.3f' % (np.mean(scores), np.std(scores)))

0.833 +- 0.023


In [13]:
# printing the listing of AUC scores in each fold:
scores

[0.8375210066811494,
 0.8320853887714352,
 0.8705228916496522,
 0.8250456050266874,
 0.7988926240854262]

In [14]:
# Now, Training our Final Model on Full train dataset and evaluating on test dataset -
dv, model = train(df_full_train, df_full_train.stroke.values)   # using train function created
y_pred = predict(df_test, dv, model)   # using predict function created

# compute auc for ROC Curve:
auc = roc_auc_score(y_test, y_pred)
auc

0.8430873180873182

#### Saving the Model -

In [15]:
import pickle

In [16]:
# Step 1 - taking our model and writing it to a file - 
# creating a file where we'll write it:
output_file = f'model.bin'                  
output_file

'model.bin'

In [17]:
# write a Binary file using pickle - alternative to open and close codes we use with open to automatically open-close a file:
with open(output_file, 'wb') as f_out:    # file output
    pickle.dump((dv, model), f_out)

#### Loading the Model - 

In [18]:
import pickle

In [19]:
# creating a variable with our model file:
input_file = 'model.bin'

# loads our model file: 
with open(input_file, 'rb') as f_in:    # file input; rb - used to read the file
    dv, model = pickle.load(f_in)     # load() function reads from the file

In [20]:
model

<xgboost.core.Booster at 0x27e98086580>

In [21]:
# Preparing a sample person's stroke-related details (to test and predict our model on unseen person's details) - 
sample_person = {'age': 75.0,
 'avg_glucose_level': 170.01,
 'bmi': 35.5,
 'gender': 'Male',
 'hypertension': 0,
 'heart_disease': 1,
 'ever_married': 'Yes',
 'work_type': 'private',
 'residence_type': 'Rural',
 'smoking_status': 'smokes'}

In [22]:
# transforming the sample person's feature details into a dictionary using DictVectorizer:
X = dv.transform([sample_person])

In [23]:
# converting the transformed feature matrix of sample person to DMatrix for use in XGBoost model:
X_Dm = xgb.DMatrix(X, feature_names = dv.get_feature_names())

# make prediction on sample person using our model: 
y_pred = model.predict(X_Dm)  # By default, the predictions made by XGBoost are probabilities.

In [24]:
print('input:', sample_person)
print('output:', float(y_pred))  

input: {'age': 75.0, 'avg_glucose_level': 170.01, 'bmi': 35.5, 'gender': 'Male', 'hypertension': 0, 'heart_disease': 1, 'ever_married': 'Yes', 'work_type': 'private', 'residence_type': 'Rural', 'smoking_status': 'smokes'}
output: 0.3430485129356384


In [25]:
# stroke decision by specifying the threshold:
stroke = float(y_pred) >= 0.55 

Making requests

In [26]:
#import requests

In [28]:
sample_person = {'age': 75.0,
 'avg_glucose_level': 170.01,
 'bmi': 35.5,
 'gender': 'Male',
 'hypertension': 0,
 'heart_disease': 1,
 'ever_married': 'Yes',
 'work_type': 'private',
 'residence_type': 'Rural',
 'smoking_status': 'smokes'}

In [31]:
# converting the sample person's feature details into json format (dictionaries):
#requests.post(url,json =sample_person)

In [32]:
#response = requests.post(url,json=sample_person).json()
#response

In [33]:
# sending a promo if response is risk of stroke:
#if response['stroke'] == True:
 #   print('person will have stroke %s' % ('xyz-123'))