### Stroke prediction dataset

Kei Sato

March, 2021

#### Project overview
For this project, we want to use labeled data from patients to predict if the person will have a stroke.  To build the model, I am using a data set of 5,110 records, with the following attributes: gender, age, if they have hypertension, if they have heart disease, ever married, work type, residence type, average glucose level, bmi, and smoking status.  Each data point is assigned a score of 0 or 1 to indicate, respectively, whether or not that person had a stroke.
 

#### Metrics used
We will use accuracy as the main metric used to determine if the model is successful.  But, throughout the model training and cross validation, the proportion of false positives for both classes will be monitored.

In [60]:
import pandas as pd
import numpy as np

data = pd.read_csv('resources/stroke-data.csv')
print(data.shape)
data.head()

data['stroke'].value_counts()
print(np.mean(data['avg_glucose_level'].values))

data.head()

data = data.rename(columns={"Residence_type": "residence_type"})
data.head()

(5110, 12)
106.1476771037182


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [61]:
# split/prep data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

test_data = data.copy(deep=True)

# for label in ["gender", "ever_married", "work_type", "residence_type", "smoking_status"]:
#     test_data[label] = LabelEncoder().fit(test_data[label].values).transform(test_data[label])

encoders = {
    "gender": LabelEncoder().fit(test_data["gender"].values),
    "ever_married": LabelEncoder().fit(test_data["ever_married"].values),
    "work_type": LabelEncoder().fit(test_data["work_type"].values),
    "residence_type": LabelEncoder().fit(test_data["residence_type"].values),
    "smoking_status": LabelEncoder().fit(test_data["smoking_status"].values)
}

for label in ["gender", "ever_married", "work_type", "residence_type", "smoking_status"]:
    test_data[label] = encoders[label].transform(test_data[label])
    
    
# account for nan in BMI column
bmi_avg = test_data["bmi"].mean()
test_data["bmi"] = test_data["bmi"].replace(np.nan, bmi_avg)
    
x_train, x_test, y_train, y_test = train_test_split(
    test_data.drop(["id", "stroke"], axis=1),
    test_data["stroke"],
    test_size=0.1
)

x_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status
2544,1,71.0,0,1,1,2,1,215.72,39.2,3
4565,0,56.0,0,0,1,3,1,98.14,32.7,1
226,0,81.0,0,0,1,2,0,184.4,27.5,2
1570,0,24.0,0,0,0,0,1,63.4,20.3,3
2363,0,64.0,0,0,1,3,0,128.04,34.0,3


In [62]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=5, weights="distance")
neigh.fit(x_train, y_train)

neigh.score(x_test, y_test)

0.9549902152641878

In [63]:
import pickle

pickle.dump(neigh, open('serialized/model.sav', 'wb'))
pickle.dump(encoders, open('serialized/encoder.sav', 'wb'))

In [45]:
encoders["gender"].transform(["Male"])



array([1])

In [64]:
# 9046 	Male 	67.0 	0 	1 	Yes 	Private 	Urban 	228.69 	36.6 	formerly smoked 	1

params = np.array([
    encoders["gender"].transform(["Male"]),
    67,
    0,
    2,
    encoders["ever_married"].transform(["Yes"]),
    encoders["work_type"].transform(["Private"]),
    encoders["residence_type"].transform(["Urban"]),
    228,
    36.6,
    encoders["smoking_status"].transform(["formerly smoked"])
]).flatten().astype(np.float32)

print(params)

neigh.predict_proba([params])

[  1.   67.    0.    2.    1.    2.    1.  228.   36.6   1. ]


array([[0.50172806, 0.49827194]])

In [65]:
pickledmodel = pickle.load(open('serialized/model.sav', 'rb'))
#     encoder = pickle.load(open('../model/?serialized/encoder.sav', 'rb'))

params = np.array([
    encoders["gender"].transform(["Male"]),
    67,
    0,
    2,
    encoders["ever_married"].transform(["Yes"]),
    encoders["work_type"].transform(["Private"]),
    encoders["residence_type"].transform(["Urban"]),
    228,
    36.6,
    encoders["smoking_status"].transform(["formerly smoked"])
]).flatten().astype(np.float32)

result = model.predict_proba([params])

print(result)

[[1. 0.]]
