# Import Libraries



In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import plotly.express as px
import numpy as np

from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import plot_precision_recall_curve, precision_recall_curve
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Load Data



In [2]:
# Read the raw stroke dataset from a CSV file; the path is relative to the
# notebook's working directory.
stroke_df = pd.read_csv("data/stroke-data.csv")

In [3]:
stroke_df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
stroke_df.nunique()

id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

## Split Data

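The data is split into training and testing sets before any analysis or preprocessing, so that the testing data is never modified by, and never influences, decisions made on the training data.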


In [6]:
# Separate the target from the predictors before splitting.
target = stroke_df["stroke"]
features = stroke_df.drop(columns="stroke")

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,      # a small test proportion is used because the dataset is small
    stratify=target,    # keeps the proportion of target classes equal in both splits
    random_state=123,
)

In [7]:
# Build the training frame as a NEW DataFrame (predictors + target).
# `assign` returns a copy, so X_train itself is left untouched instead of
# silently gaining a "stroke" column through an alias.
train_df = X_train.assign(stroke=y_train)

While null values can be imputed in the training dataset, rows with null values are dropped from the test set so that the models are evaluated only on fully observed records.



In [8]:
# Build the test frame as a NEW DataFrame (predictors + target) and drop rows
# with missing values. `assign` avoids mutating X_test through an alias, and the
# explicit copy makes test_df an independent frame that later cells can assign
# columns on without chained-assignment warnings.
test_df = X_test.assign(stroke=y_test).dropna().copy()

In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4088 non-null   int64  
 1   gender             4088 non-null   object 
 2   age                4088 non-null   float64
 3   hypertension       4088 non-null   int64  
 4   heart_disease      4088 non-null   int64  
 5   ever_married       4088 non-null   object 
 6   work_type          4088 non-null   object 
 7   Residence_type     4088 non-null   object 
 8   avg_glucose_level  4088 non-null   float64
 9   bmi                3923 non-null   float64
 10  smoking_status     4088 non-null   object 
 11  stroke             4088 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 415.2+ KB


# Univariate Analysis

Individual distributions of data are analyzed to aid in decisions related to handling null values and outliers.

Two lists, one for numerical and one for categorical columns, pair each column name with a display title so that distributions can be visualized easily with a for-loop.


In [10]:
# Each entry is a [column_name, display_title] pair used by the plotting loops.
numerical_columns = [
    list(pair)
    for pair in (
        ("age", "Age"),
        ("avg_glucose_level", "Average Glucose Level"),
        ("bmi", "BMI"),
    )
]

categorical_columns = [
    list(pair)
    for pair in (
        ("gender", "Gender"),
        ("hypertension", "Hypertension"),
        ("heart_disease", "Heart Disease"),
        ("ever_married", "Ever Married"),
        ("work_type", "Work Type"),
        ("Residence_type", "Residence Type"),
        ("smoking_status", "Smoking Status"),
        ("stroke", "Stroke"),
    )
]

In [11]:
# One histogram per numerical feature, all sharing the same compact layout.
histogram_layout = {"title_x": 0.5, "width": 450, "height": 300}

for column, title in numerical_columns:
    fig = px.histogram(train_df, x=column)
    fig.update_layout(title=f"{title} Histogram", **histogram_layout)
    fig.show()

In [12]:
# One horizontal box plot per numerical feature to expose outliers.
box_layout = {"title_x": 0.5, "width": 450, "height": 300}

for column, title in numerical_columns:
    fig = px.box(train_df, x=column)
    fig.update_layout(title=f"{title} Box Plot", **box_layout)
    fig.show()

In [13]:
# One bar chart (count histogram) per categorical feature.
bar_layout = {"title_x": 0.5, "width": 450, "height": 300}

for column, title in categorical_columns:
    fig = px.histogram(train_df, x=column)
    fig.update_layout(title=f"{title} Bar Chart", **bar_layout)

    # Integer-coded columns only take the values 0 and 1, so show just those ticks.
    is_int_coded = train_df[column].dtype == 'int64'
    if is_int_coded:
        fig.update_layout(xaxis={"tickvals": [0, 1]})

    fig.show()

In [14]:
# Stand-alone bar chart of the target column (kept separate so it can be exported).
fig = px.histogram(train_df, x="stroke")
fig.update_layout(
    title="Stroke Bar Chart",
    title_x=0.5,
    width=450,
    height=300,
    xaxis={"tickvals": [0, 1]},  # the target only takes the values 0 and 1
)
#fig.write_html("stroke_bar.html")
fig.show()

## Null Value Handling

The "BMI" column was the only feature with null values. Based on the distribution of this feature, it was decided that the mean was a good measure of central tendency. Therefore, the mean was used to impute missing values for "BMI".



In [15]:
# Impute missing BMI values with the training-set mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place form
# operates on a temporary object under pandas Copy-on-Write and would leave
# train_df unchanged.
bmi_mean = train_df["bmi"].mean()
train_df["bmi"] = train_df["bmi"].fillna(bmi_mean)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4088 non-null   int64  
 1   gender             4088 non-null   object 
 2   age                4088 non-null   float64
 3   hypertension       4088 non-null   int64  
 4   heart_disease      4088 non-null   int64  
 5   ever_married       4088 non-null   object 
 6   work_type          4088 non-null   object 
 7   Residence_type     4088 non-null   object 
 8   avg_glucose_level  4088 non-null   float64
 9   bmi                4088 non-null   float64
 10  smoking_status     4088 non-null   object 
 11  stroke             4088 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 415.2+ KB


## Outlier Value Handling
While the columns "avg_glucose_level" and "bmi" had outliers, the values of these outliers were not determined to be errors in measurement. Therefore, they were not removed or changed.

# Bivariate Analysis
The relationships between the features and the target column, "stroke", are analyzed.

In [16]:
# Box plot of each numerical feature, grouped by stroke outcome.
grouped_box_layout = {"title_x": 0.5, "width": 550, "height": 350}

for column, title in numerical_columns:
    fig = px.box(train_df, x="stroke", y=column)
    fig.update_layout(title=f"Distribution of {title} vs Stroke", **grouped_box_layout)
    fig.show()

In [17]:
# Histogram of each numerical feature, coloured by stroke outcome.
grouped_hist_layout = {"title_x": 0.5, "width": 550, "height": 350}

for column, title in numerical_columns:
    fig = px.histogram(train_df, x=column, color="stroke")
    fig.update_layout(title=f"Distribution of {title} vs Stroke", **grouped_hist_layout)
    fig.show()

In [18]:
# Bar chart of each categorical feature, coloured by stroke outcome.
grouped_bar_layout = {"title_x": 0.5, "width": 600, "height": 350}

for column, title in categorical_columns:
    # The target itself is in the list; plotting it against itself is meaningless.
    if column == "stroke":
        continue

    fig = px.histogram(train_df, x=column, color="stroke")
    fig.update_layout(title=f"Distribution of {title} and Stroke", **grouped_bar_layout)

    # Integer-coded columns only take the values 0 and 1, so show just those ticks.
    if train_df[column].dtype == 'int64':
        fig.update_layout(xaxis={"tickvals": [0, 1]})

    fig.show()

In [19]:
# Contingency table: rows are gender values, columns are stroke outcomes (0 / 1).
gender_stroke_df = pd.crosstab(index=train_df["gender"], columns=train_df["stroke"])
gender_stroke_df

stroke,0,1
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,2279,115
Male,1609,84
Other,1,0


In [20]:
# Side-by-side pie charts of stroke outcome for male and female patients.
# The counts are read from the crosstab computed above rather than typed in by
# hand, so the figure stays correct if the data or the train/test split changes.
pie_labels = ["No Stroke", "Had Stroke"]
male_counts = gender_stroke_df.loc["Male", [0, 1]].tolist()
female_counts = gender_stroke_df.loc["Female", [0, 1]].tolist()

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Male", "Female"),
    specs=[[{'type':'domain'}, {'type':'domain'}]],
    horizontal_spacing=0.01
)
fig.add_trace(go.Pie(labels=pie_labels, values=male_counts), 1, 1)
fig.add_trace(go.Pie(labels=pie_labels, values=female_counts), 1, 2)
fig.update_layout(
    width=800,
    height=400,
    title="Stroke and Gender",
    title_x = 0.1
)
# Lift the two subplot titles slightly so they do not overlap the pies.
fig['layout']['annotations'][0]['y'] = 1.08
fig['layout']['annotations'][1]['y'] = 1.08

#fig.write_html("strokes_gender.html")
fig.show()

In [21]:
# Contingency table: rows are marital status, columns are stroke outcomes (0 / 1).
married_stroke_df = pd.crosstab(index=train_df["ever_married"], columns=train_df["stroke"])
married_stroke_df

stroke,0,1
ever_married,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1372,23
Yes,2517,176


In [22]:
# Side-by-side pie charts of stroke outcome for never-married and married patients.
# The counts are read from the crosstab computed above rather than typed in by
# hand, so the figure stays correct if the data or the train/test split changes.
pie_labels = ["No Stroke", "Had Stroke"]
not_married_counts = married_stroke_df.loc["No", [0, 1]].tolist()
married_counts = married_stroke_df.loc["Yes", [0, 1]].tolist()

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Not Married", "Married"),
    specs=[[{'type':'domain'}, {'type':'domain'}]],
    horizontal_spacing=0.01
)
fig.add_trace(go.Pie(labels=pie_labels, values=not_married_counts), 1, 1)
fig.add_trace(go.Pie(labels=pie_labels, values=married_counts), 1, 2)
fig.update_layout(
    width=800,
    height=400,
    title="Stroke and Marriage",
    title_x = 0.1
)
# Lift the two subplot titles slightly so they do not overlap the pies.
fig['layout']['annotations'][0]['y'] = 1.08
fig['layout']['annotations'][1]['y'] = 1.08

#fig.write_html("strokes_married.html")
fig.show()

In [23]:
# Age vs. average glucose level, coloured by stroke outcome
# (orange = 0, blue = 1 on a two-stop colour scale).
stroke_colors = [(0, "orange"), (1, "blue")]

fig = px.scatter(
    train_df,
    x="age",
    y="avg_glucose_level",
    color="stroke",
    color_continuous_scale=stroke_colors,
)
fig.update_layout(
    width=500,
    height=400,
    title="Average Glucose and Age vs Stroke",
    title_x=0.15,
)
# The colour bar adds nothing for a two-valued target, so hide it.
fig.update_coloraxes(showscale=False)

#fig.write_html("glucose_age_stroke.html")
fig.show()

In [24]:
# Proportion of stroke cases per whole year of age.
# Fractional ages are truncated to whole years (e.g. 1.8 -> 1).
age_stroke_df = train_df[["age", "stroke"]].sort_values(by="age")
age_years = age_stroke_df["age"].astype(int)

# Size the age bins from the data instead of hard-coding 83, so an age above 82
# can never index out of range. (The removed `np.int` alias is also no longer used.)
ages = np.arange(int(age_years.max()) + 1, dtype=int)

# Stroke count ("sum") and sample count ("count") per age; ages with no samples
# are filled with 0 so every age from 0 to the maximum has a row.
per_age = (
    age_stroke_df["stroke"]
    .groupby(age_years)
    .agg(["sum", "count"])
    .reindex(ages, fill_value=0)
)
stroke_per_age = per_age["sum"].to_numpy()
total_per_age = per_age["count"].to_numpy()

# Ages without any sample give 0 / 0 = NaN, i.e. "no estimate", not zero risk.
proportion_per_age = (per_age["sum"] / per_age["count"]).to_numpy()

age_prop_df = pd.DataFrame(
    {"ages": ages.astype(float), "proportion_stroke": proportion_per_age}
)
age_prop_df.head(5)

Unnamed: 0,ages,proportion_stroke
0,0.0,0.0
1,1.0,0.016667
2,2.0,0.0
3,3.0,0.0
4,4.0,0.0


In [25]:
# Stroke proportion per age with a LOWESS trendline to show the overall trend.
fig = px.scatter(
    age_prop_df,
    x="ages",
    y="proportion_stroke",
    trendline="lowess",
)
fig.update_layout(
    width=500,
    height=400,
    title="Proportion of Stroke vs Age",
    title_x=0.2,
)
#fig.write_html("stroke_age_proportion.html")
fig.show()

## Categorical Variable Encoding
Categorical variables must be encoded numerically because most models can only make computations with numerical data.

In [26]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4088 non-null   int64  
 1   gender             4088 non-null   object 
 2   age                4088 non-null   float64
 3   hypertension       4088 non-null   int64  
 4   heart_disease      4088 non-null   int64  
 5   ever_married       4088 non-null   object 
 6   work_type          4088 non-null   object 
 7   Residence_type     4088 non-null   object 
 8   avg_glucose_level  4088 non-null   float64
 9   bmi                4088 non-null   float64
 10  smoking_status     4088 non-null   object 
 11  stroke             4088 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 415.2+ KB


In [27]:
train_df.nunique()

id                   4088
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3332
bmi                   400
smoking_status          4
stroke                  2
dtype: int64

In [28]:
train_df["ever_married"].value_counts()

Yes    2693
No     1395
Name: ever_married, dtype: int64

In [29]:
train_df["Residence_type"].value_counts()

Urban    2076
Rural    2012
Name: Residence_type, dtype: int64

In [30]:
train_df["gender"].value_counts()

Female    2394
Male      1693
Other        1
Name: gender, dtype: int64

### Label Encoding
Binary classes are label encoded with 0's and 1's.

In [31]:
# Explicit 0/1 codes for the binary categorical columns.
binary_codes = {
    "ever_married": {"Yes": 1, "No": 0},
    "Residence_type": {"Urban": 1, "Rural": 0},
    "gender": {"Female": 1, "Male": 0},
}

# "Other" has no binary code, so such rows are removed. The same filter is applied
# to BOTH splits so the test set cannot keep an un-encodable value, and `.copy()`
# makes each result an independent frame that is safe to assign columns on.
train_df = train_df[train_df["gender"] != "Other"].copy()
test_df = test_df[test_df["gender"] != "Other"].copy()

for column, codes in binary_codes.items():
    # Cast explicitly to int64 rather than relying on `replace` to downcast the
    # object column: if the column stayed `object`, the later `pd.get_dummies`
    # call would one-hot encode it. The cast also fails loudly on any value that
    # is missing from the code table.
    train_df[column] = train_df[column].replace(codes).astype("int64")
    test_df[column] = test_df[column].replace(codes).astype("int64")

### One-Hot Encoding
Nominal categorical data is encoded with one-hot encoding. Also, the "id" column was not found to be useful, so it was removed.

In [32]:
# Drop the non-informative "id" column (errors="ignore" keeps the cell re-runnable)
# and one-hot encode the remaining nominal columns.
train_df = pd.get_dummies(train_df.drop(columns="id", errors="ignore"))
test_df = pd.get_dummies(test_df.drop(columns="id", errors="ignore"))

# Encoding the two splits independently can give different columns when a
# category is absent from one split. Align the test frame to the training
# columns (same set, same order); categories unseen in the test split become
# all-zero indicator columns, and categories unseen in training are dropped.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [33]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4087 entries, 795 to 2819
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          4087 non-null   int64  
 1   age                             4087 non-null   float64
 2   hypertension                    4087 non-null   int64  
 3   heart_disease                   4087 non-null   int64  
 4   ever_married                    4087 non-null   int64  
 5   Residence_type                  4087 non-null   int64  
 6   avg_glucose_level               4087 non-null   float64
 7   bmi                             4087 non-null   float64
 8   stroke                          4087 non-null   int64  
 9   work_type_Govt_job              4087 non-null   uint8  
 10  work_type_Never_worked          4087 non-null   uint8  
 11  work_type_Private               4087 non-null   uint8  
 12  work_type_Self-employed         

## Feature Selection
Features are analyzed for redundancy to prevent bias when training the model. According to our correlation heatmap, no two features were redundant. Therefore, no features were removed.

In [34]:
# Heatmap of the pairwise correlations between all encoded columns.
# The colour range is fixed to [-1, 1] so that white is exactly zero correlation.
fig = px.imshow(
    train_df.corr(),
    text_auto=False,
    range_color=(-1,1),
    color_continuous_scale="RdBu"
)
fig.update_layout(
    title="Stroke Data Correlation Heatmap",
    title_x = 0.2,
    width=1000,
    height=800
)
fig['layout']['title']['font'] = {"size":20}
# (A stray `fig.update` expression, which only referenced the method without
# calling it, was removed here.)
fig.show()

In [35]:
# Keep only the features whose absolute correlation with "stroke" exceeds 0.08,
# on both the row and the column axis of the correlation matrix.
corr_matrix = train_df.corr()
is_correlated = corr_matrix["stroke"].abs() > 0.08

indices = corr_matrix.index[is_correlated]
drop = [name for name in corr_matrix.columns if name not in indices]

corr_matrix_res = corr_matrix.loc[is_correlated].drop(columns=drop)

In [36]:
# Annotated heatmap of the filtered correlation matrix.
fig = px.imshow(
    corr_matrix_res,
    text_auto=True,
    range_color=(-1, 1),
    color_continuous_scale="RdBu",
)
fig.update_layout(
    title="Stroke Data Correlation Heatmap",
    title_x=0.2,
    title_font={"size": 20},
    width=700,
    height=600,
)
#fig.write_html("heatmap.html")
fig.show()

The data is split into the input features (`X_`) and the output feature (`y_`).

In [37]:
# Predictors: every column except the target.
X_train, X_test = (frame.drop(columns="stroke") for frame in (train_df, test_df))

# Target: the "stroke" column of each split.
y_train, y_test = train_df["stroke"], test_df["stroke"]

### Feature Scaling
The data is scaled to prevent models from being biased to learning features with high-magnitude values.

In [38]:
# Standardise the features. The scaler is fitted on the training data only and
# then applied to the test data, so no test-set statistics leak into training.
scaler = StandardScaler()

# The scaler returns bare NumPy arrays. Re-attach the column names AND the
# original row index, so X_train / X_test stay aligned with y_train / y_test
# (rebuilding the frames without `index=` would reset them to 0..n-1).
X_train = pd.DataFrame(
    data=scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    data=scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

### Data Balancing
The training data target is highly imbalanced, so a synthetic over-sampling technique, SMOTE, was used to over-sample the minority class until the classes were balanced.\
Balancing the minority class resulted in increased recall for the positive target, "had a stroke".

In [39]:
# Over-sample the minority class of the TRAINING data only; the test set is left
# untouched. `to_numpy()` replaces the deprecated `Series.ravel()` and yields the
# same 1-D array of labels.
balance = SMOTE(random_state = 123)
X_train_res, y_train_res = balance.fit_resample(X_train, y_train.to_numpy())

# Modeling and Evaluation
By storing multiple models in a dictionary, a for-loop was used to quickly check for the best models to test.

In [40]:
# Baseline comparison of several classifiers with default hyper-parameters.
# Every estimator with a stochastic training procedure gets a fixed seed, so the
# reports below are reproducible across "Restart & Run All".
SEED = 123  # same seed as the train/test split and SMOTE

models = {
    "Logistic Regression": LogisticRegression(),
    "SGD Classifier": SGDClassifier(random_state=SEED),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=SEED),
    "Random Forest Classifier": RandomForestClassifier(random_state=SEED),
    "K Neighbors": KNeighborsClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=SEED),
    "Multi-Layer Perceptron": MLPClassifier(random_state=SEED),
    "SVC": SVC(),
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=SEED),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=SEED)
}

# Train each model on the balanced training data and report on the untouched test set.
for key, model in models.items():
    print(key)
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print()

Logistic Regression
              precision    recall  f1-score   support

           0       0.99      0.74      0.85       943
           1       0.12      0.77      0.21        43

    accuracy                           0.74       986
   macro avg       0.55      0.75      0.53       986
weighted avg       0.95      0.74      0.82       986


SGD Classifier
              precision    recall  f1-score   support

           0       0.99      0.74      0.84       943
           1       0.12      0.77      0.20        43

    accuracy                           0.74       986
   macro avg       0.55      0.75      0.52       986
weighted avg       0.95      0.74      0.82       986


Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.97      0.90      0.93       943
           1       0.12      0.33      0.18        43

    accuracy                           0.87       986
   macro avg       0.55      0.61      0.55       986
weighted avg

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       943
           1       0.08      0.09      0.08        43

    accuracy                           0.91       986
   macro avg       0.52      0.52      0.52       986
weighted avg       0.92      0.91      0.92       986


K Neighbors
              precision    recall  f1-score   support

           0       0.97      0.84      0.90       943
           1       0.09      0.35      0.14        43

    accuracy                           0.82       986
   macro avg       0.53      0.60      0.52       986
weighted avg       0.93      0.82      0.87       986


Gradient Boosting Classifier


              precision    recall  f1-score   support

           0       0.97      0.88      0.92       943
           1       0.12      0.37      0.18        43

    accuracy                           0.86       986
   macro avg       0.55      0.63      0.55       986
weighted avg       0.93      0.86      0.89       986


Multi-Layer Perceptron


              precision    recall  f1-score   support

           0       0.96      0.90      0.93       943
           1       0.10      0.26      0.15        43

    accuracy                           0.87       986
   macro avg       0.53      0.58      0.54       986
weighted avg       0.93      0.87      0.90       986


SVC


              precision    recall  f1-score   support

           0       0.98      0.82      0.89       943
           1       0.13      0.58      0.21        43

    accuracy                           0.81       986
   macro avg       0.55      0.70      0.55       986
weighted avg       0.94      0.81      0.86       986


Extra Trees Classifier


              precision    recall  f1-score   support

           0       0.96      0.94      0.95       943
           1       0.10      0.14      0.12        43

    accuracy                           0.91       986
   macro avg       0.53      0.54      0.53       986
weighted avg       0.92      0.91      0.92       986


Ada Boost Classifier


              precision    recall  f1-score   support

           0       0.97      0.82      0.89       943
           1       0.11      0.47      0.17        43

    accuracy                           0.81       986
   macro avg       0.54      0.64      0.53       986
weighted avg       0.93      0.81      0.86       986




## Logistic Regression

In [41]:
# Hyper-parameter grid search for logistic regression, kept for reference only.
# The search code below sits inside a triple-quoted string literal, so it is NOT
# executed; the cell merely evaluates to (and displays) that string.
# The comment lines after the string record the output of an earlier run; the
# next cell re-creates the model directly from those recorded best parameters.
# NOTE(review): before re-enabling, check the 'penalty' values against the
# installed scikit-learn version (the string 'none' may no longer be accepted),
# and note that not every solver/penalty combination in the grid is valid.
'''
model_lrr = LogisticRegression()
grid_params_lrr = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['none', 'elasticnet', 'l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
grid_lrr = GridSearchCV(model_lrr, grid_params_lrr, verbose=1, cv =5, scoring="recall")
grid_lrr.fit(X_train_res, y_train_res)
#'''
#Fitting 5 folds for each of 120 candidates, totalling 600 fits
#grid_lrr.best_params_
#{'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}

'\nmodel_lrr = LogisticRegression()\ngrid_params_lrr = {\n    \'solver\': [\'newton-cg\', \'lbfgs\', \'liblinear\', \'sag\', \'saga\'],\n    \'penalty\': [\'none\', \'elasticnet\', \'l1\', \'l2\'],\n    \'C\': [0.001, 0.01, 0.1, 1, 10, 100],\n}\ngrid_lrr = GridSearchCV(model_lrr, grid_params_lrr, verbose=1, cv =5, scoring="recall")\ngrid_lrr.fit(X_train_res, y_train_res)\n#'

In [42]:
# Logistic regression rebuilt from the best parameters recorded by the grid
# search above. The liblinear solver shuffles the data internally, so a fixed
# `random_state` is set to make the fitted model reproducible.
grid_lrr = LogisticRegression(
    C=0.001,
    penalty='l1',
    solver='liblinear',
    random_state=123,
)
grid_lrr.fit(X_train_res, y_train_res)

In [43]:
# Predict stroke labels for the held-out test features with the fitted logistic regression.
y_pred_lrr = grid_lrr.predict(X_test)

In [44]:
# Per-class precision / recall / F1 of the logistic regression on the test set.
print(classification_report(y_test, y_pred_lrr))

              precision    recall  f1-score   support

           0       1.00      0.52      0.68       943
           1       0.08      0.98      0.16        43

    accuracy                           0.54       986
   macro avg       0.54      0.75      0.42       986
weighted avg       0.96      0.54      0.66       986



In [45]:
# Confusion matrix of the logistic regression predictions, drawn as an
# annotated heatmap (rows = actual class, columns = predicted class).
class_names = ['No Stroke', 'Stroke']
cf_matrix_lrr = confusion_matrix(y_test, y_pred_lrr)

fig = px.imshow(
    cf_matrix_lrr,
    text_auto=True,
    labels={"x": "Predicted Values", "y": "Actual Values"},
    x=class_names,
    y=class_names,
    color_continuous_scale='mint',
)

lr_layout = {
    "title": "LR Confusion Matrix",
    "title_x": 0.5,
    "font_size": 18,
    "coloraxis_colorbar_x": 1.1,
    "width": 500,
    "height": 400,
}
fig.update_layout(**lr_layout)

#fig.write_html("cf_matrix_lr.html")
fig.show()

## Decision Tree Classifier

In [46]:
# Hyper-parameter grid search for the decision tree, kept for reference only.
# The search code below sits inside a triple-quoted string literal, so it is NOT
# executed; the cell merely evaluates to (and displays) that string.
# The comment lines after the string record the output of an earlier run; the
# next cell re-creates the model directly from those recorded best parameters.
'''
model_dtcr = DecisionTreeClassifier()
grid_params_dtcr = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter' : ['best', 'random'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2 , 4, 8, 16],
}
grid_dtcr = GridSearchCV(model_dtcr, grid_params_dtcr, cv=5, verbose=1, scoring="recall")
grid_dtcr.fit(X_train_res, y_train_res)
#'''
#Fitting 5 folds for each of 96 candidates, totalling 480 fits
#grid_dtcr.best_params_
#{'criterion': 'gini',
# 'max_depth': 10,
# 'min_samples_split': 2,
# 'splitter': 'best'}

'\nmodel_dtcr = DecisionTreeClassifier()\ngrid_params_dtcr = {\n    \'criterion\': [\'gini\', \'entropy\', \'log_loss\'],\n    \'splitter\' : [\'best\', \'random\'],\n    \'max_depth\': [5, 10, 15, 20],\n    \'min_samples_split\': [2 , 4, 8, 16],\n}\ngrid_dtcr = GridSearchCV(model_dtcr, grid_params_dtcr, cv=5, verbose=1, scoring="recall")\ngrid_dtcr.fit(X_train_res, y_train_res)\n#'

In [47]:
# Decision tree rebuilt from the best parameters recorded by the grid search
# above. A fixed `random_state` makes the fitted tree reproducible: even with
# splitter='best', features are permuted randomly at each split, which can change
# the tree when several splits are equally good.
grid_dtcr = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_split=2,
    splitter='best',
    random_state=123,
)
grid_dtcr.fit(X_train_res, y_train_res)

In [48]:
# Predict stroke labels for the held-out test features with the fitted decision tree.
y_pred_dtcr = grid_dtcr.predict(X_test)

In [49]:
# Per-class precision / recall / F1 of the decision tree on the test set.
print(classification_report(y_test, y_pred_dtcr))

              precision    recall  f1-score   support

           0       0.97      0.81      0.89       943
           1       0.11      0.49      0.17        43

    accuracy                           0.80       986
   macro avg       0.54      0.65      0.53       986
weighted avg       0.93      0.80      0.85       986



In [50]:
# Confusion matrix of the decision tree predictions, drawn as an annotated
# heatmap (rows = actual class, columns = predicted class).
class_names = ['No Stroke', 'Stroke']
cf_matrix_dtcr = confusion_matrix(y_test, y_pred_dtcr)

fig = px.imshow(
    cf_matrix_dtcr,
    text_auto=True,
    labels={"x": "Predicted Values", "y": "Actual Values"},
    x=class_names,
    y=class_names,
    color_continuous_scale='mint',
)

dtc_layout = {
    "title": "DTC Confusion Matrix",
    "title_x": 0.5,
    "font_size": 18,
    "coloraxis_colorbar_x": 1.1,
    "width": 500,
    "height": 400,
}
fig.update_layout(**dtc_layout)
#fig.write_html("cf_matrix_dtc.html")
fig.show()

## Multi-Layer Perceptron

In [51]:
# Hyper-parameter grid search for the multi-layer perceptron, kept for reference only.
# The search code below sits inside a triple-quoted string literal, so it is NOT
# executed; the cell merely evaluates to (and displays) that string.
# The comment lines after the string record the output of an earlier run; the
# next cell re-creates the model directly from those recorded best parameters.
'''
model_mlpr = MLPClassifier()
grid_params_mlpr = {
    'activation':['logistic', 'tanh', 'relu'],
    'learning_rate' :['constant', 'invscaling', 'adaptive'],
}
grid_mlpr = GridSearchCV(model_mlpr, grid_params_mlpr, cv=5, verbose=1, scoring="recall")
grid_mlpr.fit(X_train_res, y_train_res)
#'''
#Fitting 5 folds for each of 9 candidates, totalling 45 fits
#grid_mlpr.best_params_
#{'activation': 'relu', 'learning_rate': 'adaptive'}

'\nmodel_mlpr = MLPClassifier()\ngrid_params_mlpr = {\n    \'activation\':[\'logistic\', \'tanh\', \'relu\'],\n    \'learning_rate\' :[\'constant\', \'invscaling\', \'adaptive\'],\n}\ngrid_mlpr = GridSearchCV(model_mlpr, grid_params_mlpr, cv=5, verbose=1, scoring="recall")\ngrid_mlpr.fit(X_train_res, y_train_res)\n#'

In [52]:
# Multi-layer perceptron rebuilt from the best parameters recorded by the grid
# search above. Weight initialisation and batch shuffling are random, so a fixed
# `random_state` is set to make the fitted network reproducible.
grid_mlpr = MLPClassifier(
    activation='relu',
    learning_rate='adaptive',
    random_state=123,
)
grid_mlpr.fit(X_train_res, y_train_res)

In [53]:
# Predict stroke labels for the held-out test features with the fitted multi-layer perceptron.
y_pred_mlpr = grid_mlpr.predict(X_test)

In [54]:
# Per-class precision / recall / F1 of the multi-layer perceptron on the test set.
print(classification_report(y_test, y_pred_mlpr))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93       943
           1       0.11      0.28      0.16        43

    accuracy                           0.87       986
   macro avg       0.54      0.59      0.54       986
weighted avg       0.93      0.87      0.90       986



In [55]:
# Confusion matrix of the multi-layer perceptron predictions, drawn as an
# annotated heatmap (rows = actual class, columns = predicted class).
class_names = ['No Stroke', 'Stroke']
cf_matrix_mlpr = confusion_matrix(y_test, y_pred_mlpr)

fig = px.imshow(
    cf_matrix_mlpr,
    text_auto=True,
    labels={"x": "Predicted Values", "y": "Actual Values"},
    x=class_names,
    y=class_names,
    color_continuous_scale='mint',
)

mlp_layout = {
    "title": "MLP Confusion Matrix",
    "title_x": 0.5,
    "font_size": 18,
    "coloraxis_colorbar_x": 1.1,
    "width": 500,
    "height": 400,
}
fig.update_layout(**mlp_layout)
#fig.write_html("cf_matrix_mlp.html")
fig.show()

# Conclusion

From exploratory data analysis, it was observed that the features with the strongest association with having a stroke were age, average glucose level, and being married. The proportion of individuals who have a stroke increases greatly after the age of 40. Individuals with higher average glucose levels and individuals who are married both show an increased risk of having a stroke.\
Regarding model predictions, it was difficult to improve the accuracy past the null accuracy for this dataset. This dataset was relatively small (about 5000 samples) with a highly imbalanced target distribution (about 5% minority class). Since models perform only as well as the quality of the data allows, the main improvement for future experiments on stroke prediction would be to increase the quantity and diversity of the data. Specifically, a larger dataset overall, and in particular more samples with positive stroke labels to balance the target, would be helpful.\
The applicability of our models depends on the use case for stroke diagnosis. If one wants a model with extremely high accuracy, a null predictor (predicting that no one has a stroke) will be correct about 96% of the time. However, if one wants to capture the population that will have a stroke and does not mind false positives, the logistic regression model performed best, with a recall of 0.98 for the "had a stroke" class.

