### STROKE PREDICTION:

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv


In [2]:
stroke_data = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


#### Exploratory Data Analysis:

In [3]:
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
stroke_data.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [5]:
na_values = stroke_data.isna().sum()
na_values.sort_values()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
bmi                  201
dtype: int64

#### Data Preprocessing:

In [6]:
# remove na values from the bmi column
stroke_data = stroke_data.dropna(subset = ['bmi'])

numeric_cols = stroke_data.select_dtypes(exclude = "object").columns.tolist()
categoric_cols = stroke_data.select_dtypes(include = "object").columns.tolist()

stroke_data[categoric_cols] = stroke_data[categoric_cols].apply(LabelEncoder().fit_transform)

In [7]:
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [8]:
X = stroke_data.drop(columns =['stroke'])
y = stroke_data['stroke']

In [9]:
from sklearn.model_selection import train_test_split


# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3289, 11) (1620, 11) (3289,) (1620,)


##### Training a logistic regression model

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
seed = 42

transform = MinMaxScaler()
X_scaled = transform.fit_transform(X)

kfold = model_selection.KFold(n_splits=5, random_state=seed, shuffle=True)
model = LogisticRegression()
results = model_selection.cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')
results.mean()

0.8426564297624685

##### Training a decision tree classifier

In [11]:
from sklearn.tree import DecisionTreeClassifier
kfold = model_selection.KFold(n_splits=5, random_state=seed, shuffle=True)
model = DecisionTreeClassifier()
results = model_selection.cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')
results.mean()

0.540940952624928

#### Training a RandomForest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
kfold = model_selection.KFold(n_splits=5, random_state=seed, shuffle=True)
model = RandomForestClassifier()
results = model_selection.cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')
results.mean()

0.7880488435553914