In [2]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

In [12]:
df=pd.read_excel("Brain stroke prediction dataset.xlsx")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [67]:
print('Num of rows =',df.shape[0])
print('Num of cols =',df.shape[1])
df.dtypes

Num of rows = 4981
Num of cols = 11


gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

### This dataset has 11 columns and 4981 rows 
#### Dependent feature - stroke

In [68]:
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

#### There is no null values

In [69]:
print(df['gender'].unique())
print(df['ever_married'].unique())
print(df['work_type'].unique())
print(df['Residence_type'].unique())
print(df['smoking_status'].unique())

['Male' 'Female']
['Yes' 'No']
['Private' 'Self-employed' 'Govt_job' 'children']
['Urban' 'Rural']
['formerly smoked' 'never smoked' 'smokes' 'Unknown']


##### gender, work_type, Residence_type are Nominal categorical variable (One Hot Encodding/Label Encodding/Pandas's get_dummies needed)
##### ever_married, smoking_status are Ordinal categorical variable
 
work_type, smoking_status columns are with more than two distinct values (One Hot Encodding may be applied).
Other columns have 3 unique values (Label Encodding may be applied).

## Handling ordinal categorical features

In [13]:
ever_married_mapper={'Yes':1,'No':0}
smoking_status_mapper={'never smoked':0,'formerly smoked':1,'smokes':2,'Unknown':3}
df['married']=df['ever_married'].replace(ever_married_mapper)
df['smoking_status_new']=df['smoking_status'].replace(smoking_status_mapper)
df.drop(['smoking_status','ever_married'],axis=1,inplace=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,stroke,married,smoking_status_new
0,Male,67.0,0,1,Private,Urban,228.69,36.6,1,1,1
1,Male,80.0,0,1,Private,Rural,105.92,32.5,1,1,0
2,Female,49.0,0,0,Private,Urban,171.23,34.4,1,1,2
3,Female,79.0,1,0,Self-employed,Rural,174.12,24.0,1,1,0
4,Male,81.0,0,0,Private,Urban,186.21,29.0,1,1,1


## Handling nominal categorical features

In [14]:
# Using sklearn's LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le.fit(df['Residence_type'])
df['residence_type_new'] = le.transform(df['Residence_type'])
le.fit(df['gender'])
df['gender_new'] = le.transform(df['gender'])
df.drop(['Residence_type','gender'],axis=1,inplace=True)
df.head()

Unnamed: 0,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,stroke,married,smoking_status_new,residence_type_new,gender_new
0,67.0,0,1,Private,228.69,36.6,1,1,1,1,1
1,80.0,0,1,Private,105.92,32.5,1,1,0,0,1
2,49.0,0,0,Private,171.23,34.4,1,1,2,1,0
3,79.0,1,0,Self-employed,174.12,24.0,1,1,0,0,0
4,81.0,0,0,Private,186.21,29.0,1,1,1,1,1


In [15]:
# Using pandas's get_dummies
work_types_dummies=pd.get_dummies(df['work_type'],drop_first=True)
final_df=pd.concat([df.drop('work_type',axis=1),work_types_dummies],axis=1)
final_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,married,smoking_status_new,residence_type_new,gender_new,Private,Self-employed,children
0,67.0,0,1,228.69,36.6,1,1,1,1,1,1,0,0
1,80.0,0,1,105.92,32.5,1,1,0,0,1,1,0,0
2,49.0,0,0,171.23,34.4,1,1,2,1,0,1,0,0
3,79.0,1,0,174.12,24.0,1,1,0,0,0,0,1,0
4,81.0,0,0,186.21,29.0,1,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,41.0,0,0,70.15,29.8,0,0,1,0,1,1,0,0
4977,40.0,0,0,191.15,31.1,0,1,2,1,1,1,0,0
4978,45.0,1,0,95.02,31.8,0,1,2,0,0,0,0,0
4979,40.0,0,0,83.94,30.0,0,1,2,0,1,1,0,0


### After LabelEncodding, get_dummies -
1. married column:
    - Yes = 1
    - No = 0
2. gender_new column:
    - Male = 1
    - Female = 0
3. residence_type_new column:
    - Urban = 1
    - Rural = 0
4. smoking_status_new column:
    - never smoked = 0
    - formerly smoked = 1
    - smokes = 2
    - unknown = 3

In [16]:
inputs=final_df.drop('stroke',axis=1)
target=final_df['stroke']

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

x_train,x_test,y_train,y_test=train_test_split(inputs,target,test_size=0.2)
model=DecisionTreeClassifier()
model.fit(x_train,y_train)

DecisionTreeClassifier()

In [18]:
model.score(x_test,y_test)

0.8996990972918756