In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

### Data Preprocessing

In [2]:
df = pd.read_csv("brain_stroke.csv")

In [3]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [4]:
gender_dummies = df['gender'].map({'Male': 1, 'Female': 0})
ever_married_dummies = df['ever_married'].map({'Yes': 1, 'No': 0})
work_type_dummies = pd.get_dummies(df['work_type'])
Residence_type_dummies = pd.get_dummies(df['Residence_type'])
smoking_status_dummies = pd.get_dummies(df['smoking_status'])

df = pd.concat(
    [
        df.drop(
            columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
        ),
        gender_dummies, ever_married_dummies, work_type_dummies, 
        Residence_type_dummies, smoking_status_dummies
    ]
    , 
    axis =1
)


In [5]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender,ever_married,Govt_job,Private,Self-employed,children,Rural,Urban,Unknown,formerly smoked,never smoked,smokes
0,67.0,0,1,228.69,36.6,1,1,1,False,True,False,False,False,True,False,True,False,False
1,80.0,0,1,105.92,32.5,1,1,1,False,True,False,False,True,False,False,False,True,False
2,49.0,0,0,171.23,34.4,1,0,1,False,True,False,False,False,True,False,False,False,True
3,79.0,1,0,174.12,24.0,1,0,1,False,False,True,False,True,False,False,False,True,False
4,81.0,0,0,186.21,29.0,1,1,1,False,True,False,False,False,True,False,True,False,False


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                4981 non-null   float64
 1   hypertension       4981 non-null   int64  
 2   heart_disease      4981 non-null   int64  
 3   avg_glucose_level  4981 non-null   float64
 4   bmi                4981 non-null   float64
 5   stroke             4981 non-null   int64  
 6   gender             4981 non-null   int64  
 7   ever_married       4981 non-null   int64  
 8   Govt_job           4981 non-null   bool   
 9   Private            4981 non-null   bool   
 10  Self-employed      4981 non-null   bool   
 11  children           4981 non-null   bool   
 12  Rural              4981 non-null   bool   
 13  Urban              4981 non-null   bool   
 14  Unknown            4981 non-null   bool   
 15  formerly smoked    4981 non-null   bool   
 16  never smoked       4981 

In [7]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender,ever_married
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789,0.416382,0.658502
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531,0.493008,0.47426
min,0.08,0.0,0.0,55.12,14.0,0.0,0.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0,0.0,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0,0.0,1.0
75%,61.0,0.0,0.0,113.86,32.6,0.0,1.0,1.0
max,82.0,1.0,1.0,271.74,48.9,1.0,1.0,1.0


In [8]:
df.isnull().sum()

age                  0
hypertension         0
heart_disease        0
avg_glucose_level    0
bmi                  0
stroke               0
gender               0
ever_married         0
Govt_job             0
Private              0
Self-employed        0
children             0
Rural                0
Urban                0
Unknown              0
formerly smoked      0
never smoked         0
smokes               0
dtype: int64

### Train Test Splitting

In [9]:
x = df.drop('stroke',axis=1)
y = df['stroke']

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

### Model Training

In [11]:
model = DecisionTreeClassifier()
model.fit(x_train,y_train)

In [12]:
y_pred = model.predict(x_test)

In [13]:
y_pred

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,

### Evaluating the model

In [14]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       943
           1       0.26      0.26      0.26        54

    accuracy                           0.92       997
   macro avg       0.61      0.61      0.61       997
weighted avg       0.92      0.92      0.92       997

