In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [39]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
df=pd.read_excel("/content/drive/MyDrive/CSE 4.1/AI Lab/Project/Brain stroke prediction dataset.xlsx")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [None]:
print('Num of rows =',df.shape[0])
print('Num of cols =',df.shape[1])
df.dtypes

Num of rows = 4981
Num of cols = 11


gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

*   **This dataset has 11 columns and 4981 rows**
*   **Dependent feature - stroke**

In [None]:
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

**There is no null values**

In [None]:
print(df['gender'].unique())
print(df['ever_married'].unique())
print(df['work_type'].unique())
print(df['Residence_type'].unique())
print(df['smoking_status'].unique())

['Male' 'Female']
['Yes' 'No']
['Private' 'Self-employed' 'Govt_job' 'children']
['Urban' 'Rural']
['formerly smoked' 'never smoked' 'smokes' 'Unknown']


* gender, work_type, Residence_type are Nominal categorical variable (One Hot Encodding/Label Encodding/Pandas's get_dummies needed)
* ever_married, smoking_status are Ordinal categorical variable
* work_type, smoking_status columns are with more than two distinct values (One Hot Encodding may be applied).
Other columns have 3 unique values (Label Encodding may be applied).

## **Handling ordinal categorical features**


In [41]:
ever_married_mapper={'Yes':1,'No':0}
smoking_status_mapper={'never smoked':0,'formerly smoked':1,'smokes':2,'Unknown':3}
df['married']=df['ever_married'].replace(ever_married_mapper)
df['smoking_status_new']=df['smoking_status'].replace(smoking_status_mapper)
df.drop(['smoking_status','ever_married'],axis=1,inplace=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,stroke,married,smoking_status_new
0,Male,67.0,0,1,Private,Urban,228.69,36.6,1,1,1
1,Male,80.0,0,1,Private,Rural,105.92,32.5,1,1,0
2,Female,49.0,0,0,Private,Urban,171.23,34.4,1,1,2
3,Female,79.0,1,0,Self-employed,Rural,174.12,24.0,1,1,0
4,Male,81.0,0,0,Private,Urban,186.21,29.0,1,1,1


## **Handling nominal categorical features**

In [42]:
# Using sklearn's LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le.fit(df['Residence_type'])
df['residence_type_new'] = le.transform(df['Residence_type'])
le.fit(df['gender'])
df['gender_new'] = le.transform(df['gender'])
df.drop(['Residence_type','gender'],axis=1,inplace=True)
df.head()

Unnamed: 0,age,hypertension,heart_disease,work_type,avg_glucose_level,bmi,stroke,married,smoking_status_new,residence_type_new,gender_new
0,67.0,0,1,Private,228.69,36.6,1,1,1,1,1
1,80.0,0,1,Private,105.92,32.5,1,1,0,0,1
2,49.0,0,0,Private,171.23,34.4,1,1,2,1,0
3,79.0,1,0,Self-employed,174.12,24.0,1,1,0,0,0
4,81.0,0,0,Private,186.21,29.0,1,1,1,1,1


In [43]:
# Using pandas's get_dummies
work_types_dummies=pd.get_dummies(df['work_type'],drop_first=True)
final_df=pd.concat([df.drop('work_type',axis=1),work_types_dummies],axis=1)
final_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,married,smoking_status_new,residence_type_new,gender_new,Private,Self-employed,children
0,67.0,0,1,228.69,36.6,1,1,1,1,1,1,0,0
1,80.0,0,1,105.92,32.5,1,1,0,0,1,1,0,0
2,49.0,0,0,171.23,34.4,1,1,2,1,0,1,0,0
3,79.0,1,0,174.12,24.0,1,1,0,0,0,0,1,0
4,81.0,0,0,186.21,29.0,1,1,1,1,1,1,0,0


**After LabelEncodding, get_dummies -**
1. married column:
    - Yes = 1
    - No = 0
2. gender_new column:
    - Male = 1
    - Female = 0
3. residence_type_new column:
    - Urban = 1
    - Rural = 0
4. smoking_status_new column:
    - never smoked = 0
    - formerly smoked = 1
    - smokes = 2
    - unknown = 3

In [44]:
file_path="/content/drive/MyDrive/CSE 4.1/AI Lab/Project/Brain stroke prediction dataset(Updated).csv"
final_df.to_csv(file_path, index=False)  

### **Applying Train Test Split**

In [73]:
from sklearn.model_selection import train_test_split

final_df=pd.read_csv("/content/drive/MyDrive/CSE 4.1/AI Lab/Project/Brain stroke prediction dataset(Updated).csv")
inputs = final_df.drop('stroke',axis=1)
target = final_df['stroke']

x_train,x_test,y_train,y_test=train_test_split(inputs,target,test_size=0.3)

### **Applying Logistic Regression**

In [75]:
from sklearn.linear_model import LogisticRegression
LR_model=LogisticRegression(max_iter=10000)
LR_model.fit(x_train,y_train)
LR_model.score(x_test,y_test)

0.9418060200668896

### **Applying Decision Tree Classifier**

In [76]:
from sklearn.tree import DecisionTreeClassifier

DTC_model=DecisionTreeClassifier()
DTC_model.fit(x_train,y_train)
DTC_model.score(x_test,y_test)

0.8869565217391304

### **Applying SVM**

In [77]:
from sklearn.svm import SVC
SVM_model=SVC()
SVM_model.fit(x_train,y_train)
SVM_model.score(x_test,y_test)

0.9418060200668896

### **Applying Random Forest Classifier**

In [78]:
from sklearn.ensemble import RandomForestClassifier
RFC_model=RandomForestClassifier()
RFC_model.fit(x_train,y_train)
RFC_model.score(x_test,y_test)

0.9404682274247491

### **Applying Gaussian Naive Bayes Classifier**

In [79]:
from sklearn.naive_bayes import GaussianNB
GNB_model=GaussianNB()
GNB_model.fit(x_train,y_train)
GNB_model.score(x_test,y_test)

0.5765886287625418

### **Applying K nearest neighbor Classifier**

In [80]:
from sklearn.neighbors import KNeighborsClassifier
KNC_model= KNeighborsClassifier(n_neighbors=3)
KNC_model.fit(x_train,y_train)
KNC_model.score(x_test,y_test)

0.9237458193979933

## **Conducting Cross Validation**

In [52]:
def get_score(model,x_train,x_test,y_tran,y_test):
    model.fit(x_train,y_train)
    return model.score(x_test,y_test)

In [53]:
from sklearn.model_selection import cross_val_score
cvs_lr=cross_val_score(LR_model,inputs,target,cv=3)
cvs_dtc=cross_val_score(DTC_model,inputs,target,cv=3)
cvs_svm=cross_val_score(SVM_model,inputs,target,cv=3)
cvs_rfc=cross_val_score(RFC_model,inputs,target,cv=3)
cvs_gnb=cross_val_score(GNB_model,inputs,target,cv=3)
cvs_knc=cross_val_score(KNC_model,inputs,target,cv=3)
print('Scores of Logistic Regression:',cvs_lr)
print('Scores of Decision Tree Classifier:',cvs_dtc)
print('Scores of SVM:',cvs_svm)
print('Scores of Random Forest Classifier:',cvs_rfc)
print('Scores of Gaussian Naive Bayes:',cvs_gnb)
print('Scores of KNN:',cvs_knc)

Scores of Logistic Regression: [0.94942806 0.95       0.95120482]
Scores of Decision Tree Classifier: [0.90367249 0.91084337 0.90843373]
Scores of SVM: [0.9500301  0.95       0.95060241]
Scores of Random Forest Classifier: [0.94822396 0.94759036 0.95      ]
Scores of Gaussian Naive Bayes: [0.83564118 0.83855422 0.55903614]
Scores of KNN: [0.93919326 0.93373494 0.94036145]


In [54]:
print('Average Score of Logistic Regression:',cvs_lr.sum()/len(cvs_lr))
print('Average Score of Decision Tree Classifier:',cvs_dtc.sum()/len(cvs_dtc))
print('Average Score of SVM:',cvs_svm.sum()/len(cvs_svm))
print('Average Score of Random Forest Classifier:',cvs_rfc.sum()/len(cvs_rfc))
print('Average Score of Gaussian Naive Bayes:',cvs_gnb.sum()/len(cvs_gnb))
print('Average Score of KNN:',cvs_knc.sum()/len(cvs_knc))

Average Score of Logistic Regression: 0.9502109582218096
Average Score of Decision Tree Classifier: 0.9076498649625594
Average Score of SVM: 0.9502108373288457
Average Score of Random Forest Classifier: 0.9486047743049258
Average Score of Gaussian Naive Bayes: 0.7444105138192746
Average Score of KNN: 0.9377632142054068


## **Predicting Brain Stroke depending upon user input**

In [121]:
age=int(input('Age? '))
hypertension=int(input('Hypertension(Yes:1, No: 0)? '))
heart_disease=int(input('Heart Disease(Yes:1, No: 0)? '))
avg_glucose_level=float(input('Avg glucose level? '))
bmi=float(input('BMI? '))
married=int(input('Married(Yes:1, No: 0)? '))
smoking_status_new=int(input('Smoking Status(never smoked = 0, formerly smoked = 1, smokes = 2, unknown = 3)? '))
residence_type_new=int(input('Residence Type(Urban = 1, Rural = 0)? '))
gender_new=int(input('Gender(Male = 1, Female = 0)? '))
work_type=int(input('Work Type(Private = 1, Self-employed = 2, Govt_job=3, children = 4)? '))
Private=0
Self_employed=0
children=0
if work_type==1:
  Private=1
elif work_type==2:
  Self_employed=1
elif work_type==4:
  children=1

data = [[age,hypertension,heart_disease,avg_glucose_level,bmi,married,smoking_status_new,residence_type_new,gender_new,Private,Self_employed,children]]
input_df = pd.DataFrame(data,columns=['age','hypertension','heart_disease','avg_glucose_level','bmi','married','smoking_status_new','residence_type_new','gender_new','Private','Self-employed','children'])

predicted_value=LR_model.predict(input_df)[0]
predicted_value_prob=LR_model.predict_proba(input_df)[0][0]

if predicted_value==1:
  print(f'You have brain stroke with the probability of {round(predicted_value_prob*100,3)}%')
else:
  print(f'You don\'t have brain stroke with the probability of {round(predicted_value_prob*100,3)}%')
  

Age? 23
Hypertension(Yes:1, No: 0)? 0
Heart Disease(Yes:1, No: 0)? 0
Avg glucose level? 36
BMI? 25
Married(Yes:1, No: 0)? 0
Smoking Status(never smoked = 0, formerly smoked = 1, smokes = 2, unknown = 3)? 0
Residence Type(Urban = 1, Rural = 0)? 1
Gender(Male = 1, Female = 0)? 1
Work Type(Private = 1, Self-employed = 2, Govt_job=3, children = 4)? 1
You don't have brain stroke with the probability of 99.78%


In [124]:
x_test.iloc[123]

age                   65.00
hypertension           0.00
heart_disease          0.00
avg_glucose_level     77.46
bmi                   30.90
married                1.00
smoking_status_new     1.00
residence_type_new     1.00
gender_new             0.00
Private                0.00
Self-employed          1.00
children               0.00
Name: 2531, dtype: float64

In [125]:
y_test.iloc[123]

0