In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import plotly.express as px

In [4]:
# Load the stroke dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer="healthcare-dataset-stroke-data.csv")

# Preview the first rows.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'healthcare-dataset-stroke-data.csv'

In [None]:
# Summarise the freshly loaded frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics for the numeric columns.
df.describe()

In [None]:
# Per-column dtypes, inspected before casting `age`.
df.dtypes

Since age is stored as a float, we will convert it to int.

In [None]:
# Make sure `age` is numeric before the integer cast in the next cell.
df['age']=pd.to_numeric(df['age'])

In [None]:
# Cast `age` to integer. NOTE(review): astype(int) truncates fractional values,
# so any ages below 1 become 0 — confirm this is acceptable.
df['age']=df['age'].astype(int)

In [None]:
# Re-check dtypes to confirm the `age` cast took effect.
df.dtypes

So the data contains null values but no erroneous values. Let's re-check the null counts and proceed accordingly.

In [None]:
# Preprocessing: count missing values per column.
# The bmi feature reported fewer non-null rows than the other columns.
df.isna().sum()

In [None]:
# The missing bmi values need filling, so first look at how bmi is distributed per gender.
sns.boxplot(data=df, x='gender', y='bmi')

In [None]:
# Next, look at bmi against age.
sns.scatterplot(x='bmi', y='age', data=df)

So neither gender alone nor age alone can determine bmi. 

Instead, we can bucket age into toddler, teen, adult and senior groups, and then impute the missing bmi values using the median bmi of each age group and gender.

In [None]:
def _age_to_group(age):
    """Map an age in whole years to a coarse life-stage label.

    Buckets: < 2 -> 'Toddler', 2..19 -> 'Teen', 20..59 -> 'Adult', otherwise 'Senior'.
    """
    if age < 2:
        return 'Toddler'
    if age <= 19:
        return 'Teen'
    if age < 60:
        return 'Adult'
    return 'Senior'


# The previous chain tested `i > 2` for the teen bucket, so age == 2 matched no
# branch and fell through to 'Senior'. The buckets are now contiguous.
age_grp = [_age_to_group(age) for age in df['age']]
df['age_group'] = age_grp
df.head()
   

In [None]:
# Interactive box plot of bmi for each age group, coloured by gender.
fig = px.box(data_frame=df, x="age_group", y="bmi", color="gender")
fig.show()

In [None]:
def impute_bmi(cols):
    """Return a BMI for one row, filling a missing value from fixed group values.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Row holding, in this order, the bmi, the age group and the gender.

    Returns
    -------
    float
        The row's own bmi when it is present. Otherwise the hard-coded value
        for its (age group, gender) pair: any gender other than 'Male' takes
        the second value of the pair, and any age group other than
        'Senior', 'Adult' or 'Teen' takes the last pair.
    """
    # Integer keys on a label-indexed Series are deprecated in pandas, so read
    # positionally through .iloc; plain sequences are indexed directly.
    row = cols.iloc if hasattr(cols, 'iloc') else cols
    bmi, age_group, gender = row[0], row[1], row[2]

    if not pd.isnull(bmi):
        return bmi

    # (value for 'Male', value for any other gender) per age group.
    fill_values = {
        'Senior': (29.0, 28.9),
        'Adult': (30.2, 28.8),
        'Teen': (20.6, 21.8),
    }
    male_value, other_value = fill_values.get(age_group, (18.9, 18.05))
    return male_value if gender == 'Male' else other_value
# Run the row-wise imputer over the three columns it reads, then preview the result.
bmi_inputs = df[['bmi', 'age_group', 'gender']]
df['bmi'] = bmi_inputs.apply(impute_bmi, axis=1)
df.head()

In [None]:
# Re-count missing values per column after the bmi imputation.
df.isna().sum()

The null values have now been filled in, so we can proceed further.

# EDA part begins

In [None]:
# (rows, columns) of the frame after imputation.
df.shape

In [None]:
# Descriptive statistics again, now that bmi has no missing values.
df.describe()

In [None]:
# Column summary again, to confirm non-null counts and dtypes after preprocessing.
df.info()

Check whether the dataset contains any duplicate rows.

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

storing categorical and continuous columns into separate list for future purpose

In [None]:
con_cols = ['age', 'avg_glucose_level', 'bmi']

# Every column that is not the identifier, the target or a continuous feature
# is treated as categorical.
excluded = {'id', 'stroke', *con_cols}
cat_cols = [col for col in df.columns if col not in excluded]
cat_cols

checking value in each categorical column

In [None]:
# Print the frequency of each distinct value for every categorical column.
for i in cat_cols:
    print(f'~~~~~~~~~~~~~~~~Values in {i}~~~~~~~~~~~~\n{df[i].value_counts()}\n-----------------------------------------')

Only one row has gender 'Other'. A single instance is too few for a model to learn from, so let's remove that row.

In [None]:
# Drop every row whose gender is 'Other'. Selecting by boolean mask instead of
# `.index[0]` keeps the cell re-runnable: with no matching rows it is a no-op
# rather than an IndexError.
df.drop(index=df.index[df['gender'] == 'Other'], inplace=True)

Next, lets see the distribution of data for stroke and not stroke 

In [None]:
# Class balance of the target column.
sns.countplot(x='stroke', data=df)
plt.show()

There is a huge imbalance in the dataset, so we need to oversample the 'stroke' class. We will do that after encoding. Next is the heatmap of various columns

In [None]:
# Correlation heatmap of the numeric columns.
# df still holds string columns at this point; recent pandas no longer drops
# them silently in corr(), so restrict to numeric dtypes explicitly.
plt.figure(figsize=(6,6))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True,linewidth=0.5,fmt='0.2f')
plt.show()

# DATA PREPROCESSING

Now, we will proceed with the encoding of categorical columns, using pd.get_dummies method.

The target labels do not need encoding, as they are already either 0 or 1.

In [None]:
# 'age_group' only served the bmi imputation, so keep it out of the encoding.
# Remove it by name rather than with a blind pop(): re-running the cell then
# leaves cat_cols unchanged instead of stripping a real feature from the list.
if 'age_group' in cat_cols:
    cat_cols.remove('age_group')

# One-hot encode the categorical columns, dropping the first level of each.
df_enc = pd.get_dummies(df[cat_cols], drop_first=True)
df_enc.head()

In [None]:
# Write every encoded column into df under the same column name.
df[df_enc.columns]=df_enc

In [None]:
# Preview df with the encoded columns attached.
df.head()

In [None]:
# Remove the columns listed in cat_cols now that their encoded versions are attached.
# NOTE(review): cat_cols was built as "everything except id, stroke and con_cols",
# so any numeric flag column in it that get_dummies passed through under its own
# name is dropped here as well — confirm that losing those features is intended.
df.drop(cat_cols,axis=1,inplace=True)
df.head()

In [None]:
# Preview df once more (same frame as the previous cell's output).
df.head()

In [None]:
# Leave out the identifier and the helper age bucket; the rest is the modelling table.
df_final = df.drop(columns=['id', 'age_group'])
df_final.head()

OVERSAMPLING OF MINORITY CLASS 

In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Seed the sampler so the resampled rows are the same on every run.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Features are every column except the target.
X = df_final.drop(['stroke'], axis=1)
y = df_final['stroke']

# NOTE(review): resampling happens before the train/test split, so copies of the
# same minority row can land in both partitions and inflate test scores.
# Consider splitting first and resampling only the training partition.
X_over, y_over = oversample.fit_resample(X, y)

SPLITTING INTO TRAIN AND TEST DATA

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the oversampled data, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_over,
    y_over,
    test_size=0.3,
    stratify=y_over,
    random_state=42,
)

SCALING OF NUMERICAL COLUMNS

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train[con_cols])
X_train[con_cols] = scaler.transform(X_train[con_cols])
X_test[con_cols] = scaler.transform(X_test[con_cols])

In [None]:
# Preview the training features after scaling the continuous columns.
X_train.head()

# MODEL TRAINING

k NEAREST NEIGHBOURS 

First, let's find the value of k that gives the best cross-validated accuracy for our model.

In [None]:
pip install --upgrade sklearn

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import cross_val_score

# Score each candidate neighbour count (1..19) by its mean 5-fold
# cross-validation score on the training split.
k_values = list(range(1, 20))
knn_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    knn_scores.append(scores.mean())

# One tick per candidate k.
x_ticks = np.arange(1, 20)
x_labels = x_ticks

plt.plot(k_values, knn_scores)
plt.xticks(ticks=x_ticks, labels=x_labels)
plt.grid()

SO, KEEPING NEIGHBORS = 2 IS THE BEST OPTION

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit the final KNN model with the chosen neighbour count.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)

# Predict once and reuse the result for both the matrix and the report.
y_pred_knn = knn.predict(X_test)

confusion_knn = confusion_matrix(y_test, y_pred_knn)
print(confusion_knn)

plt.figure(figsize = (8,8))
# fmt='d' prints the cell counts as plain integers; the default format renders
# large counts in scientific notation.
sns.heatmap(confusion_knn, annot=True, fmt='d')
plt.xlabel("Predicted")
plt.ylabel("Actual")

print(classification_report(y_test, y_pred_knn))

Model training using neural networks

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode both target vectors into two class columns.
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

In [None]:
model = Sequential()

# First hidden layer: 6 ReLU units. The input width is read from the training
# frame instead of being hard-coded, so the network keeps matching the data if
# the set of feature columns changes.
model.add(Dense(6,input_dim=X_train.shape[1],activation='relu'))

# Second hidden layer: 6 ReLU units.
model.add(Dense(6,activation='relu'))

# Output layer: 2 sigmoid units, one per one-hot encoded class.
model.add(Dense(2,activation='sigmoid'))

# Compile the model: Adam adjusts the weights, binary cross-entropy is the loss
# being minimised, and accuracy is reported during training.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled network.
model.summary()

In [None]:
# Cast the features to float32 before training: a frame that mixes float columns
# with bool dummy columns (the get_dummies default on recent pandas) converts to
# an object array, which TensorFlow cannot turn into a tensor.
# NOTE(review): batch_size=1 performs one weight update per sample and is slow;
# kept as-is to preserve the training setup.
model.fit(X_train.astype('float32'),y_train,epochs=20,shuffle=True, batch_size=1)