In [31]:
import pandas as pd
from pandas.core.frame import DataFrame
import numpy as np
import seaborn as sns

filename = 'healthcare-dataset-stroke-data.csv'
cols = ['gender','age','hypertension','heart_disease','ever_married', 'work_type', 'Residence_type','avg_glucose_level','bmi', 'smoking_status','stroke']

df = pd.read_csv(filename, usecols=cols)

In [32]:
# Check datatype each column of dataframe
cat_df = df.select_dtypes(include=['object'])
num_df = df.select_dtypes(exclude=['object'])

def printColumnTypes(non_numeric_df: DataFrame, numeric_df: DataFrame):
    '''separates non-numeric and numeric columns'''
    print("Non-Numeric columns:")
    for col in non_numeric_df:
        print(f"{col}")
    print("")
    print("Numeric columns:")
    for col in numeric_df:
        print(f"{col}")

printColumnTypes(cat_df, num_df)

df.info()

Non-Numeric columns:
gender
ever_married
work_type
Residence_type
smoking_status

Numeric columns:
age
hypertension
heart_disease
avg_glucose_level
bmi
stroke
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [33]:
def printUniqueValue(cols: list):
    for col in cols:
        print(f"{col}: {df[col].unique()}")
        
non_numeric_col = ['gender','ever_married','work_type','Residence_type', 'smoking_status']

printUniqueValue(non_numeric_col)

gender: ['Male' 'Female' 'Other']
ever_married: ['Yes' 'No']
work_type: ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type: ['Urban' 'Rural']
smoking_status: ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [34]:
# Data cleansing
num_gender = { 'Female': 0, 'Male': 1 }
num_ever_married = { 'No': 0, 'Yes': 1 }
num_smoking_status = {
    'formerly smoked': 0,
    'never smoked': 1,
    'smokes': 2,
    'Unknown': 3,
}
num_work_type = {
    'children': 0,
    'Govt_job': 1,
    'Never_worked': 2,
    'Private': 3,
    'Self-employed': 4,
}
num_residence_type = {
    'Urban': 0,
    'Rural': 1
}

# Remove Other value in gender column
df = df[df['gender'] != 'Other']

# Label Encoding
df['gender'] = df['gender'].replace(num_gender)
df['ever_married'] = df['ever_married'].replace(num_ever_married)
df['Residence_type'] = df['Residence_type'].replace(num_residence_type)
df['smoking_status'] = df['smoking_status'].replace(num_smoking_status)
df['work_type'] = df['work_type'].replace(num_work_type)

mean_bmi_replacement_value = df.loc[:,'bmi'].dropna().mean()

df['bmi'] = df.loc[:, 'bmi'].fillna(mean_bmi_replacement_value)

In [35]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,3,0,228.69,36.6,0,1
1,0,61.0,0,0,1,4,1,202.21,28.89456,1,1
2,1,80.0,0,1,1,3,1,105.92,32.5,1,1
3,0,49.0,0,0,1,3,0,171.23,34.4,2,1
4,0,79.0,1,0,1,4,1,174.12,24.0,1,1


In [36]:
df.corr()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
gender,1.0,-0.027752,0.021223,0.085685,-0.030171,-0.070797,0.006105,0.054722,-0.025605,0.038918,0.009081
age,-0.027752,1.0,0.276367,0.263777,0.679084,0.537862,-0.014031,0.238323,0.325861,-0.376161,0.245239
hypertension,0.021223,0.276367,1.0,0.108292,0.164187,0.128673,0.00798,0.17454,0.160151,-0.129133,0.127891
heart_disease,0.085685,0.263777,0.108292,1.0,0.114601,0.09838,-0.003045,0.161907,0.038865,-0.064753,0.134905
ever_married,-0.030171,0.679084,0.164187,0.114601,1.0,0.425788,-0.005988,0.155329,0.335564,-0.304206,0.108299
work_type,-0.070797,0.537862,0.128673,0.09838,0.425788,1.0,0.00308,0.093289,0.335569,-0.338444,0.084358
Residence_type,0.006105,-0.014031,0.00798,-0.003045,-0.005988,0.00308,1.0,0.004783,0.000288,-0.004369,-0.015415
avg_glucose_level,0.054722,0.238323,0.17454,0.161907,0.155329,0.093289,0.004783,1.0,0.168913,-0.097525,0.131991
bmi,-0.025605,0.325861,0.160151,0.038865,0.335564,0.335569,0.000288,0.168913,1.0,-0.230856,0.038917
smoking_status,0.038918,-0.376161,-0.129133,-0.064753,-0.304206,-0.338444,-0.004369,-0.097525,-0.230856,1.0,-0.066471


In [38]:
print(df['stroke'].value_counts() / len(df))

0    0.951262
1    0.048738
Name: stroke, dtype: float64


In [39]:
# x = np.array(df.loc[:, df.columns != 'stroke'])
# y = np.array(df.loc[:, df.columns == 'stroke']).reshape(-1, 1)

X = df.iloc[:, :-1].values
y = df.iloc[:, 10].values

In [40]:
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [41]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
mms.fit(X)

mms_np = mms.transform(X)

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(mms_np,y, test_size=0.3, random_state=5)

In [43]:
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, StratifiedKFold,  KFold

pipeline = Pipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['scaler', MinMaxScaler()],
                                ['classifier', KNeighborsClassifier(weights='distance',metric='euclidean')]])

stratified_kfold = KFold(n_splits=10,
                                       shuffle=True,
                                       random_state=11)

param_grid = {'classifier__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=stratified_kfold,
                           n_jobs=-1)


grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
grid_search_df = pd.DataFrame(grid_search.cv_results_)

print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
grid_search_df

Cross-validation score: 0.8845171588188349
Test score: 0.8773646444879322


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.099055,0.016517,0.092705,0.026294,1,{'classifier__n_neighbors': 1},0.891061,0.874302,0.879888,0.871508,0.877095,0.879888,0.901961,0.901961,0.879552,0.887955,0.884517,0.010293,1
1,0.095223,0.039137,0.092278,0.032442,3,{'classifier__n_neighbors': 3},0.851955,0.840782,0.851955,0.832402,0.860335,0.849162,0.845938,0.854342,0.843137,0.831933,0.846194,0.008797,2
2,0.085826,0.035188,0.086635,0.027008,5,{'classifier__n_neighbors': 5},0.829609,0.835196,0.815642,0.796089,0.832402,0.824022,0.817927,0.817927,0.823529,0.806723,0.819907,0.011319,3
3,0.085741,0.019202,0.088842,0.031487,7,{'classifier__n_neighbors': 7},0.818436,0.815642,0.810056,0.790503,0.812849,0.801676,0.795518,0.809524,0.812325,0.795518,0.806205,0.009174,4
4,0.086929,0.025063,0.089327,0.0097,9,{'classifier__n_neighbors': 9},0.804469,0.807263,0.793296,0.793296,0.812849,0.787709,0.778711,0.789916,0.80112,0.795518,0.796415,0.009625,5
5,0.073794,0.013969,0.078821,0.010969,11,{'classifier__n_neighbors': 11},0.784916,0.784916,0.782123,0.784916,0.796089,0.787709,0.773109,0.761905,0.792717,0.789916,0.783832,0.009399,6
6,0.070714,0.012623,0.09211,0.015021,13,{'classifier__n_neighbors': 13},0.773743,0.776536,0.76257,0.77933,0.793296,0.773743,0.773109,0.761905,0.778711,0.781513,0.775446,0.008619,7
7,0.078333,0.012411,0.086698,0.006629,15,{'classifier__n_neighbors': 15},0.76257,0.782123,0.759777,0.776536,0.787709,0.76257,0.767507,0.7507,0.764706,0.773109,0.768731,0.010576,8
8,0.066077,0.009545,0.094286,0.0096,17,{'classifier__n_neighbors': 17},0.75419,0.77095,0.73743,0.77095,0.787709,0.75419,0.756303,0.747899,0.761905,0.764706,0.760623,0.013291,9
9,0.070751,0.013866,0.091636,0.013102,19,{'classifier__n_neighbors': 19},0.74581,0.759777,0.734637,0.756983,0.77095,0.74581,0.7507,0.736695,0.753501,0.764706,0.751957,0.011055,10


In [44]:


model = KNeighborsClassifier(weights='distance',metric='euclidean')
