In [30]:
import pandas as pd
from pandas.core.frame import DataFrame
import numpy as np
import seaborn as sns

filename = 'healthcare-dataset-stroke-data.csv'
cols = ['gender','age','hypertension','heart_disease','ever_married', 'work_type', 'Residence_type','avg_glucose_level','bmi', 'smoking_status','stroke']

df = pd.read_csv(filename, usecols=cols)

In [31]:
# Check datatype each column of dataframe
cat_df = df.select_dtypes(include=['object'])
num_df = df.select_dtypes(exclude=['object'])

def printColumnTypes(non_numeric_df: DataFrame, numeric_df: DataFrame):
    '''separates non-numeric and numeric columns'''
    print("Non-Numeric columns:")
    for col in non_numeric_df:
        print(f"{col}")
    print("")
    print("Numeric columns:")
    for col in numeric_df:
        print(f"{col}")

printColumnTypes(cat_df, num_df)

df.info()

Non-Numeric columns:
gender
ever_married
work_type
Residence_type
smoking_status

Numeric columns:
age
hypertension
heart_disease
avg_glucose_level
bmi
stroke
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [32]:
def printUniqueValue(cols: list):
    for col in cols:
        print(f"{col}: {df[col].unique()}")
        
non_numeric_col = ['gender','ever_married','work_type','Residence_type', 'smoking_status']

printUniqueValue(non_numeric_col)

df[df['smoking_status'] == 'Unknown']

gender: ['Male' 'Female' 'Other']
ever_married: ['Yes' 'No']
work_type: ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Residence_type: ['Urban' 'Rural']
smoking_status: ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
8,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
13,Male,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1
19,Male,57.0,0,1,No,Govt_job,Urban,217.08,,Unknown,1
23,Male,82.0,0,1,Yes,Private,Rural,208.30,32.5,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...
5098,Male,9.0,0,0,No,children,Urban,71.88,17.5,Unknown,0
5101,Female,45.0,0,0,Yes,Private,Urban,97.95,24.5,Unknown,0
5103,Female,18.0,0,0,No,Private,Urban,82.85,46.9,Unknown,0
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0


In [33]:
# Data cleansing
num_gender = { 'Female': 0, 'Male': 1 }
num_ever_married = { 'No': 0, 'Yes': 1 }
num_smoking_status = {
    'formerly smoked': 0,
    'never smoked': 1,
    'smokes': 2,
    'Unknown': 3,
}
num_work_type = {
    'children': 0,
    'Govt_job': 1,
    'Never_worked': 2,
    'Private': 3,
    'Self-employed': 4,
}
num_residence_type = {
    'Urban': 0,
    'Rural': 1
}

# Remove Other value in gender column
df = df[df['gender'] != 'Other']

# Label Encoding
df['gender'] = df['gender'].replace(num_gender)
df['ever_married'] = df['ever_married'].replace(num_ever_married)
df['Residence_type'] = df['Residence_type'].replace(num_residence_type)
df['smoking_status'] = df['smoking_status'].replace(num_smoking_status)
df['work_type'] = df['work_type'].replace(num_work_type)

mean_bmi_replacement_value = df.loc[:,'bmi'].dropna().mean()

df['bmi'] = df.loc[:, 'bmi'].fillna(mean_bmi_replacement_value)

In [34]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,3,0,228.69,36.6,0,1
1,0,61.0,0,0,1,4,1,202.21,28.89456,1,1
2,1,80.0,0,1,1,3,1,105.92,32.5,1,1
3,0,49.0,0,0,1,3,0,171.23,34.4,2,1
4,0,79.0,1,0,1,4,1,174.12,24.0,1,1


In [35]:

# sns.heatmap(data=df.corr(), annot=True)

In [36]:
print(df['stroke'].value_counts() / len(df))

0    0.951262
1    0.048738
Name: stroke, dtype: float64


In [37]:
# x = np.array(df.loc[:, df.columns != 'stroke'])
# y = np.array(df.loc[:, df.columns == 'stroke']).reshape(-1, 1)

X = df.iloc[:, :-1].values
y = df.iloc[:, 10].values

In [38]:
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=5)

In [40]:
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, KFold

# oversample = SMOTE()

# Resampling 
# X_train,y_train = oversample.fit_resample(X_train,y_train.ravel())

pipeline = Pipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['classifier', KNeighborsClassifier(weights='distance',metric='euclidean')]])

stratified_kfold = StratifiedKFold(n_splits=10,
                                       shuffle=True,
                                       random_state=11)

param_grid = {'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19]}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=stratified_kfold,
                           n_jobs=-1)


grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
grid_search_df = pd.DataFrame(grid_search.cv_results_)

print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
grid_search_df

Cross-validation score: 0.8204747820916076
Test score: 0.8095238095238095


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.034143,0.005745,0.010277,0.003986,3,{'classifier__n_neighbors': 3},0.840782,0.787709,0.824022,0.801676,0.821229,0.829609,0.826331,0.815126,0.809524,0.848739,0.820475,0.017081,1
1,0.032943,0.008886,0.009443,0.003035,5,{'classifier__n_neighbors': 5},0.821229,0.77095,0.790503,0.77095,0.793296,0.824022,0.803922,0.823529,0.803922,0.829132,0.803145,0.020457,2
2,0.031465,0.007099,0.008688,0.000838,7,{'classifier__n_neighbors': 7},0.818436,0.76257,0.787709,0.75419,0.787709,0.812849,0.792717,0.809524,0.798319,0.817927,0.794195,0.021066,3
3,0.030867,0.006579,0.011784,0.003839,9,{'classifier__n_neighbors': 9},0.807263,0.74581,0.787709,0.751397,0.782123,0.801676,0.761905,0.795518,0.787115,0.806723,0.782724,0.021267,4
4,0.031369,0.008429,0.013462,0.00549,11,{'classifier__n_neighbors': 11},0.812849,0.75419,0.77933,0.740223,0.76257,0.798883,0.759104,0.787115,0.778711,0.798319,0.777129,0.021749,5
5,0.030158,0.00618,0.012898,0.00559,13,{'classifier__n_neighbors': 13},0.793296,0.75419,0.76257,0.723464,0.756983,0.793296,0.764706,0.781513,0.770308,0.798319,0.769864,0.021652,6
6,0.030859,0.004707,0.011089,0.00208,15,{'classifier__n_neighbors': 15},0.784916,0.748603,0.75419,0.712291,0.75419,0.782123,0.756303,0.77591,0.759104,0.806723,0.763435,0.024324,7
7,0.030991,0.004372,0.012606,0.002456,17,{'classifier__n_neighbors': 17},0.782123,0.734637,0.751397,0.712291,0.751397,0.782123,0.7507,0.764706,0.742297,0.80112,0.757279,0.024668,8
8,0.026587,0.005297,0.010032,0.002375,19,{'classifier__n_neighbors': 19},0.768156,0.72905,0.74581,0.703911,0.740223,0.776536,0.745098,0.764706,0.742297,0.789916,0.75057,0.023664,9


In [41]:


model = KNeighborsClassifier(weights='distance',metric='euclidean')

# param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]}

# # Define grid search
# grid_search = GridSearchCV(model, param_grid=param_grid, cv=10)

# # Fit grid search to training data
# grid_search.fit(X_train, y_train)

# # Get best estimator and evaluate on test data
# best_knn = grid_search.best_estimator_
# y_pred = best_knn.predict(X_test)

# confusion_matrix(y_test,y_pred)

# accuracy = best_knn.score(X_test, y_test)

# print("Accuracy:", accuracy)
# print("best knn:", best_knn)

# grid_search_df = pd.DataFrame(grid_search.cv_results_)
# grid_search_df

In [42]:
# Features Importants
# from matplotlib import pyplot

# importance = model.feat
# # summarize feature importance
# for i,v in enumerate(importance):
#  print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()

In [43]:
y_pred = model.predict(X_test)
X_test[0]

NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
from sklearn import metrics

print("Accuracy:", metrics.accuracy_score(y_test,y_pred))

Accuracy: 0.821917808219178
