In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the cleaned dataset. Forward slashes are used because the previous
# backslash literal contained unrecognised escape sequences ("\.", "\d", "\c")
# and only resolved on Windows; this form works on Windows, macOS and Linux.
strokeData = pd.read_csv("../../data/cleaned/cleanedDataset.csv")
# Feature matrix: every column except the target.
X = strokeData.drop(columns='stroke')
# Target vector: the 'stroke' column.
y = strokeData['stroke']

In [3]:
def knn_ROS70(n_neighbors=60, random_state=42):
    """Evaluate k-NN on a 70/30 split with random oversampling of the training set.

    Uses the module-level ``X`` (features) and ``y`` (target) defined in the
    data-loading cell. Only the training portion is oversampled; the test
    portion keeps its original class distribution so the reported metrics
    reflect performance on unmodified data.

    Parameters
    ----------
    n_neighbors : int, default 60
        Number of neighbours used by ``KNeighborsClassifier``.
    random_state : int, default 42
        Seed shared by the train/test split and the oversampler so that
        repeated runs produce the same resampled set and the same metrics.

    Returns
    -------
    None
        Class counts, confusion matrix, classification report and recall
        are printed.
    """
    # splitting 70-30 (stratified so both parts keep the class ratio of y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y
    )

    # oversampling minority class; seeded, otherwise the duplicated rows
    # (and therefore the fitted model) change on every execution
    oversample = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
    X_oversample, y_oversample = oversample.fit_resample(X_train, y_train)
    # checking oversampling counts
    print(y_oversample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # fit kNN on the oversampled training set
    knn.fit(X_oversample, y_oversample)

    # predict labels of the (non-oversampled) test set
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Oversampling Recall Rate: {recall*100:.2f}%')

In [4]:
# Run the 70/30 split experiment; the function prints its results and returns nothing.
knn_ROS70()

1    3403
0    3403
Name: stroke, dtype: int64
0    1458
1      75
Name: stroke, dtype: int64
[[990 468]
 [ 18  57]]
              precision    recall  f1-score   support

    noStroke       0.98      0.68      0.80      1458
      stroke       0.11      0.76      0.19        75

    accuracy                           0.68      1533
   macro avg       0.55      0.72      0.50      1533
weighted avg       0.94      0.68      0.77      1533

0.76
K-Nearest Neighbours Oversampling Recall Rate: 76.00%


In [5]:
def knn_ROS80(n_neighbors=60, random_state=42):
    """Evaluate k-NN on an 80/20 split with random oversampling of the training set.

    Uses the module-level ``X`` (features) and ``y`` (target) defined in the
    data-loading cell. Only the training portion is oversampled; the test
    portion keeps its original class distribution so the reported metrics
    reflect performance on unmodified data.

    Parameters
    ----------
    n_neighbors : int, default 60
        Number of neighbours used by ``KNeighborsClassifier``.
    random_state : int, default 42
        Seed shared by the train/test split and the oversampler so that
        repeated runs produce the same resampled set and the same metrics.

    Returns
    -------
    None
        Class counts, confusion matrix, classification report and recall
        are printed.
    """
    # splitting 80-20 (stratified so both parts keep the class ratio of y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )

    # oversampling minority class; seeded, otherwise the duplicated rows
    # (and therefore the fitted model) change on every execution
    oversample = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
    X_oversample, y_oversample = oversample.fit_resample(X_train, y_train)
    # checking oversampling counts
    print(y_oversample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # fit kNN on the oversampled training set
    knn.fit(X_oversample, y_oversample)

    # predict labels of the (non-oversampled) test set
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Oversampling Recall Rate: {recall*100:.2f}%')

In [6]:
# Run the 80/20 split experiment; the function prints its results and returns nothing.
knn_ROS80()

1    3889
0    3889
Name: stroke, dtype: int64
0    972
1     50
Name: stroke, dtype: int64
[[672 300]
 [ 12  38]]
              precision    recall  f1-score   support

    noStroke       0.98      0.69      0.81       972
      stroke       0.11      0.76      0.20        50

    accuracy                           0.69      1022
   macro avg       0.55      0.73      0.50      1022
weighted avg       0.94      0.69      0.78      1022

0.76
K-Nearest Neighbours Oversampling Recall Rate: 76.00%


In [7]:
def knn_ROS90(n_neighbors=60, random_state=42):
    """Evaluate k-NN on a 90/10 split with random oversampling of the training set.

    Uses the module-level ``X`` (features) and ``y`` (target) defined in the
    data-loading cell. Only the training portion is oversampled; the test
    portion keeps its original class distribution so the reported metrics
    reflect performance on unmodified data.

    Parameters
    ----------
    n_neighbors : int, default 60
        Number of neighbours used by ``KNeighborsClassifier``.
    random_state : int, default 42
        Seed shared by the train/test split and the oversampler so that
        repeated runs produce the same resampled set and the same metrics.

    Returns
    -------
    None
        Class counts, confusion matrix, classification report and recall
        are printed.
    """
    # splitting 90-10 (stratified so both parts keep the class ratio of y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state, stratify=y
    )

    # oversampling minority class; seeded, otherwise the duplicated rows
    # (and therefore the fitted model) change on every execution
    oversample = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
    X_oversample, y_oversample = oversample.fit_resample(X_train, y_train)
    # checking oversampling counts
    print(y_oversample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # fit kNN on the oversampled training set
    knn.fit(X_oversample, y_oversample)

    # predict labels of the (non-oversampled) test set
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Oversampling Recall Rate: {recall*100:.2f}%')

In [8]:
# Run the 90/10 split experiment; the function prints its results and returns nothing.
knn_ROS90()

1    4375
0    4375
Name: stroke, dtype: int64
0    486
1     25
Name: stroke, dtype: int64
[[357 129]
 [ 10  15]]
              precision    recall  f1-score   support

    noStroke       0.97      0.73      0.84       486
      stroke       0.10      0.60      0.18        25

    accuracy                           0.73       511
   macro avg       0.54      0.67      0.51       511
weighted avg       0.93      0.73      0.80       511

0.6
K-Nearest Neighbours Oversampling Recall Rate: 60.00%
