In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
strokeData = pd.read_csv("..\..\data\cleaned\cleanedDataset.csv")
X = strokeData.drop(columns='stroke', axis=1)
y = strokeData['stroke']

In [4]:
def knn_RUS70():
    # splitting 70-30
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # undersampling majority class
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_undersample, y_undersample = undersample.fit_resample(X_train, y_train)
    # checking undersampled counts
    print(y_undersample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)

    # fit kNN into undersampled training set
    knn.fit(X_undersample, y_undersample)

    # predict labels of undersampled test dataset
    y_pred = knn.predict(X_test)

    # compute and print undersampled confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print undersampled classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Undersampling Recall Rate: {recall*100:.2f}%')

In [5]:
knn_RUS70()

1    174
0    174
Name: stroke, dtype: int64
0    1458
1      75
Name: stroke, dtype: int64
[[1003  455]
 [  18   57]]
              precision    recall  f1-score   support

    noStroke       0.98      0.69      0.81      1458
      stroke       0.11      0.76      0.19        75

    accuracy                           0.69      1533
   macro avg       0.55      0.72      0.50      1533
weighted avg       0.94      0.69      0.78      1533

0.76
K-Nearest Neighbours Undersampling Recall Rate: 76.00%


In [6]:
def knn_RUS80():
    # splitting 80-20
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # undersampling majority class
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_undersample, y_undersample = undersample.fit_resample(X_train, y_train)
    # checking undersampled counts
    print(y_undersample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)

    # fit kNN into undersampled training set
    knn.fit(X_undersample, y_undersample)

    # predict labels of undersampled test dataset
    y_pred = knn.predict(X_test)

    # compute and print undersampled confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print undersampled classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Undersampling Recall Rate: {recall*100:.2f}%')

In [7]:
knn_RUS80()

1    199
0    199
Name: stroke, dtype: int64
0    972
1     50
Name: stroke, dtype: int64
[[589 383]
 [  8  42]]
              precision    recall  f1-score   support

    noStroke       0.99      0.61      0.75       972
      stroke       0.10      0.84      0.18        50

    accuracy                           0.62      1022
   macro avg       0.54      0.72      0.46      1022
weighted avg       0.94      0.62      0.72      1022

0.84
K-Nearest Neighbours Undersampling Recall Rate: 84.00%


In [8]:
def knn_RUS90():
    # splitting 90-10
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

    # undersampling majority class
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_undersample, y_undersample = undersample.fit_resample(X_train, y_train)
    # checking undersampled counts
    print(y_undersample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)

    # fit kNN into undersampled training set
    knn.fit(X_undersample, y_undersample)

    # predict labels of undersampled test dataset
    y_pred = knn.predict(X_test)

    # compute and print undersampled confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print undersampled classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Undersampling Recall Rate: {recall*100:.2f}%')

In [9]:
knn_RUS90()

1    224
0    224
Name: stroke, dtype: int64
0    486
1     25
Name: stroke, dtype: int64
[[352 134]
 [  8  17]]
              precision    recall  f1-score   support

    noStroke       0.98      0.72      0.83       486
      stroke       0.11      0.68      0.19        25

    accuracy                           0.72       511
   macro avg       0.55      0.70      0.51       511
weighted avg       0.94      0.72      0.80       511

0.68
K-Nearest Neighbours Undersampling Recall Rate: 68.00%
