In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Location of the cleaned dataset, relative to this notebook.
# Forward slashes are used on purpose: they are accepted on Windows, macOS and
# Linux alike, and they avoid the invalid escape sequences ("\.", "\d", "\c")
# that a backslash-separated path produces inside a normal (non-raw) string.
DATA_PATH = "../../data/cleaned/cleanedDataset.csv"
strokeData = pd.read_csv(DATA_PATH)

# Feature matrix: every column except the target.
X = strokeData.drop(columns='stroke')
# Target vector: the 'stroke' column.
y = strokeData['stroke']

In [3]:
def knn_SMOTE70(test_size=0.3, n_neighbors=60, random_state=42):
    """Evaluate kNN trained on SMOTE-oversampled data (70-30 split by default).

    Uses the notebook-level ``X`` (features) and ``y`` (target). The data is
    split into stratified train/test sets, the *training* portion only is
    balanced with SMOTE, a ``KNeighborsClassifier`` is fitted on the balanced
    training data and then scored on the untouched test set.

    Prints, in order: class counts after oversampling, class counts of the
    test set, the confusion matrix, the classification report, the raw
    recall value and a formatted recall percentage.

    Parameters
    ----------
    test_size : float, default 0.3
        Fraction of the data held out for testing.
    n_neighbors : int, default 60
        Number of neighbours used by the kNN classifier.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` and ``SMOTE`` so that a
        re-run of the notebook reproduces the same numbers.

    Returns
    -------
    None
    """
    # stratified split so the test set keeps the original class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # oversample the minority class on the training data only;
    # SMOTE is seeded, otherwise the synthetic samples differ on every run
    oversample = SMOTE(random_state=random_state)
    X_oversample, y_oversample = oversample.fit_resample(X_train, y_train)
    # checking oversampling counts
    print(y_oversample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # NOTE(review): kNN and SMOTE are both distance based; confirm that the
    # cleaned dataset is already scaled, otherwise large-valued columns dominate.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # fit kNN on the oversampled training set
    knn.fit(X_oversample, y_oversample)

    # predict labels of the (non-oversampled) test set
    y_pred = knn.predict(X_test)

    # confusion matrix of test-set predictions
    print(confusion_matrix(y_test, y_pred))
    # per-class precision / recall / f1 on the test set
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Oversampling Recall Rate: {recall*100:.2f}%')

In [4]:
# Run the 70-30 split evaluation; it prints the class counts, confusion matrix,
# classification report and recall shown in this cell's output.
knn_SMOTE70()

1    3403
0    3403
Name: stroke, dtype: int64
0    1458
1      75
Name: stroke, dtype: int64
[[1086  372]
 [  20   55]]
              precision    recall  f1-score   support

    noStroke       0.98      0.74      0.85      1458
      stroke       0.13      0.73      0.22        75

    accuracy                           0.74      1533
   macro avg       0.56      0.74      0.53      1533
weighted avg       0.94      0.74      0.82      1533

0.7333333333333333
K-Nearest Neighbours Oversampling Recall Rate: 73.33%


In [5]:
def knn_SMOTE80(test_size=0.2, n_neighbors=60, random_state=42):
    """Evaluate kNN trained on SMOTE-oversampled data (80-20 split by default).

    Uses the notebook-level ``X`` (features) and ``y`` (target). The data is
    split into stratified train/test sets, the *training* portion only is
    balanced with SMOTE, a ``KNeighborsClassifier`` is fitted on the balanced
    training data and then scored on the untouched test set.

    Prints, in order: class counts after oversampling, class counts of the
    test set, the confusion matrix, the classification report, the raw
    recall value and a formatted recall percentage.

    Parameters
    ----------
    test_size : float, default 0.2
        Fraction of the data held out for testing.
    n_neighbors : int, default 60
        Number of neighbours used by the kNN classifier.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` and ``SMOTE`` so that a
        re-run of the notebook reproduces the same numbers.

    Returns
    -------
    None
    """
    # stratified split so the test set keeps the original class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # oversample the minority class on the training data only;
    # SMOTE is seeded, otherwise the synthetic samples differ on every run
    oversample = SMOTE(random_state=random_state)
    X_oversample, y_oversample = oversample.fit_resample(X_train, y_train)
    # checking oversampling counts
    print(y_oversample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # NOTE(review): kNN and SMOTE are both distance based; confirm that the
    # cleaned dataset is already scaled, otherwise large-valued columns dominate.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # fit kNN on the oversampled training set
    knn.fit(X_oversample, y_oversample)

    # predict labels of the (non-oversampled) test set
    y_pred = knn.predict(X_test)

    # confusion matrix of test-set predictions
    print(confusion_matrix(y_test, y_pred))
    # per-class precision / recall / f1 on the test set
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Oversampling Recall Rate: {recall*100:.2f}%')

In [6]:
# Run the 80-20 split evaluation; it prints the class counts, confusion matrix,
# classification report and recall shown in this cell's output.
knn_SMOTE80()

1    3889
0    3889
Name: stroke, dtype: int64
0    972
1     50
Name: stroke, dtype: int64
[[745 227]
 [ 13  37]]
              precision    recall  f1-score   support

    noStroke       0.98      0.77      0.86       972
      stroke       0.14      0.74      0.24        50

    accuracy                           0.77      1022
   macro avg       0.56      0.75      0.55      1022
weighted avg       0.94      0.77      0.83      1022

0.74
K-Nearest Neighbours Oversampling Recall Rate: 74.00%


In [7]:
def knn_SMOTE90(test_size=0.1, n_neighbors=60, random_state=42):
    """Evaluate kNN trained on SMOTE-oversampled data (90-10 split by default).

    Uses the notebook-level ``X`` (features) and ``y`` (target). The data is
    split into stratified train/test sets, the *training* portion only is
    balanced with SMOTE, a ``KNeighborsClassifier`` is fitted on the balanced
    training data and then scored on the untouched test set.

    Prints, in order: class counts after oversampling, class counts of the
    test set, the confusion matrix, the classification report, the raw
    recall value and a formatted recall percentage.

    Parameters
    ----------
    test_size : float, default 0.1
        Fraction of the data held out for testing.
    n_neighbors : int, default 60
        Number of neighbours used by the kNN classifier.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` and ``SMOTE`` so that a
        re-run of the notebook reproduces the same numbers.

    Returns
    -------
    None
    """
    # stratified split so the test set keeps the original class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # oversample the minority class on the training data only;
    # SMOTE is seeded, otherwise the synthetic samples differ on every run
    oversample = SMOTE(random_state=random_state)
    X_oversample, y_oversample = oversample.fit_resample(X_train, y_train)
    # checking oversampling counts
    print(y_oversample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # NOTE(review): kNN and SMOTE are both distance based; confirm that the
    # cleaned dataset is already scaled, otherwise large-valued columns dominate.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # fit kNN on the oversampled training set
    knn.fit(X_oversample, y_oversample)

    # predict labels of the (non-oversampled) test set
    y_pred = knn.predict(X_test)

    # confusion matrix of test-set predictions
    print(confusion_matrix(y_test, y_pred))
    # per-class precision / recall / f1 on the test set
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Oversampling Recall Rate: {recall*100:.2f}%')

In [8]:
# Run the 90-10 split evaluation; it prints the class counts, confusion matrix,
# classification report and recall shown in this cell's output.
knn_SMOTE90()

1    4375
0    4375
Name: stroke, dtype: int64
0    486
1     25
Name: stroke, dtype: int64
[[374 112]
 [  8  17]]
              precision    recall  f1-score   support

    noStroke       0.98      0.77      0.86       486
      stroke       0.13      0.68      0.22        25

    accuracy                           0.77       511
   macro avg       0.56      0.72      0.54       511
weighted avg       0.94      0.77      0.83       511

0.68
K-Nearest Neighbours Oversampling Recall Rate: 68.00%
