In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
strokeData = pd.read_csv("..\..\data\cleaned\cleanedDataset.csv")
X = strokeData.drop(columns='stroke', axis=1)
y = strokeData['stroke']

In [32]:
def combi20():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling
    over = RandomOverSampler(sampling_strategy=0.2)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)
    
    # fit training data into kNN
    knn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Combination Recall Rate: {recall*100:.2f}%')

In [106]:
combi20()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1     777
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 777})
0    863
1    777
Name: stroke, dtype: int64
Combination: Counter({0: 863, 1: 777})
[[673 299]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.69      0.81       972
      stroke       0.12      0.80      0.21        50

    accuracy                           0.70      1022
   macro avg       0.55      0.75      0.51      1022
weighted avg       0.94      0.70      0.78      1022

0.8
K-Nearest Neighbours Combination Recall Rate: 80.00%


In [101]:
def combi30():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling
    over = RandomOverSampler(sampling_strategy=0.3)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)
    
    # fit training data into kNN
    knn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Combination Recall Rate: {recall*100:.2f}%')

In [104]:
combi30()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1166
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1166})
0    1457
1    1166
Name: stroke, dtype: int64
Combination: Counter({0: 1457, 1: 1166})
[[681 291]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.70      0.82       972
      stroke       0.12      0.80      0.21        50

    accuracy                           0.71      1022
   macro avg       0.55      0.75      0.51      1022
weighted avg       0.94      0.71      0.79      1022

0.8
K-Nearest Neighbours Combination Recall Rate: 80.00%


In [55]:
def combi40():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling
    over = RandomOverSampler(sampling_strategy=0.4)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)
    
    # fit training data into kNN
    knn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Combination Recall Rate: {recall*100:.2f}%')

In [100]:
combi40()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1555
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1555})
0    1727
1    1555
Name: stroke, dtype: int64
Combination: Counter({0: 1727, 1: 1555})
[[682 290]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.70      0.82       972
      stroke       0.12      0.80      0.21        50

    accuracy                           0.71      1022
   macro avg       0.55      0.75      0.52      1022
weighted avg       0.94      0.71      0.79      1022

0.8
K-Nearest Neighbours Combination Recall Rate: 80.00%


In [98]:
def combi50():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling
    over = RandomOverSampler(sampling_strategy=0.5)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)
    
    # fit training data into kNN
    knn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Combination Recall Rate: {recall*100:.2f}%')

In [99]:
combi50()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1944
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1944})
0    2430
1    1944
Name: stroke, dtype: int64
Combination: Counter({0: 2430, 1: 1944})
[[707 265]
 [ 12  38]]
              precision    recall  f1-score   support

    noStroke       0.98      0.73      0.84       972
      stroke       0.13      0.76      0.22        50

    accuracy                           0.73      1022
   macro avg       0.55      0.74      0.53      1022
weighted avg       0.94      0.73      0.81      1022

0.76
K-Nearest Neighbours Combination Recall Rate: 76.00%


In [96]:
def combi60():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling
    over = RandomOverSampler(sampling_strategy=0.6)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)
    
    # fit training data into kNN
    knn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Combination Recall Rate: {recall*100:.2f}%')

In [97]:
combi60()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    2333
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 2333})
0    2592
1    2333
Name: stroke, dtype: int64
Combination: Counter({0: 2592, 1: 2333})
[[687 285]
 [ 12  38]]
              precision    recall  f1-score   support

    noStroke       0.98      0.71      0.82       972
      stroke       0.12      0.76      0.20        50

    accuracy                           0.71      1022
   macro avg       0.55      0.73      0.51      1022
weighted avg       0.94      0.71      0.79      1022

0.76
K-Nearest Neighbours Combination Recall Rate: 76.00%


In [94]:
def combi70():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling
    over = RandomOverSampler(sampling_strategy=0.7)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)
    
    # fit training data into kNN
    knn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Combination Recall Rate: {recall*100:.2f}%')

In [95]:
combi70()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    2722
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 2722})
0    3024
1    2722
Name: stroke, dtype: int64
Combination: Counter({0: 3024, 1: 2722})
[[699 273]
 [ 12  38]]
              precision    recall  f1-score   support

    noStroke       0.98      0.72      0.83       972
      stroke       0.12      0.76      0.21        50

    accuracy                           0.72      1022
   macro avg       0.55      0.74      0.52      1022
weighted avg       0.94      0.72      0.80      1022

0.76
K-Nearest Neighbours Combination Recall Rate: 76.00%


In [89]:
def combi80():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling
    over = RandomOverSampler(sampling_strategy=0.8)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for K-Nearest Neighbours
    knn = KNeighborsClassifier(n_neighbors=60)
    
    # fit training data into kNN
    knn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = knn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'K-Nearest Neighbours Combination Recall Rate: {recall*100:.2f}%')

In [93]:
combi80()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    3111
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 3111})
0    3456
1    3111
Name: stroke, dtype: int64
Combination: Counter({0: 3456, 1: 3111})
[[678 294]
 [ 13  37]]
              precision    recall  f1-score   support

    noStroke       0.98      0.70      0.82       972
      stroke       0.11      0.74      0.19        50

    accuracy                           0.70      1022
   macro avg       0.55      0.72      0.50      1022
weighted avg       0.94      0.70      0.79      1022

0.74
K-Nearest Neighbours Combination Recall Rate: 74.00%
