In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC

In [2]:
strokeData = pd.read_csv("..\..\data\cleaned\cleanedDataset.csv")
X = strokeData.drop(columns='stroke', axis=1)
y = strokeData['stroke']

In [3]:
def svm_RUS70():
    # splitting 70-30
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # undersampling majority class
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_undersample, y_undersample = undersample.fit_resample(X_train, y_train)
    # checking undersampled counts
    print(y_undersample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')

    # fit svm into undersampled training set
    svm.fit(X_undersample, y_undersample)

    # predict labels of undersampled test dataset
    y_pred = svm.predict(X_test)

    # compute and print undersampled confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print undersampled classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Undersampling Recall Rate: {recall*100:.2f}%')

In [4]:
svm_RUS70()

1    174
0    174
Name: stroke, dtype: int64
0    1458
1      75
Name: stroke, dtype: int64
[[1034  424]
 [  14   61]]
              precision    recall  f1-score   support

    noStroke       0.99      0.71      0.83      1458
      stroke       0.13      0.81      0.22        75

    accuracy                           0.71      1533
   macro avg       0.56      0.76      0.52      1533
weighted avg       0.94      0.71      0.80      1533

0.8133333333333334
Support Vector Machine Undersampling Recall Rate: 81.33%


In [5]:
def svm_RUS80():
    # splitting 80-20
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # undersampling majority class
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_undersample, y_undersample = undersample.fit_resample(X_train, y_train)
    # checking undersampled counts
    print(y_undersample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')

    # fit svm into undersampled training set
    svm.fit(X_undersample, y_undersample)

    # predict labels of undersampled test dataset
    y_pred = svm.predict(X_test)

    # compute and print undersampled confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print undersampled classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Undersampling Recall Rate: {recall*100:.2f}%')

In [6]:
svm_RUS80()

1    199
0    199
Name: stroke, dtype: int64
0    972
1     50
Name: stroke, dtype: int64
[[724 248]
 [  9  41]]
              precision    recall  f1-score   support

    noStroke       0.99      0.74      0.85       972
      stroke       0.14      0.82      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.78      0.55      1022
weighted avg       0.95      0.75      0.82      1022

0.82
Support Vector Machine Undersampling Recall Rate: 82.00%


In [7]:
def svm_RUS90():
    # splitting 90-10
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

    # undersampling majority class
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_undersample, y_undersample = undersample.fit_resample(X_train, y_train)
    # checking undersampled counts
    print(y_undersample.value_counts())
    # checking testing counts
    print(y_test.value_counts())

    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')

    # fit svm into undersampled training set
    svm.fit(X_undersample, y_undersample)

    # predict labels of undersampled test dataset
    y_pred = svm.predict(X_test)

    # compute and print undersampled confusion matrix
    print(confusion_matrix(y_test, y_pred))
    # compute and print undersampled classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Undersampling Recall Rate: {recall*100:.2f}%')

In [8]:
svm_RUS90()

1    224
0    224
Name: stroke, dtype: int64
0    486
1     25
Name: stroke, dtype: int64
[[360 126]
 [  6  19]]
              precision    recall  f1-score   support

    noStroke       0.98      0.74      0.85       486
      stroke       0.13      0.76      0.22        25

    accuracy                           0.74       511
   macro avg       0.56      0.75      0.53       511
weighted avg       0.94      0.74      0.81       511

0.76
Support Vector Machine Undersampling Recall Rate: 76.00%
