In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
import matplotlib.pyplot as plt

In [2]:
strokeData = pd.read_csv("..\..\data\cleaned\cleanedDataset.csv")
X = strokeData.drop(columns='stroke', axis=1)
y = strokeData['stroke']

In [11]:
def combi20():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.2)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')
    
    # fit training data into SVM
    svm.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = svm.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Combination Recall Rate: {recall*100:.2f}%')

In [13]:
combi20()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1     777
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 777})
0    863
1    777
Name: stroke, dtype: int64
Combination: Counter({0: 863, 1: 777})
[[733 239]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.75      0.85       972
      stroke       0.14      0.80      0.24        50

    accuracy                           0.76      1022
   macro avg       0.56      0.78      0.55      1022
weighted avg       0.95      0.76      0.82      1022

0.8
Support Vector Machine Combination Recall Rate: 80.00%


In [16]:
def combi30():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.3)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')
    
    # fit training data into SVM
    svm.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = svm.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Combination Recall Rate: {recall*100:.2f}%')

In [17]:
combi30()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1166
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1166})
0    1295
1    1166
Name: stroke, dtype: int64
Combination: Counter({0: 1295, 1: 1166})
[[725 247]
 [  9  41]]
              precision    recall  f1-score   support

    noStroke       0.99      0.75      0.85       972
      stroke       0.14      0.82      0.24        50

    accuracy                           0.75      1022
   macro avg       0.57      0.78      0.55      1022
weighted avg       0.95      0.75      0.82      1022

0.82
Support Vector Machine Combination Recall Rate: 82.00%


In [20]:
def combi40():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.4)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')
    
    # fit training data into SVM
    svm.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = svm.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Combination Recall Rate: {recall*100:.2f}%')

In [22]:
combi40()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1555
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1555})
0    1727
1    1555
Name: stroke, dtype: int64
Combination: Counter({0: 1727, 1: 1555})
[[720 252]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.74      0.85       972
      stroke       0.14      0.80      0.23        50

    accuracy                           0.74      1022
   macro avg       0.56      0.77      0.54      1022
weighted avg       0.94      0.74      0.82      1022

0.8
Support Vector Machine Combination Recall Rate: 80.00%


In [25]:
def combi50():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.5)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')
    
    # fit training data into SVM
    svm.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = svm.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Combination Recall Rate: {recall*100:.2f}%')

In [26]:
combi50()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1944
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1944})
0    2160
1    1944
Name: stroke, dtype: int64
Combination: Counter({0: 2160, 1: 1944})
[[723 249]
 [  9  41]]
              precision    recall  f1-score   support

    noStroke       0.99      0.74      0.85       972
      stroke       0.14      0.82      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.78      0.54      1022
weighted avg       0.95      0.75      0.82      1022

0.82
Support Vector Machine Combination Recall Rate: 82.00%


In [29]:
def combi60():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.6)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')
    
    # fit training data into SVM
    svm.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = svm.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Combination Recall Rate: {recall*100:.2f}%')

In [30]:
combi60()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    2333
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 2333})
0    2592
1    2333
Name: stroke, dtype: int64
Combination: Counter({0: 2592, 1: 2333})
[[722 250]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.74      0.85       972
      stroke       0.14      0.80      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.77      0.54      1022
weighted avg       0.94      0.75      0.82      1022

0.8
Support Vector Machine Combination Recall Rate: 80.00%


In [3]:
def combi70():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.7)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')
    
    # fit training data into SVM
    svm.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = svm.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Combination Recall Rate: {recall*100:.2f}%')

In [4]:
combi70()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    2722
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 2722})
0    3402
1    2722
Name: stroke, dtype: int64
Combination: Counter({0: 3402, 1: 2722})
[[743 229]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.76      0.86       972
      stroke       0.15      0.80      0.25        50

    accuracy                           0.77      1022
   macro avg       0.57      0.78      0.56      1022
weighted avg       0.95      0.77      0.83      1022

0.8
Support Vector Machine Combination Recall Rate: 80.00%


In [35]:
def combi80():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.8)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Support Vector Machine
    svm = SVC(kernel='linear')
    
    # fit training data into SVM
    svm.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = svm.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Support Vector Machine Combination Recall Rate: {recall*100:.2f}%')

In [36]:
combi80()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    3111
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 3111})
0    3456
1    3111
Name: stroke, dtype: int64
Combination: Counter({0: 3456, 1: 3111})
[[717 255]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.74      0.84       972
      stroke       0.14      0.80      0.23        50

    accuracy                           0.74      1022
   macro avg       0.56      0.77      0.54      1022
weighted avg       0.94      0.74      0.81      1022

0.8
Support Vector Machine Combination Recall Rate: 80.00%
