In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

In [2]:
strokeData = pd.read_csv("..\..\data\cleaned\cleanedDataset.csv")
X = strokeData.drop(columns='stroke', axis=1)
y = strokeData['stroke']

In [18]:
def combi20():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.2)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Neural Networks
    nn = MLPClassifier(max_iter=1000)
    
    # fit training data into NN
    nn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = nn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Neural Network Combination Recall Rate: {recall*100:.2f}%')

In [21]:
combi20()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1     777
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 777})
0    863
1    777
Name: stroke, dtype: int64
Combination: Counter({0: 863, 1: 777})
[[757 215]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.78      0.87       972
      stroke       0.16      0.80      0.26        50

    accuracy                           0.78      1022
   macro avg       0.57      0.79      0.57      1022
weighted avg       0.95      0.78      0.84      1022

0.8
Neural Network Combination Recall Rate: 80.00%


In [85]:
def combi30():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.3)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Neural Networks
    nn = MLPClassifier(max_iter=1000)
    
    # fit training data into NN
    nn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = nn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Neural Network Combination Recall Rate: {recall*100:.2f}%')

In [87]:
combi30()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1166
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1166})
0    1457
1    1166
Name: stroke, dtype: int64
Combination: Counter({0: 1457, 1: 1166})
[[719 253]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.74      0.85       972
      stroke       0.14      0.80      0.23        50

    accuracy                           0.74      1022
   macro avg       0.56      0.77      0.54      1022
weighted avg       0.94      0.74      0.82      1022

0.8
Neural Network Combination Recall Rate: 80.00%


In [90]:
def combi40():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.4)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Neural Networks
    nn = MLPClassifier(max_iter=1000)
    
    # fit training data into NN
    nn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = nn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Neural Network Combination Recall Rate: {recall*100:.2f}%')

In [92]:
combi40()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1555
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1555})
0    1943
1    1555
Name: stroke, dtype: int64
Combination: Counter({0: 1943, 1: 1555})
[[753 219]
 [ 10  40]]
              precision    recall  f1-score   support

    noStroke       0.99      0.77      0.87       972
      stroke       0.15      0.80      0.26        50

    accuracy                           0.78      1022
   macro avg       0.57      0.79      0.56      1022
weighted avg       0.95      0.78      0.84      1022

0.8
Neural Network Combination Recall Rate: 80.00%


In [93]:
def combi50():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.5)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Neural Networks
    nn = MLPClassifier(max_iter=1000)
    
    # fit training data into NN
    nn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = nn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Neural Network Combination Recall Rate: {recall*100:.2f}%')

In [95]:
combi50()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1944
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1944})
0    2430
1    1944
Name: stroke, dtype: int64
Combination: Counter({0: 2430, 1: 1944})
[[766 206]
 [ 14  36]]
              precision    recall  f1-score   support

    noStroke       0.98      0.79      0.87       972
      stroke       0.15      0.72      0.25        50

    accuracy                           0.78      1022
   macro avg       0.57      0.75      0.56      1022
weighted avg       0.94      0.78      0.84      1022

0.72
Neural Network Combination Recall Rate: 72.00%


In [69]:
def combi60():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.6)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Neural Networks
    nn = MLPClassifier(max_iter=1000)
    
    # fit training data into NN
    nn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = nn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Neural Network Combination Recall Rate: {recall*100:.2f}%')

In [73]:
combi60()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    2333
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 2333})
0    2916
1    2333
Name: stroke, dtype: int64
Combination: Counter({0: 2916, 1: 2333})
[[773 199]
 [  9  41]]
              precision    recall  f1-score   support

    noStroke       0.99      0.80      0.88       972
      stroke       0.17      0.82      0.28        50

    accuracy                           0.80      1022
   macro avg       0.58      0.81      0.58      1022
weighted avg       0.95      0.80      0.85      1022

0.82
Neural Network Combination Recall Rate: 82.00%


In [78]:
def combi70():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.7)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Neural Networks
    nn = MLPClassifier(max_iter=1000)
    
    # fit training data into NN
    nn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = nn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Neural Network Combination Recall Rate: {recall*100:.2f}%')

In [81]:
combi70()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    2722
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 2722})
0    3402
1    2722
Name: stroke, dtype: int64
Combination: Counter({0: 3402, 1: 2722})
[[782 190]
 [ 13  37]]
              precision    recall  f1-score   support

    noStroke       0.98      0.80      0.89       972
      stroke       0.16      0.74      0.27        50

    accuracy                           0.80      1022
   macro avg       0.57      0.77      0.58      1022
weighted avg       0.94      0.80      0.85      1022

0.74
Neural Network Combination Recall Rate: 74.00%


In [82]:
def combi80():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.8)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Neural Networks
    nn = MLPClassifier(max_iter=1000)
    
    # fit training data into NN
    nn.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = nn.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Neural Network Combination Recall Rate: {recall*100:.2f}%')

In [84]:
combi80()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    3111
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 3111})
0    3456
1    3111
Name: stroke, dtype: int64
Combination: Counter({0: 3456, 1: 3111})
[[775 197]
 [ 14  36]]
              precision    recall  f1-score   support

    noStroke       0.98      0.80      0.88       972
      stroke       0.15      0.72      0.25        50

    accuracy                           0.79      1022
   macro avg       0.57      0.76      0.57      1022
weighted avg       0.94      0.79      0.85      1022

0.72
Neural Network Combination Recall Rate: 72.00%
