In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

In [2]:
strokeData = pd.read_csv("..\..\data\cleaned\cleanedDataset.csv")
X = strokeData.drop(columns='stroke', axis=1)
y = strokeData['stroke']

In [5]:
def combi20():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.2)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Decision Trees
    dt = DecisionTreeClassifier(min_samples_leaf=34)
    
    # fit training data into DT
    dt.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = dt.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Decision Tree Combination Recall Rate: {recall*100:.2f}%')

In [6]:
combi20()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1     777
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 777})
0    863
1    777
Name: stroke, dtype: int64
Combination: Counter({0: 863, 1: 777})
[[754 218]
 [ 17  33]]
              precision    recall  f1-score   support

    noStroke       0.98      0.78      0.87       972
      stroke       0.13      0.66      0.22        50

    accuracy                           0.77      1022
   macro avg       0.55      0.72      0.54      1022
weighted avg       0.94      0.77      0.83      1022

0.66
Decision Tree Combination Recall Rate: 66.00%


In [3]:
def combi30():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.3)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Decision Trees
    dt = DecisionTreeClassifier(min_samples_leaf=34)
    
    # fit training data into DT
    dt.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = dt.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Decision Tree Combination Recall Rate: {recall*100:.2f}%')

In [4]:
combi30()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1166
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1166})
0    1295
1    1166
Name: stroke, dtype: int64
Combination: Counter({0: 1295, 1: 1166})
[[741 231]
 [ 16  34]]
              precision    recall  f1-score   support

    noStroke       0.98      0.76      0.86       972
      stroke       0.13      0.68      0.22        50

    accuracy                           0.76      1022
   macro avg       0.55      0.72      0.54      1022
weighted avg       0.94      0.76      0.83      1022

0.68
Decision Tree Combination Recall Rate: 68.00%


In [284]:
def combi40():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.4)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Decision Trees
    dt = DecisionTreeClassifier(min_samples_leaf=34)
    
    # fit training data into DT
    dt.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = dt.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Decision Tree Combination Recall Rate: {recall*100:.2f}%')

In [286]:
combi40()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1555
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1555})
0    1727
1    1555
Name: stroke, dtype: int64
Combination: Counter({0: 1727, 1: 1555})
[[890  82]
 [ 32  18]]
              precision    recall  f1-score   support

    noStroke       0.97      0.92      0.94       972
      stroke       0.18      0.36      0.24        50

    accuracy                           0.89      1022
   macro avg       0.57      0.64      0.59      1022
weighted avg       0.93      0.89      0.91      1022

0.36
Decision Tree Combination Recall Rate: 36.00%


In [287]:
def combi50():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.5)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Decision Trees
    dt = DecisionTreeClassifier(min_samples_leaf=34)
    
    # fit training data into DT
    dt.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = dt.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Decision Tree Combination Recall Rate: {recall*100:.2f}%')

In [328]:
combi50()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    1944
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 1944})
0    2430
1    1944
Name: stroke, dtype: int64
Combination: Counter({0: 2430, 1: 1944})
[[925  47]
 [ 37  13]]
              precision    recall  f1-score   support

    noStroke       0.96      0.95      0.96       972
      stroke       0.22      0.26      0.24        50

    accuracy                           0.92      1022
   macro avg       0.59      0.61      0.60      1022
weighted avg       0.93      0.92      0.92      1022

0.26
Decision Tree Combination Recall Rate: 26.00%


In [162]:
def combi60():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.6)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.8)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Decision Trees
    dt = DecisionTreeClassifier(min_samples_leaf=34)
    
    # fit training data into DT
    dt.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = dt.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Decision Tree Combination Recall Rate: {recall*100:.2f}%')

In [193]:
combi60()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    2333
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 2333})
0    2916
1    2333
Name: stroke, dtype: int64
Combination: Counter({0: 2916, 1: 2333})
[[933  39]
 [ 36  14]]
              precision    recall  f1-score   support

    noStroke       0.96      0.96      0.96       972
      stroke       0.26      0.28      0.27        50

    accuracy                           0.93      1022
   macro avg       0.61      0.62      0.62      1022
weighted avg       0.93      0.93      0.93      1022

0.28
Decision Tree Combination Recall Rate: 28.00%


In [269]:
def combi70():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.7)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Decision Trees
    dt = DecisionTreeClassifier(min_samples_leaf=34)
    
    # fit training data into DT
    dt.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = dt.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Decision Tree Combination Recall Rate: {recall*100:.2f}%')

In [276]:
combi70()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    2722
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 2722})
0    3024
1    2722
Name: stroke, dtype: int64
Combination: Counter({0: 3024, 1: 2722})
[[921  51]
 [ 38  12]]
              precision    recall  f1-score   support

    noStroke       0.96      0.95      0.95       972
      stroke       0.19      0.24      0.21        50

    accuracy                           0.91      1022
   macro avg       0.58      0.59      0.58      1022
weighted avg       0.92      0.91      0.92      1022

0.24
Decision Tree Combination Recall Rate: 24.00%


In [256]:
def combi80():
    # splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42, stratify=y)
    print(y_train.value_counts())
    
    # oversampling training data
    over = RandomOverSampler(sampling_strategy=0.8)
    X_over, y_over = over.fit_resample(X_train, y_train)
    print(y_over.value_counts())
    print(f"Oversampling: {Counter(y_over)}")
    
    # undersampling oversampled training data
    under = RandomUnderSampler(sampling_strategy=0.9)
    X_under, y_under = under.fit_resample(X_over, y_over)
    print(y_under.value_counts())
    print(f"Combination: {Counter(y_under)}")
    
    # create Classifier for Decision Trees
    dt = DecisionTreeClassifier(min_samples_leaf=34)
    
    # fit training data into DT
    dt.fit(X_under, y_under)

    # predict labels of test dataset
    y_pred = dt.predict(X_test)

    # compute and print confusion matrix
    print(confusion_matrix(y_test, y_pred))
    
    # compute and print classification report
    print(classification_report(y_test, y_pred, target_names=['noStroke', 'stroke']))

    # print recall rate (metric we are looking at)
    recall = recall_score(y_test, y_pred)
    print(recall)
    print(f'Decision Tree Combination Recall Rate: {recall*100:.2f}%')

In [268]:
combi80()

0    3889
1     199
Name: stroke, dtype: int64
0    3889
1    3111
Name: stroke, dtype: int64
Oversampling: Counter({0: 3889, 1: 3111})
0    3456
1    3111
Name: stroke, dtype: int64
Combination: Counter({0: 3456, 1: 3111})
[[937  35]
 [ 40  10]]
              precision    recall  f1-score   support

    noStroke       0.96      0.96      0.96       972
      stroke       0.22      0.20      0.21        50

    accuracy                           0.93      1022
   macro avg       0.59      0.58      0.59      1022
weighted avg       0.92      0.93      0.92      1022

0.2
Decision Tree Combination Recall Rate: 20.00%
