In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [3]:
df_features = pd.read_csv("Features/df_all_features.csv")
df_targets = pd.read_csv("Features/df_all_targets.csv")

In [4]:
X = df_features
y = df_targets.values.ravel()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.3, 
    random_state=42
)
X_test_f, X_val, y_test_f, y_val = train_test_split(
    X_test, 
    y_test, 
    test_size=0.33, 
    random_state=42
)

In [19]:
from numpy import mean
from imblearn.ensemble import BalancedRandomForestClassifier
model_balanced = BalancedRandomForestClassifier(n_estimators=200)
model_balanced.fit(X_train, y_train)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model_balanced, X_test_f, y_test_f, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print(f'Random Forest Model\'s accuracy on training set is {100*model_balanced.score(X_train, y_train):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model_balanced.score(X_test_f, y_test_f):.2f}%')
print('Mean ROC AUC: %.3f' % mean(scores))

Random Forest Model's accuracy on training set is 96.78%
Random Forest Model's accuracy on test set is 89.76%
Mean ROC AUC: 0.930


In [20]:
from imblearn.combine import SMOTEENN
smenn = SMOTEENN()
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)

In [21]:
from numpy import mean
from imblearn.ensemble import BalancedRandomForestClassifier
model_balanced = BalancedRandomForestClassifier(n_estimators=200)
model_balanced.fit(X_train_smenn, y_train_smenn)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model_balanced, X_test_f, y_test_f, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print(f'Random Forest Model\'s accuracy on training set is {100*model_balanced.score(X_train_smenn, y_train_smenn):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model_balanced.score(X_test_f, y_test_f):.2f}%')
print('Mean ROC AUC: %.3f' % mean(scores))

Random Forest Model's accuracy on training set is 99.97%
Random Forest Model's accuracy on test set is 90.82%
Mean ROC AUC: 0.929


In [22]:
from sklearn.metrics import classification_report
y_pred_balanced = model_balanced.predict(X_test_f)
print(classification_report(y_test_f, y_pred_balanced))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76      2011
           1       0.95      0.94      0.94      8855

    accuracy                           0.91     10866
   macro avg       0.84      0.86      0.85     10866
weighted avg       0.91      0.91      0.91     10866



In [8]:
from numpy import mean
model = RandomForestClassifier(bootstrap = False, criterion='entropy', n_estimators=100)
model.fit(X_train_smenn, y_train_smenn)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X_test_f, y_test_f, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print(f'Random Forest Model\'s accuracy on training set is {100*model.score(X_train_smenn, y_train_smenn):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model.score(X_test_f, y_test_f):.2f}%')
print('Mean ROC AUC: %.3f' % mean(scores))

Random Forest Model's accuracy on training set is 100.00%
Random Forest Model's accuracy on test set is 90.70%
Mean ROC AUC: 0.942


In [10]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_f)
print(classification_report(y_test_f, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.83      0.77      2011
           1       0.96      0.92      0.94      8855

    accuracy                           0.91     10866
   macro avg       0.84      0.88      0.85     10866
weighted avg       0.91      0.91      0.91     10866



In [11]:
y_pred_val = model.predict(X_val)
print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           0       0.72      0.84      0.77       997
           1       0.96      0.92      0.94      4355

    accuracy                           0.91      5352
   macro avg       0.84      0.88      0.86      5352
weighted avg       0.92      0.91      0.91      5352

