In [7]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [9]:
from sklearn.metrics import confusion_matrix

In [10]:
df_features = pd.read_csv("Features/df_all_features.csv")
df_targets = pd.read_csv("Features/df_all_targets.csv")

In [11]:
X = df_features
y = df_targets.values.ravel()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=53
)

In [13]:
from sklearn.ensemble import RandomForestClassifier

########## Tuned Random Forest #######
model = RandomForestClassifier(
    n_estimators = 500, 
    criterion ='entropy',
    warm_start = True,
    max_features = 'sqrt',
    oob_score = 'True', # more on this below
    random_state=69  
) 

model.fit(X_train, y_train)

print(f'Random Forest Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')

Random Forest Model's accuracy on training set is 100.00%
Random Forest Model's accuracy on test set is 93.19%


In [14]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.65      0.78      2000
           1       0.93      1.00      0.96      8812

    accuracy                           0.93     10812
   macro avg       0.95      0.82      0.87     10812
weighted avg       0.94      0.93      0.93     10812



In [20]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [21]:
X_balanced = df_features
y_balanced = df_targets.values.ravel()

In [22]:
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(
    X_balanced, 
    y_balanced, 
    test_size=0.2, 
    random_state=42
)

In [23]:
from numpy import mean
model_balanced = BalancedRandomForestClassifier(n_estimators=200)
model_balanced.fit(X_train_balanced, y_train_balanced)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model_balanced, X_test_balanced, y_test_balanced, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print(f'Random Forest Model\'s accuracy on training set is {100*model_balanced.score(X_train_balanced, y_train_balanced):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model_balanced.score(X_test_balanced, y_test_balanced):.2f}%')
print('Mean ROC AUC: %.3f' % mean(scores))

Random Forest Model's accuracy on training set is 96.84%
Random Forest Model's accuracy on test set is 89.96%
Mean ROC AUC: 0.933


In [19]:
from sklearn.metrics import classification_report
y_pred_balanced = model_balanced.predict(X_test_balanced)
print(classification_report(y_test_balanced, y_pred_balanced))

              precision    recall  f1-score   support

           0       0.55      0.88      0.67      1997
           1       0.97      0.84      0.90      8815

    accuracy                           0.84     10812
   macro avg       0.76      0.86      0.79     10812
weighted avg       0.89      0.84      0.86     10812

