In [None]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [45]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [46]:
df_fan_features = pd.read_csv("Features/df_fan_feature.csv")
df_fan_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.321765,0.99392,0.587307,0.227931,3.682876e-07,0.454453,0.012064,0.059049,-439.074188,151.212128,...,0.018938,0.002594,0.006821,0.01163,0.009338,0.001038,0.008301,0.038574,0.022062,0.00488
1,0.324179,0.997664,0.575122,0.233173,5.033161e-07,0.581692,0.015197,0.076022,-440.888489,155.448624,...,0.021426,0.003007,0.007374,0.013181,0.010397,0.00102,0.009766,0.031738,0.019542,0.003963
2,0.309565,0.99241,0.583972,0.245409,3.053722e-07,0.515054,0.013476,0.067214,-440.551361,152.642548,...,0.020326,0.002495,0.007279,0.012313,0.009822,0.00086,0.009766,0.034668,0.020863,0.004406
3,0.321053,0.994196,0.576258,0.235442,3.724527e-08,0.570727,0.015205,0.074875,-448.64621,170.903778,...,0.020827,0.00256,0.007297,0.013035,0.010333,0.001054,0.006348,0.028809,0.015656,0.003357
4,0.347573,0.981656,0.593205,0.225728,9.834761e-08,0.491109,0.013466,0.065463,-442.838745,163.274231,...,0.019928,0.002771,0.007218,0.013747,0.009737,0.001146,0.004883,0.02832,0.016588,0.003945


In [47]:
df_fan_target = pd.read_csv("Features/df_fan_target.csv")
df_fan_target.tail()

Unnamed: 0,0
16645,1
16646,1
16647,1
16648,1
16649,1


In [48]:
X = df_fan_features
y = df_fan_target.values.ravel()

In [49]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=69
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size=0.05, 
    random_state=69
)

In [None]:
from numpy import mean
# define model
model = RandomForestClassifier(
    n_estimators = 500, 
    criterion ='entropy',
    max_features = 'sqrt',
    oob_score = 'True', # more on this below
    random_state=69  , 
    class_weight='balanced')
model.fit(X_train, y_train)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print(f'Average score: {"{:.2f}".format(scores.mean())}')
print('Mean ROC AUC: %.3f' % mean(scores))
print(f'Random Forest Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')
print(f'Random Forest Model\'s accuracy on validation set is {100*model.score(X_val, y_val):.2f}%')

In [14]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.78      0.87       880
           1       0.93      0.99      0.96      2450

    accuracy                           0.94      3330
   macro avg       0.95      0.89      0.91      3330
weighted avg       0.94      0.94      0.94      3330



In [15]:
from sklearn.metrics import classification_report
y_pred_val = model.predict(X_val)
print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           0       0.98      0.78      0.87       167
           1       0.93      0.99      0.96       499

    accuracy                           0.94       666
   macro avg       0.95      0.89      0.92       666
weighted avg       0.94      0.94      0.94       666



In [16]:
df_pump_features = pd.read_csv("Features/df_pump_feature.csv")
df_pump_target = pd.read_csv("Features/df_pump_target.csv")

In [17]:
X = df_pump_features
y = df_pump_target.values.ravel()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=69
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size=0.05, 
    random_state=69
)

In [19]:
model = RandomForestClassifier(
    n_estimators = 500, 
    criterion ='entropy',
    max_features = 'sqrt',
    oob_score = 'True', # more on this below
    random_state=69  , 
    class_weight='balanced')
model.fit(X_train, y_train)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)

In [20]:
print(f'Random Forest Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')
print(f'Random Forest Model\'s accuracy on validation set is {100*model.score(X_val, y_val):.2f}%')

Random Forest Model's accuracy on training set is 100.00%
Random Forest Model's accuracy on test set is 94.37%
Random Forest Model's accuracy on validation set is 94.46%


In [21]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.49      0.66       275
           1       0.94      1.00      0.97      2248

    accuracy                           0.94      2523
   macro avg       0.96      0.75      0.81      2523
weighted avg       0.95      0.94      0.94      2523



In [22]:
y_pred_val = model.predict(X_val)
print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67        56
           1       0.94      1.00      0.97       449

    accuracy                           0.94       505
   macro avg       0.97      0.75      0.82       505
weighted avg       0.95      0.94      0.94       505



In [29]:
df_pump_features.corr().style.background_gradient(cmap="coolwarm")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,1.0,-0.638945,0.957114,-0.958539,0.116547,-0.091687,0.193252,-0.021999,0.414732,0.003664,0.497477,-0.415229,0.042585,0.265788,0.13088,0.320554,-0.013427,0.075573,0.230221,0.142691,0.21476,0.037755,0.121789,0.068895,0.121064,0.136875,0.259089,0.159239,0.312007
1,-0.638945,1.0,-0.645941,0.727758,-0.136641,0.022634,-0.2069,-0.062501,-0.322256,0.061168,-0.207731,0.339741,0.017533,-0.173723,-0.078588,-0.266705,0.009643,-0.114623,-0.325853,-0.223058,-0.348728,-0.069294,-0.28256,-0.170067,-0.332377,-0.035888,-0.13882,-0.054954,-0.263626
2,0.957114,-0.645941,1.0,-0.91182,0.116123,-0.135954,0.150726,-0.066083,0.384848,-0.053074,0.436908,-0.397902,0.089974,0.309723,0.17729,0.322108,0.002952,0.056235,0.213449,0.120233,0.198345,0.007524,0.096468,0.037435,0.101681,0.185815,0.301126,0.206065,0.316859
3,-0.958539,0.727758,-0.91182,1.0,-0.106304,0.113486,-0.180835,0.03009,-0.367226,-0.01431,-0.427836,0.366098,0.022257,-0.240559,-0.084699,-0.357907,0.039951,-0.043615,-0.26151,-0.136279,-0.266635,-0.008005,-0.159432,-0.066809,-0.191174,-0.08364,-0.218945,-0.107882,-0.32054
4,0.116547,-0.136641,0.116123,-0.106304,1.0,0.020802,0.116516,0.04991,0.433411,-0.387099,0.128281,-0.523375,0.380754,0.315388,0.446566,0.145191,-0.002241,0.332812,0.364753,0.368653,0.327006,0.240579,0.353258,0.299019,0.30994,0.368261,0.425138,0.407893,0.455627
5,-0.091687,0.022634,-0.135954,0.113486,0.020802,1.0,0.84741,0.970133,0.529349,0.48843,0.504984,-0.410888,-0.415303,-0.52207,-0.457535,-0.231072,-0.002384,0.643916,0.417441,0.66648,0.358754,0.732706,0.564234,0.740517,0.336611,-0.484792,-0.486463,-0.494869,-0.236182
6,0.193252,-0.2069,0.150726,-0.180835,0.116516,0.84741,1.0,0.923873,0.777651,0.498318,0.699452,-0.655921,-0.401881,-0.496601,-0.432906,-0.205536,0.003769,0.818566,0.590865,0.860982,0.508677,0.886716,0.717932,0.902454,0.44008,-0.44076,-0.421727,-0.449749,-0.18718
7,-0.021999,-0.062501,-0.066083,0.03009,0.04991,0.970133,0.923873,1.0,0.600188,0.517658,0.553884,-0.47618,-0.434412,-0.530896,-0.471157,-0.218964,-0.006427,0.702164,0.473102,0.734731,0.409845,0.789794,0.630003,0.806884,0.388243,-0.500393,-0.484158,-0.508967,-0.219027
8,0.414732,-0.322256,0.384848,-0.367226,0.433411,0.529349,0.777651,0.600188,1.0,0.092913,0.729305,-0.974689,-0.024829,-0.166193,-0.02541,-0.129132,0.102526,0.871265,0.667255,0.903954,0.565448,0.843974,0.717363,0.863379,0.455315,-0.059897,-0.051494,-0.053462,0.05254
9,0.003664,0.061168,-0.053074,-0.01431,-0.387099,0.48843,0.498318,0.517658,0.092913,1.0,0.516536,0.125366,-0.86346,-0.733731,-0.918311,-0.129354,-0.203113,0.190651,0.017773,0.182868,-0.029123,0.351456,0.116176,0.293489,-0.057938,-0.844254,-0.806946,-0.88451,-0.468307


In [31]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [33]:
X = df_pump_features
y = df_pump_target.values.ravel()

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=69
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size=0.05, 
    random_state=69
)

In [41]:
model = BalancedRandomForestClassifier(n_estimators=10)
model.fit(X_train, y_train)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X_val, y_val, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.764


In [42]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.33      0.82      0.47       275
           1       0.97      0.80      0.88      2248

    accuracy                           0.80      2523
   macro avg       0.65      0.81      0.68      2523
weighted avg       0.90      0.80      0.83      2523



In [43]:
y_pred_val = model.predict(X_val)
print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           0       0.32      0.80      0.46        56
           1       0.97      0.79      0.87       449

    accuracy                           0.79       505
   macro avg       0.65      0.80      0.66       505
weighted avg       0.90      0.79      0.82       505

