In [34]:
import pandas as pd
from numpy import mean

from sklearn.model_selection import train_test_split


from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report

import joblib
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve, average_precision_score, auc
import time

## Load Features and Targets

In [49]:
FF=pd.read_csv('../Features/df_fan_feature.csv')
FP=pd.read_csv('../Features/df_pump_feature.csv')
FS=pd.read_csv('../Features/df_slider_feature.csv')
FV=pd.read_csv('../Features/df_valve_feature.csv')
TF=pd.read_csv('../Features/df_fan_target.csv')
TP=pd.read_csv('../Features/df_pump_target.csv')
TS=pd.read_csv('../Features/df_slider_target.csv')
TV=pd.read_csv('../Features/df_valve_target.csv')
Fall=pd.read_csv('../Features/df_all_features.csv')
Tall=pd.read_csv('../Features/df_all_targets.csv')
Machine=pd.read_csv('../Features/mach_type.csv')


## Dataset splitting

In [19]:
#Slit Dataset in 3. Test (70%) , Train ( 2%) and Validation set (10%)
def split(F, T):
    X_train , X_test, y_train, y_test  = train_test_split( F, T.values.ravel(), random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.33, random_state=42)
    return X_train, y_train, X_test, y_test, X_val, y_val

#For all
X_train_all, y_train_all, X_test_all, y_test_all, X_val_all, y_val_all=split(Fall, Tall)
#For FAN
X_train_fan, y_train_fan, X_test_fan, y_test_fan, X_val_fan, y_val_fan=split(FF, TF)
#For VALVE
X_train_valve, y_train_valve, X_test_valve, y_test_valve, X_val_valve, y_val_valve=split(FV, TV)
#For PUMP
X_train_pump, y_train_pump, X_test_pump, y_test_pump, X_val_pump, y_val_pump=split(FP, TP)
#For SLIDER
X_train_slider, y_train_slider, X_test_slider, y_test_slider, X_val_slider, y_val_slider=split(FS, TS)


## Models

In [54]:
def Balanced_RF(a, b, c, d, e, f): # a, b= Training sets / c,d = testing sets / e, f = validation sets
    start = time.process_time()
    model = BalancedRandomForestClassifier(n_estimators=200)
    #X_train, y_train
    model.fit(a, b)
    #evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    #model evaluation
    scores= cross_val_score(model, c, d, scoring='roc_auc', cv=cv, n_jobs=-1)
    print(f'Random Forest Model\'s accuracy on test set is {100*model.score(X_test_all, y_test_all):.2f}%')
    print('Mean ROC AUC: %.3f' % mean(scores))
    #first report
    y_pred = model.predict(c)
    print('Report  Test / Test Predict')
    print(classification_report(d, y_pred))
     #second report
    y_pred_val = model.predict(e)
    print('Report  Validation / Test Validation')
    print(classification_report(f, y_pred_val))
    print('Processing time:',time.process_time() - start)
    return model

### ALL
#### 0 = Abnormal / 1 = Normal 

In [39]:
model_all=Balanced_RF(X_train_all, y_train_all, X_test_all, y_test_all, X_val_all, y_val_all)

Random Forest Model's accuracy on test set is 89.97%
Mean ROC AUC: 0.923
Report  Test / Test Predict
              precision    recall  f1-score   support

 Abnormal(0)       0.68      0.86      0.76      1655
   Normal(1)       0.97      0.91      0.94      7400

    accuracy                           0.90      9055
   macro avg       0.82      0.88      0.85      9055
weighted avg       0.91      0.90      0.90      9055

Report  Validation / Test Validation
              precision    recall  f1-score   support

           0       0.69      0.84      0.76       836
           1       0.96      0.91      0.94      3624

    accuracy                           0.90      4460
   macro avg       0.83      0.88      0.85      4460
weighted avg       0.91      0.90      0.90      4460

Processing time: 26.96875


### FAN

In [55]:
model_fan=Balanced_RF(X_train_fan, y_train_fan, X_test_fan, y_test_fan, X_val_fan, y_val_fan)

Random Forest Model's accuracy on test set is 48.60%
Mean ROC AUC: 0.944
Report  Test / Test Predict
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       734
           1       0.95      0.92      0.94      2055

    accuracy                           0.91      2789
   macro avg       0.88      0.90      0.89      2789
weighted avg       0.91      0.91      0.91      2789

Report  Validation / Test Validation
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       369
           1       0.96      0.92      0.94      1005

    accuracy                           0.91      1374
   macro avg       0.88      0.91      0.89      1374
weighted avg       0.92      0.91      0.91      1374

Processing time: 10.1875


### VALVE

In [63]:
model_valve=Balanced_RF(X_train_valve, y_train_valve, X_test_valve, y_test_valve, X_val_valve, y_val_valve)


Random Forest Model's accuracy on test set is 36.63%
Mean ROC AUC: 0.901
Report  Test / Test Predict
              precision    recall  f1-score   support

           0       0.55      0.82      0.66       246
           1       0.97      0.91      0.94      1849

    accuracy                           0.90      2095
   macro avg       0.76      0.87      0.80      2095
weighted avg       0.92      0.90      0.91      2095

Report  Validation / Test Validation
              precision    recall  f1-score   support

           0       0.62      0.76      0.68       131
           1       0.96      0.93      0.95       902

    accuracy                           0.91      1033
   macro avg       0.79      0.85      0.82      1033
weighted avg       0.92      0.91      0.91      1033

Processing time: 4.375


### PUMP

In [64]:
model_pump=Balanced_RF(X_train_pump, y_train_pump, X_test_pump, y_test_pump, X_val_pump, y_val_pump)



Random Forest Model's accuracy on test set is 68.56%
Mean ROC AUC: 0.888
Report  Test / Test Predict
              precision    recall  f1-score   support

           0       0.44      0.80      0.57       248
           1       0.97      0.87      0.92      1865

    accuracy                           0.86      2113
   macro avg       0.71      0.83      0.74      2113
weighted avg       0.91      0.86      0.87      2113

Report  Validation / Test Validation
              precision    recall  f1-score   support

           0       0.47      0.80      0.59       123
           1       0.97      0.88      0.92       918

    accuracy                           0.87      1041
   macro avg       0.72      0.84      0.76      1041
weighted avg       0.91      0.87      0.88      1041

Processing time: 5.625


### SLIDER

In [65]:
model_slider=Balanced_RF(X_train_slider, y_train_slider, X_test_slider, y_test_slider, X_val_slider, y_val_slider)

Random Forest Model's accuracy on test set is 59.37%
Mean ROC AUC: 0.968
Report  Test / Test Predict
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       469
           1       0.97      0.94      0.96      1588

    accuracy                           0.93      2057
   macro avg       0.90      0.92      0.91      2057
weighted avg       0.94      0.93      0.94      2057

Report  Validation / Test Validation
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       219
           1       0.97      0.93      0.95       795

    accuracy                           0.93      1014
   macro avg       0.88      0.92      0.90      1014
weighted avg       0.93      0.93      0.93      1014

Processing time: 7.15625


## Machine type pred

In [56]:
X_train_m, y_train_m, X_test_m, y_test_m, X_val_m, y_val_m=split(Fall, Machine)

In [61]:
def normal_RF(a, b, c, d, e, f):
    start = time.process_time()
    model = RandomForestClassifier(n_estimators = 50, criterion ='entropy', warm_start = True, max_features = 'sqrt', oob_score = 'True', random_state=42)
    #X_train, y_train
    model.fit(a, b)
    print(f'Random Forest Model\'s accuracy on training set is {100*model.score(a, b):.2f}%')
    print(f'Random Forest Model\'s accuracy on test set is {100*model.score(c, d):.2f}%')

    #first report
    y_pred = model.predict(c)
    print('Report  Test / Test Predict')
    print(classification_report(d, y_pred))
     #second report
    y_pred_val = model.predict(e)
    print('Report  Validation / Test Validation')
    print(classification_report(f, y_pred_val))
    print('Processing time:',time.process_time() - start)
    return model

In [62]:
model_machine=normal_RF(X_train_m, y_train_m, X_test_m, y_test_m, X_val_m, y_val_m)

Random Forest Model's accuracy on training set is 100.00%
Random Forest Model's accuracy on test set is 94.30%
Report  Test / Test Predict
              precision    recall  f1-score   support

           1       0.94      0.93      0.94      2806
           2       0.90      0.91      0.90      2106
           3       0.94      0.94      0.94      2112
           4       1.00      1.00      1.00      2031

    accuracy                           0.94      9055
   macro avg       0.94      0.94      0.94      9055
weighted avg       0.94      0.94      0.94      9055

Report  Validation / Test Validation
              precision    recall  f1-score   support

           1       0.94      0.93      0.93      1402
           2       0.90      0.91      0.91      1017
           3       0.95      0.94      0.94      1023
           4       1.00      1.00      1.00      1018

    accuracy                           0.94      4460
   macro avg       0.94      0.95      0.95      4460
weighted 

## Model export

In [67]:
joblib.dump(model_all, filename='model_all', compress=3)
joblib.dump(model_fan, filename='model_fan', compress=3)
joblib.dump(model_valve, filename='model_valve', compress=3)
joblib.dump(model_pump, filename='model_pump', compress=3)
joblib.dump(model_slider, filename='model_slider', compress=3)
joblib.dump(model_machine, filename='model_machine', compress=3)

['model_machine']