# Random Forest Classifier

## Configuration

In [20]:
import os

from ts_clf_event.data_handler.utils import split_data_time_based
from ts_clf_event.model.utils import display_cv_results
from ts_clf_event.model.model import ModelPipeline
from ts_clf_event.model.evaluator import Evaluator

## Time-based split: create a held-out "gold" test set

In [21]:
# Location of the input CSV. The default is the original author's local path;
# set the TS_CLF_EVENT_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
data_path = os.environ.get(
    "TS_CLF_EVENT_DATA_PATH",
    "/Users/georgebatsis/Documents/Projects/ts_clf_event/data/test_dataframe.csv",
)
# Fraction of rows reserved for the test split (the logged time ranges below
# show that the test rows are the most recent ones).
test_size_percent = 0.2
# Name of the binary (0/1) target column.
label_col = "process"

# dev_df is used for cross-validation and final training; test_df is kept
# aside for the final evaluation only.
dev_df, test_df = split_data_time_based(data_path, test_size_percent, label_col)

Number of data points in train set: 69549
Number of data points in test set: 17387
Class distribution in train set: process
0    66363
1     3186
Name: count, dtype: int64
Class distribution in test set: process
0    16334
1     1053
Name: count, dtype: int64
Time in train set: 2020-06-01 00:00:00 to 2020-06-25 12:03:55
Time in test set: 2020-06-25 12:04:56 to 2020-07-01 23:59:37


## Model pipeline

In [22]:
# Feature-engineering settings handed to ModelPipeline.
# NOTE(review): the unit of the window sizes and lags (rows vs. time) is
# defined inside ModelPipeline — confirm there.
windows = [30, 40, 60]
features_to_roll = ["value", "level", "frequency", "speed"]
diff_lags = [1, 2]

# Separate predictors from the target. Reuse the `label_col` configured for
# the time-based split instead of repeating the column name as a literal, so
# the target column is defined in exactly one place.
x_train = dev_df.drop(columns=label_col)
y_train = dev_df[label_col]

# The same feature list is used for both the rolling-window features and the
# lagged differences.
# NOTE(review): groupby_col="provider" presumably makes the pipeline compute
# these features per provider — confirm in ModelPipeline.
model = ModelPipeline(
    windows=windows,
    features_to_roll=features_to_roll,
    diff_lags=diff_lags,
    features_to_diff=features_to_roll,
    groupby_col="provider",
)

### K-Fold cross validation (expanding-window, time-ordered folds)

In [23]:
# Cross-validate on the development split only; the test split is not used here.
# The fold logs printed below show five folds whose training set grows from
# one fold to the next.
cv_results = model.cross_validate(x_train, y_train)

Train set: process
0    10975
1      619
Name: count, dtype: int64
Test set: process
0    10675
1      916
Name: count, dtype: int64
Train set: process
0    21650
1     1535
Name: count, dtype: int64
Test set: process
0    11243
1      348
Name: count, dtype: int64
Train set: process
0    32893
1     1883
Name: count, dtype: int64
Test set: process
0    10995
1      596
Name: count, dtype: int64
Train set: process
0    43888
1     2479
Name: count, dtype: int64
Test set: process
0    11314
1      277
Name: count, dtype: int64
Train set: process
0    55202
1     2756
Name: count, dtype: int64
Test set: process
0    11161
1      430
Name: count, dtype: int64


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


[CV] END  f1: (test=0.976) f1_pos: (test=0.837) precision: (test=0.977) precision_pos: (test=0.964) recall: (test=0.977) recall_pos: (test=0.739) total time=   0.6s
[CV] END  f1: (test=0.997) f1_pos: (test=0.958) precision: (test=0.997) precision_pos: (test=0.939) recall: (test=0.997) recall_pos: (test=0.977) total time=   1.1s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.9s remaining:    2.8s


[CV] END  f1: (test=0.991) f1_pos: (test=0.913) precision: (test=0.992) precision_pos: (test=0.994) recall: (test=0.992) recall_pos: (test=0.844) total time=   1.5s
[CV] END  f1: (test=0.994) f1_pos: (test=0.912) precision: (test=0.994) precision_pos: (test=0.943) recall: (test=0.994) recall_pos: (test=0.884) total time=   2.4s
[CV] END  f1: (test=1.000) f1_pos: (test=0.998) precision: (test=1.000) precision_pos: (test=1.000) recall: (test=1.000) recall_pos: (test=0.996) total time=   2.0s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished


In [24]:
# Summarise the cross-validation scores as mean and standard deviation per metric.
display_cv_results(cv_results)

Metric          Mean       Std       
-----------------------------------
precision       0.9919     0.0080    
recall          0.9920     0.0079    
f1              0.9916     0.0084    
precision_pos   0.9681     0.0252    
recall_pos      0.8880     0.0936    
f1_pos          0.9236     0.0538    


In [25]:
# Separate predictors from the target for the held-out test split, reusing the
# `label_col` configured for the time-based split rather than a repeated literal.
x_test = test_df.drop(columns=label_col)
y_test = test_df[label_col]

### Final model training (using the full development set)

In [26]:
# Fit the pipeline on the whole development split (x_train / y_train); the
# test split stays held out for the evaluation cells.
model.train(x_train, y_train)

# Persist the fitted model under the name "RF_model" so it can be reloaded later.
model.save_model("RF_model")

In [27]:
# Reload the model saved as "RF_model" before scoring the test split.
# NOTE(review): the return value of load_model is discarded, so it presumably
# restores the saved state onto `model` in place — confirm in ModelPipeline.
model.load_model("RF_model")
# Predicted class probabilities for the held-out test features; column 1 is
# what the evaluation cell consumes.
y_pred_prob = model.predict_proba(x_test)

In [28]:
# Score the held-out test split at a 0.5 decision threshold.
# Column index 1 of the probability matrix is passed as the score
# (presumably the probability of the positive class, label 1 — confirm).
positive_class_prob = y_pred_prob[:, 1]
evaluator = Evaluator()
test_res = evaluator.report_metrics(y_test, positive_class_prob, threshold=0.5)

Confusion Matrix:
                  Predicted Negative  Predicted Positive
Actual Negative               16272                  62
Actual Positive                 129                 924
Macro-averaged Metrics:
  Precision (Macro):   0.9646271506969162
  Recall (Macro):   0.9368485570273253
  F1-Score (Macro):   0.9502459486178073
 Positive Class Metrics:
  Precision ( Positive):   0.9371196754563894
  Recall ( Positive):   0.8774928774928775
  F1-Score ( Positive):   0.9063266307013241
Imbalance-Aware Metrics:
  Matthews Correlation Coefficient (MCC):   0.9010476132539621
  Balanced Accuracy:   0.9368485570273253
  Average Precision (AP):   0.9829368893435938
  Area Under ROC Curve (AUROC):   0.9988904749628802
  Recall@Precision=0.5:   1.0
