# Random Forest Classifier

## Configuration

In [1]:
import os

from ts_clf_event.data_handler.utils import split_data_time_based
from ts_clf_event.model.utils import display_cv_results
from ts_clf_event.model.model import ModelPipeline
from ts_clf_event.model.evaluator import Evaluator

## Time-based split: hold out a "gold" test set

In [2]:
# Location of the raw CSV. Set the TS_CLF_EVENT_DATA_PATH environment variable
# to run this notebook on another machine; the original path is kept as the
# fallback so the default behaviour is unchanged.
data_path = os.environ.get(
    "TS_CLF_EVENT_DATA_PATH",
    "/Users/georgebatsis/Documents/Projects/ts_clf_event/data/test_dataframe.csv",
)
# Despite the name this is a fraction, not a percentage (0.2 -> ~20% of rows held out).
test_size_percent = 0.2
# Name of the target column (classes 0 and 1 in the printed distributions).
label_col = "process"

# dev_df feeds cross-validation and the final fit; test_df is reserved for the
# final evaluation only.
dev_df, test_df = split_data_time_based(data_path, test_size_percent, label_col)

Number of data points in train set: 69549
Number of data points in test set: 17387
Class distribution in train set: process
0    66363
1     3186
Name: count, dtype: int64
Class distribution in test set: process
0    16334
1     1053
Name: count, dtype: int64
Time in train set: 2020-06-01 00:00:00 to 2020-06-25 12:03:55
Time in test set: 2020-06-25 12:04:56 to 2020-07-01 23:59:37


## Model pipeline

In [3]:
# Feature-engineering settings handed to ModelPipeline.
# NOTE(review): "auto" presumably lets the pipeline pick the window sizes itself
# (the training log prints the windows it used) — confirm in ModelPipeline.
windows = "auto"
features_to_roll = ["value", "level", "frequency", "speed"]
diff_lags = [1, 2]

# Separate features from the target. The target column comes from label_col
# (set in the split configuration cell) instead of a repeated "process"
# literal, so it cannot drift from the column used for the split.
x_train = dev_df.drop(columns=label_col)
y_train = dev_df[label_col]

model = ModelPipeline(
    windows=windows,
    features_to_roll=features_to_roll,
    diff_lags=diff_lags,
    features_to_diff=features_to_roll,  # difference the same columns that are rolled
    groupby_col="provider",
)

### Cross-validation (expanding-window, time-ordered folds)

In [4]:
# Cross-validate on the development set only; test_df is not passed in here.
# The per-fold class counts printed by this call show the training split
# growing from fold to fold while each validation split keeps the same size.
cv_results = model.cross_validate(x_train, y_train)

Train set: process
0    10975
1      619
Name: count, dtype: int64
Test set: process
0    10675
1      916
Name: count, dtype: int64
Train set: process
0    21650
1     1535
Name: count, dtype: int64
Test set: process
0    11243
1      348
Name: count, dtype: int64
Train set: process
0    32893
1     1883
Name: count, dtype: int64
Test set: process
0    10995
1      596
Name: count, dtype: int64
Train set: process
0    43888
1     2479
Name: count, dtype: int64
Test set: process
0    11314
1      277
Name: count, dtype: int64
Train set: process
0    55202
1     2756
Name: count, dtype: int64
Test set: process
0    11161
1      430
Name: count, dtype: int64


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.


Windows for 1: [60, 90, 120]
Windows for 2: [60, 90, 120]
Windows for 1: [60, 90, 120]
Windows for 2: [60, 90, 120]
Windows for 1: [60, 90, 120]
Windows for 2: [60, 90, 120]
Windows for 1: [60, 90, 120]
Windows for 2: [60, 90, 120]
Windows for 1: [60, 90, 120]
Windows for 2: [60, 90, 120]
[CV] END  f1: (test=0.980) f1_pos: (test=0.868) precision: (test=0.980) precision_pos: (test=0.946) recall: (test=0.981) recall_pos: (test=0.802) total time=   0.7s
[CV] END  f1: (test=0.999) f1_pos: (test=0.981) precision: (test=0.999) precision_pos: (test=1.000) recall: (test=0.999) recall_pos: (test=0.963) total time=   1.2s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.6s remaining:    3.9s


[CV] END  f1: (test=0.987) f1_pos: (test=0.865) precision: (test=0.987) precision_pos: (test=0.963) recall: (test=0.987) recall_pos: (test=0.785) total time=   1.6s
[CV] END  f1: (test=0.997) f1_pos: (test=0.928) precision: (test=0.997) precision_pos: (test=1.000) recall: (test=0.997) recall_pos: (test=0.866) total time=   2.1s
[CV] END  f1: (test=0.989) f1_pos: (test=0.851) precision: (test=0.990) precision_pos: (test=0.949) recall: (test=0.990) recall_pos: (test=0.772) total time=   2.6s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.0s finished


In [5]:
# Print a Mean / Std table for each cross-validation metric.
# The *_pos rows are presumably the positive-class variants — confirm in display_cv_results.
display_cv_results(cv_results)

Metric          Mean       Std       
-----------------------------------
precision       0.9906     0.0067    
recall          0.9908     0.0065    
f1              0.9904     0.0068    
precision_pos   0.9715     0.0240    
recall_pos      0.8378     0.0703    
f1_pos          0.8988     0.0489    


In [6]:
# Features and target for the held-out test frame. The target column is taken
# from label_col (set in the split configuration cell) rather than a repeated
# "process" literal, so it always matches the column used for the split.
x_test = test_df.drop(columns=label_col)
y_test = test_df[label_col]

### Final model training (on the full development set)

In [7]:
# Fit the pipeline on the whole development set (x_train / y_train come from dev_df).
model.train(x_train, y_train)

# Persist the fitted pipeline under the name "RF_model" so it can be reloaded for scoring.
# NOTE(review): where and in which format save_model writes is not visible here — confirm.
model.save_model("RF_model")

Windows for 1: [60, 90, 120]
Windows for 2: [60, 90, 120]


In [8]:
# Reload the persisted pipeline before scoring the held-out test set.
# NOTE(review): the return value of load_model is discarded, so this presumably
# restores state on `model` in place — confirm in ModelPipeline.
model.load_model("RF_model")
# Scores for the test set; the evaluation cell indexes this result as a 2-D
# array and reads column 1.
y_pred_prob = model.predict_proba(x_test)

In [9]:
# Report metrics on the held-out test set.
# Column 1 of y_pred_prob is passed as the score — presumably the positive
# class (label 1) probability; confirm against the pipeline's class order.
# threshold=0.5 is presumably the cut-off used for the confusion matrix and
# the other thresholded metrics — confirm in Evaluator.report_metrics.
test_res = Evaluator().report_metrics(y_test, y_pred_prob[:, 1], threshold=0.5)

Confusion Matrix:
                  Predicted Negative  Predicted Positive
Actual Negative               16268                  66
Actual Positive                 152                 901
Macro-averaged Metrics:
  Precision (Macro):   0.9612453347810259
  Recall (Macro):   0.9258049354576027
  F1-Score (Macro):   0.9427117661390613
 Positive Class Metrics:
  Precision ( Positive):   0.9317476732161324
  Recall ( Positive):   0.855650522317189
  F1-Score ( Positive):   0.8920792079207921
Imbalance-Aware Metrics:
  Matthews Correlation Coefficient (MCC):   0.8863420107533099
  Balanced Accuracy:   0.9258049354576027
  Average Precision (AP):   0.9728225005649416
  Area Under ROC Curve (AUROC):   0.9980681932745114
  Recall@Precision=0.5:   1.0
