In [1]:
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

ROOT = Path().resolve().parents[0]
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.paths import PROJECT_ROOT
from src.data_loader import load_processed_data
from src.config import load_config
from src.preprocessing import build_preprocessor
from src.pipeline_builder import build_pipeline
from src.train_and_predict import train_and_predict
from src.models import build_model
from src.evaluation import evaluate_classifier, print_results
from src.thresholding import sweep_thresholds, apply_threshold
from src.artifacts_io import save_artifacts

# Load config and load processed parquet data
cfg = load_config()
df = load_processed_data()

# Define model directory path
ARTIFACTS_DIR = PROJECT_ROOT / cfg['paths']['artifacts_dir']
MODEL_DIR = ARTIFACTS_DIR / "rf_tuned"

In [2]:
# Prepare train and test data
x_columns = cfg['features']['numerical'] + cfg['features']['categorical']
X = df[x_columns]
y = df['Machine failure']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=31, stratify=y, test_size=0.2)

# 1) Baseline Random Forest Model (default)

In [3]:
# Build baseline random forest model
# 1) Preprocess the data.
preprocessor = build_preprocessor(cfg, scale_numeric=False)
# 2) Build model
model = build_model('rf', random_state=0)
# 3) Build pipeline
pipeline = build_pipeline(preprocessor, model)
# 4) Train and predict
preds, proba = train_and_predict(pipeline, X_train, y_train, X_test)
# 5) Evaluate the model and print results
rf_default_results = evaluate_classifier(y_test, preds, proba)
print_results(rf_default_results)

tn                   | 1930
fp                   | 2
fn                   | 33
tp                   | 35
accuracy             | 0.9825
precision            | 0.9459
recall               | 0.5147
f1_score             | 0.6667
roc_auc              | 0.9616
pr_auc               | 0.7970


## Evaluate the model and find optimal threshold

In [4]:
# Evaluate model on different thresholds
rf_default_thresholds = sweep_thresholds(y_test, proba)
rf_default_thresholds[rf_default_thresholds["recall"] >= 0.70].sort_values("precision", ascending=False).tail(10)

Unnamed: 0,threshold,precision,recall,f1,tp,fp,fn,tn
9,0.09,0.378882,0.897059,0.532751,61,100,7,1832
8,0.08,0.362573,0.911765,0.518828,62,109,6,1823
7,0.07,0.336957,0.911765,0.492063,62,122,6,1810
6,0.06,0.305419,0.911765,0.457565,62,141,6,1791
5,0.05,0.273128,0.911765,0.420339,62,165,6,1767
4,0.04,0.251969,0.941176,0.397516,64,190,4,1742
3,0.03,0.222222,0.941176,0.359551,64,224,4,1708
2,0.02,0.172414,0.955882,0.292135,65,312,3,1620
1,0.01,0.116071,0.955882,0.207006,65,495,3,1437
0,0.0,0.034,1.0,0.065764,68,1932,0,0


In [5]:
rf_d_tuned_preds = apply_threshold(proba, 0.06)
rf_d_tuned_results = evaluate_classifier(y_test, rf_d_tuned_preds, proba)
print_results(rf_d_tuned_results)

tn                   | 1791
fp                   | 141
fn                   | 6
tp                   | 62
accuracy             | 0.9265
precision            | 0.3054
recall               | 0.9118
f1_score             | 0.4576
roc_auc              | 0.9616
pr_auc               | 0.7970


## Save model artifacts

In [6]:
# Save model artifacts

save_artifacts(
    model_dir=MODEL_DIR,
    model_name="rf_default_tuned",
    threshold=0.06,
    pipeline=pipeline,
    classifier_results=rf_d_tuned_results,
    proba_test=proba,
    threshold_sweep=rf_default_thresholds)

# 2) Baseline Random Forest model (balanced class weight)

In [7]:
# Build class balanced baseline random forest model 
# 1) Preprocess the data.
preprocessor = build_preprocessor(cfg, scale_numeric=False)
# 2) Build model (with 'class_weight')
model = build_model('rf', class_weight='balanced',  random_state=0)
# 3) Build pipeline
pipeline = build_pipeline(preprocessor, model)
# 4) Train and predict
preds, proba = train_and_predict(pipeline, X_train, y_train, X_test)
# 5) Evaluate the model and print results
rf_balanced_results = evaluate_classifier(y_test, preds, proba)
print_results(rf_balanced_results)

tn                   | 1928
fp                   | 4
fn                   | 40
tp                   | 28
accuracy             | 0.9780
precision            | 0.8750
recall               | 0.4118
f1_score             | 0.5600
roc_auc              | 0.9510
pr_auc               | 0.7658


## Evaluate the model and find optimal threshold

In [8]:
# Evaluate model on different thresholds
rf_b_thresholds = sweep_thresholds(y_test, proba)
rf_b_thresholds[rf_b_thresholds["recall"] >= 0.70].sort_values("precision", ascending=False).tail(10)

Unnamed: 0,threshold,precision,recall,f1,tp,fp,fn,tn
9,0.09,0.377483,0.838235,0.520548,57,94,11,1838
8,0.08,0.361963,0.867647,0.510823,59,104,9,1828
7,0.07,0.329609,0.867647,0.477733,59,120,9,1812
6,0.06,0.317949,0.911765,0.471483,62,133,6,1799
5,0.05,0.285068,0.926471,0.435986,63,158,5,1774
4,0.04,0.252,0.926471,0.396226,63,187,5,1745
3,0.03,0.212838,0.926471,0.346154,63,233,5,1699
2,0.02,0.174033,0.926471,0.293023,63,299,5,1633
1,0.01,0.116576,0.941176,0.207455,64,485,4,1447
0,0.0,0.034,1.0,0.065764,68,1932,0,0


In [9]:
rf_b_tuned_preds = apply_threshold(proba, 0.06)
rf_b_tuned_results = evaluate_classifier(y_test, rf_b_tuned_preds, proba)
print_results(rf_b_tuned_results)

tn                   | 1799
fp                   | 133
fn                   | 6
tp                   | 62
accuracy             | 0.9305
precision            | 0.3179
recall               | 0.9118
f1_score             | 0.4715
roc_auc              | 0.9510
pr_auc               | 0.7658


# Build comparison table and find best baseline model

In [10]:
# Build comparison table
comparison = pd.DataFrame([
    {
        "model": "RandomForest default",
        "threshold": 0.5,
        **rf_default_results
    },
    {
        "model": "RandomForest default (tuned)",
        "threshold": 0.06,
        **rf_d_tuned_results
    },
    {
        "model": "RandomForest balanced",
        "threshold": 0.5,
        **rf_balanced_results   
    },
    {
       "model": "RandomForest balanced (tuned)",
        "threshold": 0.6,
        **rf_b_tuned_results
    }
])

In [11]:
# Build comparison table
comparison

Unnamed: 0,model,threshold,tn,fp,fn,tp,accuracy,precision,recall,f1_score,roc_auc,pr_auc
0,RandomForest default,0.5,1930,2,33,35,0.9825,0.945946,0.514706,0.666667,0.961553,0.796959
1,RandomForest default (tuned),0.06,1791,141,6,62,0.9265,0.305419,0.911765,0.457565,0.961553,0.796959
2,RandomForest balanced,0.5,1928,4,40,28,0.978,0.875,0.411765,0.56,0.951003,0.76583
3,RandomForest balanced (tuned),0.6,1799,133,6,62,0.9305,0.317949,0.911765,0.471483,0.951003,0.76583


## Random Forest Model — Candidate Assessment

The Random Forest model demonstrates a substantial improvement over all linear baselines and establishes itself as a strong candidate for predictive maintenance applications.

After decision threshold tuning, the model operates at an effective and well-balanced operating point, achieving:

- **High failure detection rate** (recall ≈ 91%), ensuring that the majority of machine failures are identified.
- **Significantly improved precision** (≈ 30%), reducing false alerts compared to baseline models.
- **Low number of false negatives**, which is critical in maintenance-driven decision-making.
- **Strong ranking capability**, as reflected by high ROC-AUC and PR-AUC scores, indicating reliable risk ordering across thresholds.

These results confirm that the model successfully captures non-linear patterns and feature interactions present in the data, while allowing explicit control over alerting behavior through decision threshold selection.

---

### Model Readiness Perspective

At this stage, the Random Forest model satisfies key technical requirements expected from a production-grade candidate:

- Decision threshold tuning aligns model behavior with business priorities.
- Precision–recall trade-offs are well understood and explicitly quantified.
- The alerting policy is stable and interpretable.
- The model substantially outperforms all linear baselines under identical evaluation criteria.

However, final model selection is deferred pending evaluation of more advanced ensemble methods.

---

### Next Steps

While Random Forest already provides strong performance, an additional **Gradient Boosting** model will be evaluated to determine whether further improvements can be achieved.

Gradient Boosting is expected to:
- iteratively focus on hard-to-classify failure cases,
- improve precision–recall trade-offs under severe class imbalance,
- potentially reduce false positives while maintaining high recall.

The Gradient Boosting model will be assessed using the same evaluation framework and decision thresholding strategy to enable a fair and consistent comparison.