In [1]:
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split
import warnings
# Silence every warning category for the rest of the session to keep cell
# output clean. NOTE(review): this also hides library warnings (e.g. deprecation
# or convergence messages) that may matter when debugging a model.
warnings.filterwarnings('ignore')
import pandas as pd

# Take the parent of the current working directory as the project root and
# register it on the import path (once), ahead of the `src.*` imports below.
# NOTE(review): assumes the notebook is launched from a direct sub-folder of
# the project root — confirm.
ROOT = Path().resolve().parents[0]
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.paths import PROJECT_ROOT
from src.data_loader import load_processed_data
from src.config import load_config
from src.preprocessing import build_preprocessor
from src.pipeline_builder import build_pipeline
from src.train_and_predict import train_and_predict
from src.models import build_model
from src.evaluation import evaluate_classifier, print_results
from src.thresholding import sweep_thresholds, apply_threshold
from src.artifacts_io import save_artifacts, save_split

# Read the project configuration, then the processed dataset
cfg = load_config()
df = load_processed_data()

# Resolve the output locations (saved split, model artifacts) against the
# project root using the relative paths declared in the config
paths_cfg = cfg['paths']
SPLIT_DIR = PROJECT_ROOT / paths_cfg['split_dir']
ARTIFACTS_DIR = PROJECT_ROOT / paths_cfg['artifacts_dir']

In [2]:
# Prepare train and test data
# Split settings are named once here instead of being buried in the call.
# The metadata written by `save_split` in the next cell records the same
# values and must be kept in sync with these constants.
SPLIT_RANDOM_STATE = 31
SPLIT_TEST_SIZE = 0.2
TARGET_COLUMN = 'Machine failure'

# Feature columns come from the config: numerical first, then categorical
x_columns = cfg['features']['numerical'] + cfg['features']['categorical']

# Fail early with a readable message if the config and the data disagree
missing_columns = [col for col in [*x_columns, TARGET_COLUMN] if col not in df.columns]
assert not missing_columns, f"Columns missing from processed data: {missing_columns}"

X = df[x_columns]
y = df[TARGET_COLUMN]

# Stratify on the target so both splits keep the same failure rate
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SPLIT_RANDOM_STATE,
    stratify=y,
    test_size=SPLIT_TEST_SIZE,
)

# Sanity checks: the split must partition the data and keep X/y aligned
assert len(X_train) + len(X_test) == len(X)
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

In [3]:
# Persist the held-out split: the test-row index labels, the matching targets,
# and the settings that produced the split. The metadata values mirror the
# arguments passed to `train_test_split` above and must be kept in sync.
split_meta = {
    "random_state": 31,
    "test_size": 0.2,
    "stratified": True,
}
test_idx = X_test.index.values
save_split(
    SPLIT_DIR,
    test_idx=test_idx,
    y_test=y_test.values,
    meta=split_meta,
)

# 1) Baseline model

In [4]:
# Build baseline logistic regression model (no class weighting).
# NOTE: `preprocessor`, `model`, `pipeline`, `preds` and `proba` are re-bound by
# the balanced-model cell further down. The threshold sweep, tuned evaluation
# and artifact saving for this model read `pipeline`/`proba`, so they must run
# before that cell (i.e. run the notebook top to bottom).
# 1) Build the preprocessor. 'scale_numeric' flag required for logreg model only
preprocessor = build_preprocessor(cfg, scale_numeric=True)
# 2) Build model (without 'class_weight' parameter first)
model = build_model('logreg', random_state=0, max_iter=2000)
# 3) Build pipeline
pipeline = build_pipeline(preprocessor, model)
# 4) Train on the training split and predict on the test split.
#    `preds` are the pipeline's own labels (the comparison table lists them
#    under threshold 0.5 — presumably the default cut-off; confirm);
#    `proba` is reused below for the threshold sweep.
preds, proba = train_and_predict(pipeline, X_train, y_train, X_test)
# 5) Evaluate the model and print results
lr_default_results = evaluate_classifier(y_test, preds, proba)
print_results(lr_default_results)

tn                   | 1930
fp                   | 2
fn                   | 59
tp                   | 9
accuracy             | 0.9695
precision            | 0.8182
recall               | 0.1324
f1_score             | 0.2278
roc_auc              | 0.8889
pr_auc               | 0.4556


## Evaluate the model and find optimal threshold

In [5]:
# Sweep decision thresholds for the unweighted model, then show the candidates
# that keep recall at or above 0.70, best precision first (top 10).
# NOTE(review): the sweep is computed on the test split (`y_test`, `proba`), so
# metrics at a threshold picked from this table are optimistic — consider
# selecting the threshold on a separate validation split.
lr_default_thresholds = sweep_thresholds(y_test, proba)
(
    lr_default_thresholds
    .loc[lr_default_thresholds["recall"] >= 0.70]
    .sort_values("precision", ascending=False)
    .head(10)
)

Unnamed: 0,threshold,precision,recall,f1,tp,fp,fn,tn
4,0.04,0.139276,0.735294,0.234192,50,309,18,1623
3,0.03,0.12,0.75,0.206897,51,374,17,1558
2,0.02,0.105647,0.852941,0.188006,58,491,10,1441
1,0.01,0.076549,0.926471,0.141414,63,760,5,1172
0,0.0,0.034,1.0,0.065764,68,1932,0,0


In [6]:
# Re-label the test predictions of the unweighted model at the hand-picked
# threshold 0.02 (in the sweep above: recall ≈ 0.85, precision ≈ 0.11) and
# recompute the metrics. `proba` is unchanged, so roc_auc / pr_auc are the same
# as for the un-tuned model; only the threshold-dependent metrics move.
# The literal 0.02 is repeated in the save-artifacts cell and in the comparison
# table — keep all three in sync.
lr_d_tuned_preds = apply_threshold(proba, 0.02)
lr_d_tuned_results = evaluate_classifier(y_test, lr_d_tuned_preds, proba)
print_results(lr_d_tuned_results)

tn                   | 1441
fp                   | 491
fn                   | 10
tp                   | 58
accuracy             | 0.7495
precision            | 0.1056
recall               | 0.8529
f1_score             | 0.1880
roc_auc              | 0.8889
pr_auc               | 0.4556


## Save model artifacts

In [7]:
# Persist everything needed to reload the tuned unweighted model: the fitted
# pipeline, its decision threshold, the tuned metrics, the test probabilities
# and the full threshold sweep.
# NOTE: `pipeline` and `proba` must still refer to the unweighted model here —
# run this before the balanced-model cell re-binds them.
lr_d_model_dir = ARTIFACTS_DIR / "lr_default_tuned"

lr_d_artifacts = {
    "model_dir": lr_d_model_dir,
    "model_name": "lr_default_tuned",
    "threshold": 0.02,
    "pipeline": pipeline,
    "classifier_results": lr_d_tuned_results,
    "proba_test": proba,
    "threshold_sweep": lr_default_thresholds,
}
save_artifacts(**lr_d_artifacts)

# 2) Baseline model (balanced class weight)

In [8]:
# Build class balanced baseline logistic regression model.
# NOTE: this cell re-binds `preprocessor`, `model`, `pipeline`, `preds` and
# `proba`, replacing the objects of the unweighted model above. From here on,
# the sweep / tuned-evaluation / save cells operate on the balanced model.
# 1) Build the preprocessor. 'scale_numeric' flag required for logreg model only
preprocessor = build_preprocessor(cfg, scale_numeric=True)
# 2) Build model (with 'class_weight')
model = build_model('logreg', class_weight='balanced',  random_state=0, max_iter=2000)
# 3) Build pipeline
pipeline = build_pipeline(preprocessor, model)
# 4) Train on the training split and predict on the test split.
#    `preds` are the pipeline's own labels (listed under threshold 0.5 in the
#    comparison table — presumably the default cut-off; confirm).
preds, proba = train_and_predict(pipeline, X_train, y_train, X_test)
# 5) Evaluate the model and print results
lr_balanced_results = evaluate_classifier(y_test, preds, proba)
print_results(lr_balanced_results)

tn                   | 1581
fp                   | 351
fn                   | 15
tp                   | 53
accuracy             | 0.8170
precision            | 0.1312
recall               | 0.7794
f1_score             | 0.2246
roc_auc              | 0.8894
pr_auc               | 0.3955


## Evaluate the model and find optimal threshold

In [9]:
# Sweep decision thresholds for the class-balanced model, then show the
# candidates that keep recall at or above 0.70, best precision first (top 10).
# NOTE(review): the sweep is computed on the test split (`y_test`, `proba`), so
# metrics at a threshold picked from this table are optimistic — consider
# selecting the threshold on a separate validation split.
lr_balanced_thresholds = sweep_thresholds(y_test, proba)
(
    lr_balanced_thresholds
    .loc[lr_balanced_thresholds["recall"] >= 0.70]
    .sort_values("precision", ascending=False)
    .head(10)
)

Unnamed: 0,threshold,precision,recall,f1,tp,fp,fn,tn
61,0.61,0.166667,0.705882,0.269663,48,240,20,1692
59,0.59,0.165049,0.75,0.270557,51,258,17,1674
60,0.6,0.162162,0.705882,0.263736,48,248,20,1684
58,0.58,0.157895,0.75,0.26087,51,272,17,1660
57,0.57,0.153153,0.75,0.254364,51,282,17,1650
56,0.56,0.14956,0.75,0.249389,51,290,17,1642
55,0.55,0.144068,0.75,0.241706,51,303,17,1629
54,0.54,0.140884,0.75,0.237209,51,311,17,1621
53,0.53,0.138965,0.75,0.234483,51,316,17,1616
52,0.52,0.133858,0.75,0.227171,51,330,17,1602


In [10]:
# Re-label the test predictions of the balanced model at the hand-picked
# threshold 0.58 (in the sweep above: recall 0.75, precision ≈ 0.158) and
# recompute the metrics. `proba` is unchanged, so roc_auc / pr_auc are the same
# as for the un-tuned balanced model.
# NOTE(review): the sweep lists threshold 0.59 with the same recall (0.75) and
# higher precision (≈ 0.165, 258 vs 272 false positives) — confirm that 0.58 is
# the intended operating point.
# The literal 0.58 is repeated in the save-artifacts cell and in the comparison
# table — keep all three in sync.
lr_b_tuned_preds = apply_threshold(proba, 0.58)
lr_b_tuned_results = evaluate_classifier(y_test, lr_b_tuned_preds, proba)
print_results(lr_b_tuned_results)

tn                   | 1660
fp                   | 272
fn                   | 17
tp                   | 51
accuracy             | 0.8555
precision            | 0.1579
recall               | 0.7500
f1_score             | 0.2609
roc_auc              | 0.8894
pr_auc               | 0.3955


## Save model artifacts

In [11]:
# Persist everything needed to reload the tuned class-balanced model: the
# fitted pipeline, its decision threshold, the tuned metrics, the test
# probabilities and the full threshold sweep.
# NOTE: `pipeline` and `proba` must refer to the balanced model here, i.e. the
# balanced-model cell above has to be the most recent training cell executed.
lr_b_model_dir = ARTIFACTS_DIR / "lr_balanced_tuned"

lr_b_artifacts = {
    "model_dir": lr_b_model_dir,
    "model_name": "lr_balanced_tuned",
    "threshold": 0.58,
    "pipeline": pipeline,
    "classifier_results": lr_b_tuned_results,
    "proba_test": proba,
    "threshold_sweep": lr_balanced_thresholds,
}
save_artifacts(**lr_b_artifacts)

# Build comparison table and find best baseline model

In [12]:
# Assemble one row per evaluated variant: (display label, decision threshold,
# metrics dict). The thresholds repeat the values used when the tuned
# predictions were produced (0.02 and 0.58); 0.5 labels the un-tuned runs.
comparison_rows = [
    ("LogReg (default)", 0.5, lr_default_results),
    ("LogReg (tuned)", 0.02, lr_d_tuned_results),
    ("LogReg balanced (default)", 0.5, lr_balanced_results),
    ("LogReg balanced (tuned)", 0.58, lr_b_tuned_results),
]

# Columns come out as: model, threshold, then every key of the metrics dict
comparison = pd.DataFrame(
    [
        {"model": label, "threshold": threshold, **metrics}
        for label, threshold, metrics in comparison_rows
    ]
)

In [13]:
# Display the comparison table built in the previous cell
comparison

Unnamed: 0,model,threshold,tn,fp,fn,tp,accuracy,precision,recall,f1_score,roc_auc,pr_auc
0,LogReg (default),0.5,1930,2,59,9,0.9695,0.818182,0.132353,0.227848,0.888922,0.455572
1,LogReg (tuned),0.02,1441,491,10,58,0.7495,0.105647,0.852941,0.188006,0.888922,0.455572
2,LogReg balanced (default),0.5,1581,351,15,53,0.817,0.131188,0.779412,0.224576,0.889394,0.395515
3,LogReg balanced (tuned),0.58,1660,272,17,51,0.8555,0.157895,0.75,0.26087,0.889394,0.395515


# ⭐️ Interpretation and Key Takeaways on Logistic Regression Models
A decision threshold of **0.58** was selected for the Logistic Regression model with **class weighting (`class_weight='balanced'`)**, based on the predictive maintenance objective of prioritizing failure detection while reducing unnecessary alerts.

At this threshold:
- approximately **75% of real failures are detected** (high recall),
- precision is around **16%**, meaning that roughly one out of six alerts corresponds to an actual failure.

This threshold represents a more efficient trade-off compared to the unweighted baseline: a high level of failure coverage is maintained while substantially reducing the number of false alerts. The selected operating point demonstrates that class weighting enables a more favorable balance between recall and precision under severe class imbalance.

## Comparison of Threshold Strategies

Two threshold strategies were evaluated for Logistic Regression models under the same predictive maintenance objective of prioritizing failure detection.

## Unweighted Baseline (Threshold = 0.02)

- **Recall**: ~85%  
- **Precision**: ~10%  
- **Failure detection policy**: Highly aggressive  
- **Alert volume**: High (large number of false positives)

This strategy maximizes failure coverage but generates a substantial number of false alerts. It confirms the presence of a predictive signal but results in a noisy alerting system.

---

## Balanced Baseline (Threshold = 0.58)

- **Recall**: ~75%  
- **Precision**: ~16%  
- **Failure detection policy**: Balanced  
- **Alert volume**: Significantly reduced compared to the unweighted baseline

This strategy maintains high failure detection while substantially reducing false alerts, resulting in a more efficient and operationally realistic alerting policy.

---

## Final Baseline Selection

The **class-balanced Logistic Regression model with threshold = 0.58** is selected as the stronger baseline.

Although the unweighted model achieves higher recall (~85% vs. ~75%), the balanced model provides a superior precision–recall trade-off, detecting the majority of failures while cutting the number of false alerts by roughly 45% (from 491 to 272). This makes it a more suitable reference point for evaluating more complex models.

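This baseline establishes a realistic and defensible operating policy for subsequent comparison with tree-based models. Note that both thresholds were selected on the same test split that is used to report the metrics, so the figures above are optimistic estimates; selecting thresholds on a separate validation split would give an unbiased comparison.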