# Modelisation

In [13]:
import joblib
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from src.transform import transform_maintenance_data  # Your transformation function
import os
from typing import Tuple, Any, Dict
import re

In [14]:
# Prepare Data Function
def prepare_data(data_path: str, target: str, test_size: float = 0.2, random_state: int = 42,
                 split_filepath: str = None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Transforms the data, then either splits it or loads a pre-saved train-test split.
    If split_filepath is provided, it will attempt to load the split from there.
    """
    df = pd.read_csv(data_path, index_col="UDI")
    df = df.drop(columns=["Product ID", "HDF","TWF","PWF","OSF","RNF"])
    # Apply transformations to the dataset
    df = transform_maintenance_data(df=df, speed_column="Rotational speed [rpm]",
                                    torque_column="Torque [Nm]", col1="Process temperature [K]",
                                    col2="Air temperature [K]", result_col="temp_diff [K]")
    df=pd.get_dummies(df)
    df = df.rename(columns={
    'Air temperature [K]': 'Air temperature',
    'Process temperature [K]': 'Process temperature',
    'Rotational speed [rpm]': 'Rotational speed',
    'Torque [Nm]': 'Torque',
    'Tool wear [min]': 'Tool wear',
    'mechanical_power [W]': 'Mechanical power',
    'temp_diff [K]': 'Temp diff'
    })
    print(df)
    if split_filepath and os.path.exists(split_filepath):
        # Load pre-saved train-test split
        X_train, X_test, y_train, y_test = joblib.load(split_filepath)
    else:
        # Perform train-test split
        X = df.drop(columns=[target])
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
        
        # Optionally save the split
        if split_filepath:
            joblib.dump((X_train, X_test, y_train, y_test), split_filepath)
    
    return X_train, X_test, y_train, y_test

# Model fitting function with HalvingGridSearchCV
def fit_model(X_train: pd.DataFrame, y_train: pd.Series) -> Any:
    """
    Trains the model using HalvingGridSearchCV and returns the best model.
    """
    lgbm = LGBMClassifier(force_col_wise=True, verbose=-1, class_weight='balanced')

    param_grid = {
        'num_leaves': [31, 50],
        'max_depth': [10, 15],
        'learning_rate': [0.001, 0.01, 0.05, 0.1],
        'n_estimators': [50,100],
        'subsample': [0.8, 0.9, 1.0],
        'reg_alpha': [0.0, 0.1, 0.5, 1.0],
        'reg_lambda': [0.0, 0.1, 0.5, 1.0]
    }

    halving_cv = HalvingGridSearchCV(
        estimator=lgbm,
        param_grid=param_grid,
        factor=3,
        random_state=42,
        scoring='f1_macro',
        cv=StratifiedKFold(n_splits=3),
        verbose=0
    )
    print(X_train)
    halving_cv.fit(X_train, y_train)
    
    return halving_cv.best_estimator_

# Model evaluation function
def evaluate_model(model: Any, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, Any]:
    """
    Evaluates the model and returns a dictionary of metrics.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    metrics = {
        'roc_auc': roc_auc_score(y_test, y_proba),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred)
    }
    
    return metrics

# Function to save the model and the train-test split
def save_model_and_split(model: Any, model_filepath: str, X_train, X_test, y_train, y_test, split_filepath: str) -> None:
    """
    Saves the trained model and the train-test split.
    """
    joblib.dump(model, model_filepath)
    joblib.dump((X_train, X_test, y_train, y_test), split_filepath)


In [15]:
# Set file paths
data_path = 'data/data.csv'
model_filepath = "models/best_model.pkl"
split_filepath = "models/train_test_split.pkl"

# Prepare data
X_train, X_test, y_train, y_test = prepare_data(data_path=data_path,target="Machine failure", test_size=0.2, random_state=42, split_filepath=split_filepath)


       Air temperature  Process temperature  Rotational speed  Torque  \
UDI                                                                     
1                298.1                308.6              1551    42.8   
2                298.2                308.7              1408    46.3   
3                298.1                308.5              1498    49.4   
4                298.2                308.6              1433    39.5   
5                298.2                308.7              1408    40.0   
...                ...                  ...               ...     ...   
9996             298.8                308.4              1604    29.5   
9997             298.9                308.4              1632    31.8   
9998             299.0                308.6              1645    33.4   
9999             299.0                308.7              1408    48.5   
10000            299.0                308.7              1500    40.2   

       Tool wear  Machine failure  Mechanical powe

In [16]:
X_train

Unnamed: 0_level_0,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Mechanical power,Temp diff,Type_H,Type_L,Type_M
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4059,302.0,310.9,1456,47.2,54,7196.676675,8.9,False,False,True
1222,297.0,308.3,1399,46.4,132,6797.736296,11.3,False,False,True
6896,301.0,311.6,1357,45.6,137,6479.974671,10.6,False,False,True
9864,298.9,309.8,1411,56.3,84,8318.864043,10.9,False,True,False
8712,297.1,308.5,1733,28.7,50,5208.456932,11.4,False,True,False
...,...,...,...,...,...,...,...,...,...,...
981,296.1,306.7,1409,42.8,134,6315.145776,10.6,False,True,False
4267,302.7,311.1,1440,39.5,146,5956.459671,8.4,False,True,False
7773,300.3,311.5,1464,41.0,29,6285.698581,11.2,True,False,False
5781,301.7,311.2,1517,42.4,113,6735.658425,9.5,False,True,False


In [17]:
model = fit_model(X_train, y_train)

      Air temperature  Process temperature  Rotational speed  Torque  \
UDI                                                                    
4059            302.0                310.9              1456    47.2   
1222            297.0                308.3              1399    46.4   
6896            301.0                311.6              1357    45.6   
9864            298.9                309.8              1411    56.3   
8712            297.1                308.5              1733    28.7   
...               ...                  ...               ...     ...   
981             296.1                306.7              1409    42.8   
4267            302.7                311.1              1440    39.5   
7773            300.3                311.5              1464    41.0   
5781            301.7                311.2              1517    42.4   
1425            298.7                309.7              1462    46.8   

      Tool wear  Mechanical power  Temp diff  Type_H  Type_L  T

In [18]:
# Evaluate model
metrics = evaluate_model(model, X_test, y_test)

# Display metrics
print(f"ROC AUC: {metrics['roc_auc']}")
print(f"Recall: {metrics['recall']}")
print(f"F1 Score: {metrics['f1']}")
print(f"Confusion Matrix: \n{metrics['confusion_matrix']}")
print(f"Classification Report: \n{metrics['classification_report']}")

# Save model and train-test split
save_model_and_split(model, model_filepath, X_train, X_test, y_train, y_test, split_filepath)

# Confirm the model and split are saved
print(f"Model saved to {model_filepath}")
print(f"Train-test split saved to {split_filepath}")

ROC AUC: 0.9723313238338813
Recall: 0.8676470588235294
F1 Score: 0.7468354430379747
Confusion Matrix: 
[[1901   31]
 [   9   59]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1932
           1       0.66      0.87      0.75        68

    accuracy                           0.98      2000
   macro avg       0.83      0.93      0.87      2000
weighted avg       0.98      0.98      0.98      2000

Model saved to models/best_model.pkl
Train-test split saved to models/train_test_split.pkl
