# Modelisation

In [55]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from src.transform import transform_maintenance_data
from sklearn.experimental import enable_halving_search_cv  # Required to enable this feature
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import StratifiedKFold

In [56]:
df = pd.read_csv('data/data.csv',index_col="UDI")
df = transform_maintenance_data(df=df, speed_column="Rotational speed [rpm]",
                                    torque_column="Torque [Nm]", col1="Process temperature [K]",
                                    col2="Air temperature [K]", result_col="temp_diff [K]")

In [57]:
df.head()

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,mechanical_power [W],temp_diff [K]
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,6951.59056,10.5
2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,6826.722724,10.5
3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,7749.387543,10.4
4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,5927.504659,10.4
5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,5897.816608,10.5


In [58]:
df.columns.values

array(['Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF', 'mechanical_power [W]', 'temp_diff [K]'], dtype=object)

In [59]:
x_variables = ['Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]','mechanical_power [W]', 'temp_diff [K]']
y_variables = ['Machine failure']
X = pd.get_dummies(df[x_variables]).to_numpy()
y= df[y_variables].to_numpy().ravel()

In [60]:
X

array([[298.1, 308.6, 1551, ..., False, False, True],
       [298.2, 308.7, 1408, ..., False, True, False],
       [298.1, 308.5, 1498, ..., False, True, False],
       ...,
       [299.0, 308.6, 1645, ..., False, False, True],
       [299.0, 308.7, 1408, ..., True, False, False],
       [299.0, 308.7, 1500, ..., False, False, True]], dtype=object)

In [61]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)

In [63]:
# Initialize the LightGBM Classifier with class_weight set to balanced
lgbm = lgb.LGBMClassifier(force_col_wise=True, verbose=-1, class_weight='balanced')

# Define an even larger parameter grid with regularization terms
param_grid = {
    'num_leaves': [31, 50, 70, 100],
    'max_depth': [10, 20, 30],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'n_estimators': [100, 200],
    'min_child_samples': [10, 20],
    'subsample': [0.8, 0.9, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5, 1.0],  # L1 regularization (alpha)
    'reg_lambda': [0.0, 0.1, 0.5, 1.0]  # L2 regularization (lambda)
}

# Set up the HalvingGridSearchCV with the updated parameter grid
halving_cv = HalvingGridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    factor=3,  # Resource reduction factor
    random_state=42,
    scoring='recall',  # Use recall as the evaluation metric
    cv=StratifiedKFold(n_splits=5),  # Stratified cross-validation
    verbose=0
)

# Fit the HalvingGridSearchCV on the training data
halving_cv.fit(X_train, y_train)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Traceback (most recent call last):
  File "/home/gabriel/Bureau/predictive_maintenance/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gabriel/Bureau/predictive_maintenance/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/gabriel/Bureau/predictive_maintenance/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/home/gabriel/Bureau/predictive_maintenance/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", li

In [65]:
halving_cv.best_score_

np.float64(0.8320000000000001)