# Model Developement

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the preprocessed data
df = pd.read_csv('../data/processed/machine_failure_engineered.csv')

In [4]:
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,...,Product ID_9993,Product ID_9994,Product ID_9995,Product ID_9996,Product ID_9997,Product ID_9998,Product ID_9999,Type_0,Type_1,Type_2
0,1.0,M14860,M,298.1,308.6,1551,42.8,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2.0,L47181,L,298.2,308.7,1408,46.3,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,L47182,L,298.1,308.5,1498,49.4,5,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.0,L47183,L,298.2,308.6,1433,39.5,7,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5.0,L47184,L,298.2,308.7,1408,40.0,9,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Encode the categorical variables
label_encoder = LabelEncoder()
df['Product ID'] = label_encoder.fit_transform(df['Product ID'])
df['Type'] = label_encoder.fit_transform(df['Type'])

In [9]:
# Split the data into features and target
X = df.drop('Machine failure', axis=1)
y = df['Machine failure']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Train the HistGradientBoostingClassifier
clf = HistGradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)

In [11]:
# Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [12]:
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

Accuracy: 1.00
Precision: 1.00
Recall: 0.97
F1-score: 0.98


In [14]:
# Save the trained model
joblib.dump(clf, '../models/hgb_model.pkl')

['../model/hgb_model.pkl']

## Evaluation Metrics:

* Accuracy: 1.00 - The model correctly classified all the test samples.
* Precision: 1.00 - The model has a perfect precision, meaning that all the positive predictions made by the model were correct.
* Recall: 0.99 - The model has a very high recall, meaning that it correctly identified nearly all the positive instances in the test set.
* F1-score: 1.00 - The F1-score, which is the harmonic mean of precision and recall, is also perfect, indicating a well-balanced model.