In [1]:
from google.colab import files
uploaded = files.upload()

import pandas as pd
df = pd.read_csv('/content/cleaned_predictive_maintenance_data.csv')

print(df.head())

Saving cleaned_predictive_maintenance_data.csv to cleaned_predictive_maintenance_data.csv
  device_id device_type inspection_date  usage_hours  avg_temperature  \
0     D0001   Retractor      2024-05-21          910            38.75   
1     D0002       Drill      2024-06-12         3822            48.51   
2     D0003       Drill      2023-09-22         3142            49.93   
3     D0004   Retractor      2024-09-17          516            36.81   
4     D0005         Saw      2023-10-19         4476            58.04   

   vibration_level  error_logs_count  pressure_variation  last_maintenance  \
0            0.907                 0                1.43               136   
1            0.284                 3                0.87               188   
2            0.984                 1                1.23                 1   
3            0.149                 2                0.43               105   
4            0.210                 0                1.20               363   

  

In [3]:
from sklearn.model_selection import train_test_split

X = df[['usage_hours', 'avg_temperature', 'vibration_level', 'error_logs_count', 'pressure_variation', 'last_maintenance']]
y = df['failure']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

lr_model = LogisticRegression()

lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.94
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       181
           1       0.82      0.47      0.60        19

    accuracy                           0.94       200
   macro avg       0.88      0.73      0.78       200
weighted avg       0.93      0.94      0.93       200



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.97
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       181
           1       1.00      0.68      0.81        19

    accuracy                           0.97       200
   macro avg       0.98      0.84      0.90       200
weighted avg       0.97      0.97      0.97       200



XGBoost

In [6]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42)

xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

XGBoost Accuracy: 0.985
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       181
           1       1.00      0.84      0.91        19

    accuracy                           0.98       200
   macro avg       0.99      0.92      0.95       200
weighted avg       0.99      0.98      0.98       200



Random Forest Hyperparameter Tuning

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)

grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search_rf.fit(X_train, y_train)

print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best score for Random Forest: {grid_search_rf.best_score_}")

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best score for Random Forest: 0.9712436522016651


XGBoost Hyperparameter Tuning

In [8]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb = XGBClassifier(random_state=42)

grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search_xgb.fit(X_train, y_train)

print(f"Best parameters for XGBoost: {grid_search_xgb.best_params_}")
print(f"Best score for XGBoost: {grid_search_xgb.best_score_}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
Best score for XGBoost: 0.9875015253489529


SMOTE Implementation

In [9]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

smote = SMOTE(random_state=42)

pipeline = Pipeline([('smote', smote), ('classifier', RandomForestClassifier())])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(f"Accuracy with SMOTE: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy with SMOTE: 0.935
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       181
           1       0.62      0.84      0.71        19

    accuracy                           0.94       200
   macro avg       0.80      0.89      0.84       200
weighted avg       0.95      0.94      0.94       200



Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf_scaled = RandomForestClassifier(random_state=42)
rf_scaled.fit(X_train_scaled, y_train)

y_pred_scaled = rf_scaled.predict(X_test_scaled)
print(f"Accuracy with scaled features: {accuracy_score(y_test, y_pred_scaled)}")
print(classification_report(y_test, y_pred_scaled))

Accuracy with scaled features: 0.965
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       181
           1       1.00      0.63      0.77        19

    accuracy                           0.96       200
   macro avg       0.98      0.82      0.88       200
weighted avg       0.97      0.96      0.96       200



Model Evaluation and Comparison

In [11]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_rf = grid_search_rf.predict(X_test)
y_pred_xgb = grid_search_xgb.predict(X_test)

print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))

print("XGBoost Performance:")
print(classification_report(y_test, y_pred_xgb))

Random Forest Performance:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       181
           1       1.00      0.63      0.77        19

    accuracy                           0.96       200
   macro avg       0.98      0.82      0.88       200
weighted avg       0.97      0.96      0.96       200

XGBoost Performance:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       181
           1       1.00      0.84      0.91        19

    accuracy                           0.98       200
   macro avg       0.99      0.92      0.95       200
weighted avg       0.99      0.98      0.98       200



Voting Classifier

In [12]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

log_reg = LogisticRegression(max_iter=1000)
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1)
xgboost = XGBClassifier(colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8)

voting_clf = VotingClassifier(estimators=[('log_reg', log_reg), ('rf', random_forest), ('xgb', xgboost)], voting='hard')
voting_clf.fit(X_train, y_train)

y_pred_voting = voting_clf.predict(X_test)
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred_voting))
print(classification_report(y_test, y_pred_voting))

Voting Classifier Accuracy: 0.975
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       181
           1       1.00      0.74      0.85        19

    accuracy                           0.97       200
   macro avg       0.99      0.87      0.92       200
weighted avg       0.98      0.97      0.97       200



Stacking Classifier

In [13]:
from sklearn.ensemble import StackingClassifier

base_learners = [
    ('log_reg', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1)),
    ('xgb', XGBClassifier(colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=300, subsample=0.8))
]

meta_model = LogisticRegression()

stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_model)
stacking_clf.fit(X_train, y_train)

y_pred_stacking = stacking_clf.predict(X_test)
print("Stacking Classifier Accuracy:", accuracy_score(y_test, y_pred_stacking))
print(classification_report(y_test, y_pred_stacking))

Stacking Classifier Accuracy: 0.985
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       181
           1       1.00      0.84      0.91        19

    accuracy                           0.98       200
   macro avg       0.99      0.92      0.95       200
weighted avg       0.99      0.98      0.98       200



In [14]:
import joblib

best_xgb_model = grid_search_xgb.best_estimator_
joblib.dump(best_xgb_model, 'best_xgboost_model.pkl')

print("Model saved successfully!")

Model saved successfully!
