In [1]:
# ml_equipment_failure.py
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import joblib

# 1. Load
df = pd.read_csv('equipment_data.csv', parse_dates=['timestamp'])
df = df.sort_values(['device_id','timestamp'])

# 2. Feature engineering - window aggregates per device (last 24h mean, std)
df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.date

# Example: aggregate features per day per device
agg = df.groupby(['device_id','day']).agg({
    'temp_c':['mean','std','max'],
    'vibration_g':['mean','std','max'],
    'pressure_psi':['mean','std'],
    'rpm':['mean','std'],
    'hours_running':'max',
    'failure_next_7d':'max'  # label: if any failure in next 7 days
}).reset_index()
agg.columns = ['_'.join(col).strip('_') for col in agg.columns.values]
agg.rename(columns={'device_id_':'device_id','day_':'day','failure_next_7d_max':'label'}, inplace=True)

# 3. Train/test split
X = agg.drop(columns=['device_id','day','label'])
y = agg['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 4. Pipeline + model
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

# 5. Quick grid search (small)
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [6, 12, None],
}
cv = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', verbose=1)
cv.fit(X_train, y_train)

print("Best params:", cv.best_params_)
best = cv.best_estimator_

# 6. Evaluate
y_pred = best.predict(X_test)
y_proba = best.predict_proba(X_test)[:,1]
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 7. Feature importances
feat_imp = best.named_steps['clf'].feature_importances_
features = X.columns
imp_df = pd.DataFrame({'feature':features, 'importance':feat_imp}).sort_values('importance', ascending=False)
print(imp_df.head(20))

# 8. Save model

os.makedirs('models', exist_ok=True)
joblib.dump(best, 'models/equip_failure_rf.joblib')


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best params: {'clf__max_depth': 12, 'clf__n_estimators': 200}
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       123
           1       1.00      0.83      0.91        77

    accuracy                           0.94       200
   macro avg       0.95      0.92      0.93       200
weighted avg       0.94      0.94      0.93       200

ROC AUC: 0.9404497941083307
Confusion matrix:
 [[123   0]
 [ 13  64]]
              feature  importance
5     vibration_g_max    0.382278
3    vibration_g_mean    0.195401
2          temp_c_max    0.109621
0         temp_c_mean    0.060855
4     vibration_g_std    0.047315
1          temp_c_std    0.044772
6   pressure_psi_mean    0.035631
7    pressure_psi_std    0.032600
10  hours_running_max    0.032112
9             rpm_std    0.031762
8            rpm_mean    0.027653


['models/equip_failure_rf.joblib']