# Machine Learning Predictive Maintenance Classifier Beta
## Griffin Brown
### 8/22/2024

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

In [39]:
df = pd.read_csv("predictive_maintenance.csv")

df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [40]:
df.drop(['Product ID', 'UDI'], axis=1, inplace=True)

def preprocess_dataframe(data):
    object_cols = data.select_dtypes(include='object').columns.to_list()
    for col in object_cols:
        data[col] = data[col].astype('category')

    category_cols = [col for col in data.columns if data[col].dtype == 'category']

    # for col in category_cols:
    #     df[col], _ = pd.factorize(df[col])

    for col in category_cols:
        dummies = pd.get_dummies(data[col], drop_first=True, dummy_na=True, prefix=col)
        data = pd.concat([data.drop(col, axis=1), dummies], axis=1)

    return data

df = preprocess_dataframe(df)

In [41]:
df.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Type_L,Type_M,Type_nan,Failure Type_No Failure,Failure Type_Overstrain Failure,Failure Type_Power Failure,Failure Type_Random Failures,Failure Type_Tool Wear Failure,Failure Type_nan
0,298.1,308.6,1551,42.8,0,0,False,True,False,True,False,False,False,False,False
1,298.2,308.7,1408,46.3,3,0,True,False,False,True,False,False,False,False,False
2,298.1,308.5,1498,49.4,5,0,True,False,False,True,False,False,False,False,False
3,298.2,308.6,1433,39.5,7,0,True,False,False,True,False,False,False,False,False
4,298.2,308.7,1408,40.0,9,0,True,False,False,True,False,False,False,False,False


In [42]:
y = df['Target']
X = df.drop(['Target'], axis=1)

X.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_L,Type_M,Type_nan,Failure Type_No Failure,Failure Type_Overstrain Failure,Failure Type_Power Failure,Failure Type_Random Failures,Failure Type_Tool Wear Failure,Failure Type_nan
0,298.1,308.6,1551,42.8,0,False,True,False,True,False,False,False,False,False
1,298.2,308.7,1408,46.3,3,True,False,False,True,False,False,False,False,False
2,298.1,308.5,1498,49.4,5,True,False,False,True,False,False,False,False,False
3,298.2,308.6,1433,39.5,7,True,False,False,True,False,False,False,False,False
4,298.2,308.7,1408,40.0,9,True,False,False,True,False,False,False,False,False


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [44]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7000, 14)
(3000, 14)
(7000,)
(3000,)


In [45]:
imputer = SimpleImputer(missing_values=pd.NA, strategy='constant')

In [46]:
lr = LogisticRegression()

In [47]:
pipe1 = make_pipeline(imputer, lr)

In [48]:
pipe1.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
pipe1.score(X_train, y_train)

0.9977142857142857

In [50]:
pipe1.score(X_test, y_test)

0.9976666666666667

In [51]:
pipe1.named_steps.simpleimputer.statistics_

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [52]:
pipe1.named_steps.logisticregression.coef_

array([[ 7.26770519e-01, -7.39029791e-01,  4.10577319e-03,
         1.11415528e-01,  4.86359981e-03,  4.38880343e-01,
        -1.71310749e-01,  0.00000000e+00, -7.05989087e+00,
         1.69194271e+00,  1.50597157e+00, -1.09873974e-01,
         1.55342725e+00,  0.00000000e+00]])

In [53]:
# confusion matrix
y_train_pred = pipe1.predict(X_train)
conf_matrix = confusion_matrix(y_train, y_train_pred)
print(conf_matrix)

[[6754    9]
 [   7  230]]


In [54]:
# other scores
precision = precision_score(y_train, y_train_pred, average='weighted')
recall = recall_score(y_train, y_train_pred, average='weighted')
f1 = f1_score(y_train, y_train_pred, average='weighted')

print(precision)
print(recall)
print(f1)

0.9977247490154187
0.9977142857142857
0.9977189186233499


In [55]:
# auc
roc_auc = roc_auc_score(y_train, pipe1.predict_proba(X_train)[:,1])

print(roc_auc)

0.9878371456504148


In [57]:
# cross validation
cv_scores = cross_val_score(pipe1, X_train, y_train, cv=5, scoring='accuracy')

print(cv_scores)
print(cv_scores.mean())

[0.99714286 0.99642857 0.99857143 0.99857143 0.99785714]
0.9977142857142857


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [60]:
coefs = np.abs(pipe1.named_steps['logisticregression'].coef_[0])
feature_importance = pd.Series(coefs, index=X_train.columns)
feature_importance.sort_values(ascending=False, inplace=True)

print(feature_importance)

Failure Type_No Failure            7.059891
Failure Type_Overstrain Failure    1.691943
Failure Type_Tool Wear Failure     1.553427
Failure Type_Power Failure         1.505972
Process temperature [K]            0.739030
Air temperature [K]                0.726771
Type_L                             0.438880
Type_M                             0.171311
Torque [Nm]                        0.111416
Failure Type_Random Failures       0.109874
Tool wear [min]                    0.004864
Rotational speed [rpm]             0.004106
Type_nan                           0.000000
Failure Type_nan                   0.000000
dtype: float64
