In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

np.random.seed(246)

In [2]:
#Import df

merged_df = pd.read_pickle("merged_df.pkl")
target_cols = ['cooler_condition', 'valve_condition', 'pump_leakage',
               'hydraulic_accumulator', 'stable_flag']

feature_df = merged_df[[col for col in merged_df.columns if col not in target_cols]]
target_df = merged_df[target_cols]

feature_df = feature_df.drop(columns=["ce", "se"])

In [3]:
# Distinguish the target conditions

y_cooler_condition = pd.DataFrame(target_df.iloc[:, 0])
y_valve_condition = pd.DataFrame(target_df.iloc[:, 1])
y_pump_leak = pd.DataFrame(target_df.iloc[:, 2])
y_hydraulic_accumulator = pd.DataFrame(target_df.iloc[:, 3])
y_stable_flag = pd.DataFrame(target_df.iloc[:, 4])

target_columns = [y_cooler_condition, y_valve_condition, y_pump_leak, y_hydraulic_accumulator]

In [4]:
# Normalizing features

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

scaled = preprocessing.StandardScaler().fit_transform(feature_df)

# MACHINE LEARNING

In [5]:
# Train and test split

from sklearn.model_selection import train_test_split

train, test = train_test_split(merged_df, test_size=.2)

train_X = train[feature_df.columns].reset_index(drop=True)
train_y = train[target_df.columns].reset_index(drop=True)
test_X = test[feature_df.columns].reset_index(drop=True)
test_y = test[target_df.columns].reset_index(drop=True)

In [6]:
# Defining the models we want to test

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

models = {"Logistic Regression": LogisticRegression(solver="liblinear"),
          "Decision Tree": DecisionTreeClassifier(),
          "Random Forest": RandomForestClassifier(n_estimators=10),
          "K Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
          "Support Vector Machine": svm.SVC()}

In [7]:
# Model Evaluation

from sklearn.model_selection import cross_validate

def validate_model(model, train_X, train_y):
    validation_results = cross_validate(model,
                                        train_X,
                                        train_y,
                                        cv=5,
                                        scoring="precision_micro")
    acc_score = validation_results["test_score"].mean()
    print(f"Precision Mean Score: {acc_score}")

In [8]:
# Cooler Condition
for key, model in models.items():
    print(f"{key}:")
    validate_model(model, train_X, train_y["cooler_condition"])
    print("========================================")

Logistic Regression:
Precision Mean Score: 0.9965973292244479
Decision Tree:
Precision Mean Score: 0.9994318181818181
Random Forest:
Precision Mean Score: 0.9971623009758602
K Nearest Neighbors:
Precision Mean Score: 0.9807203389830509
Support Vector Machine:
Precision Mean Score: 0.9835580380071904


In [9]:
# Valve Condition
for key, model in models.items():
    print(f"{key}:")
    validate_model(model, train_X, train_y["valve_condition"])
    print("========================================")

Logistic Regression:
Precision Mean Score: 0.7443333063849702
Decision Tree:
Precision Mean Score: 0.9246515288744126
Random Forest:
Precision Mean Score: 0.9427579698291995
K Nearest Neighbors:
Precision Mean Score: 0.6610302985098923
Support Vector Machine:
Precision Mean Score: 0.5759760302796385


In [10]:
# Hydraulic Accumulator
for key, model in models.items():
    print(f"{key}:")
    validate_model(model, train_X, train_y["hydraulic_accumulator"])
    print("========================================")

Logistic Regression:
Precision Mean Score: 0.530608199156308
Decision Tree:
Precision Mean Score: 0.9319223964699865
Random Forest:
Precision Mean Score: 0.9665493773871662
K Nearest Neighbors:
Precision Mean Score: 0.8475004352667641
Support Vector Machine:
Precision Mean Score: 0.8123693836985343


In [11]:
# Pump Leakage
for key, model in models.items():
    print(f"{key}:")
    validate_model(model, train_X, train_y["pump_leakage"])
    print("========================================")

Logistic Regression:
Precision Mean Score: 0.9449956604989735
Decision Tree:
Precision Mean Score: 0.9818599763418989
Random Forest:
Precision Mean Score: 0.9892318628499428
K Nearest Neighbors:
Precision Mean Score: 0.9535151413302371
Support Vector Machine:
Precision Mean Score: 0.9467114093593544


## Recursive Feature Elimination - RFE and test set evaluation

In [12]:
from sklearn.feature_selection import RFECV
from sklearn.metrics import precision_score, confusion_matrix

In [13]:
# Cooler Condition

selector_rf = RFECV(RandomForestClassifier(n_estimators=10), cv=5, scoring="precision_micro" )

selector_rf.fit(train_X, train_y["cooler_condition"]);
train_X2 = train_X[train_X.columns[selector_rf.get_support()].tolist()]


# Precision Score for test set
model = RandomForestClassifier()
model.fit(train_X2, train_y["cooler_condition"])

test_X2 = test_X[train_X2.columns.to_list()]
pred = model.predict(test_X2)

print("Score")
print(round(precision_score(test_y["cooler_condition"], pred, average='macro'), 10))
print("==============")
print("Confusion Matrix")
print(confusion_matrix(test_y["cooler_condition"], pred))

Score
0.9958592133
Confusion Matrix
[[140   0   2]
 [  0 140   0]
 [  0   0 159]]


In [14]:
# Valve Condition

selector_rf = RFECV(RandomForestClassifier(n_estimators=10), cv=5, scoring="precision_micro" )

selector_rf.fit(train_X, train_y["valve_condition"]);
train_X2 = train_X[train_X.columns[selector_rf.get_support()].tolist()]


# Precision Score for test set
model = RandomForestClassifier()
model.fit(train_X2, train_y["valve_condition"])

test_X2 = test_X[train_X2.columns.to_list()]
pred = model.predict(test_X2)

precision_score(test_y["valve_condition"], pred, average='macro')

0.9480156397949522

In [15]:
# Hydraulic Accumulator

selector_rf = RFECV(RandomForestClassifier(n_estimators=10), cv=5, scoring="precision_micro" )

selector_rf.fit(train_X, train_y["hydraulic_accumulator"]);
train_X2 = train_X[train_X.columns[selector_rf.get_support()].tolist()]


# Precision Score for test set
model = RandomForestClassifier()
model.fit(train_X2, train_y["hydraulic_accumulator"])

test_X2 = test_X[train_X2.columns.to_list()]
pred = model.predict(test_X2)

precision_score(test_y["hydraulic_accumulator"], pred, average='macro')

0.9554572499554312

In [16]:
# Pump Leakage

selector_rf = RFECV(RandomForestClassifier(n_estimators=10), cv=5, scoring="precision_micro" )

selector_rf.fit(train_X, train_y["pump_leakage"]);
train_X2 = train_X[train_X.columns[selector_rf.get_support()].tolist()]


# Precision Score for test set
model = RandomForestClassifier()
model.fit(train_X2, train_y["pump_leakage"])

test_X2 = test_X[train_X2.columns.to_list()]
pred = model.predict(test_X2)

precision_score(test_y["pump_leakage"], pred, average='macro')

0.9510003114187224