## Lectorial 4 - Predictive Maintenance for Cars
*Author: Dominik Jung (dominik.jung42@gmail.com)*

### Import packages

In [29]:
from pandas import read_excel

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### 1. Data selection and preprocessing

In [2]:
dataset = read_excel(open("car_maintenance.xlsx", "rb"))

Show dataset

In [3]:
dataset.head(6)

Unnamed: 0,PART_1023,PART_99,PART_02,OIL,CHECK_STATUS,FOLLOW-UP
0,1,2,1,0.25,acc,yes
1,2,1,1,0.15,unacc,no
2,1,2,1,0.25,acc,yes
3,1,1,1,0.15,good,yes
4,2,1,1,0.15,vgood,no
5,1,1,1,0.5,acc,yes


In [4]:
dataset.dtypes

PART_1023         int64
PART_99           int64
PART_02           int64
OIL             float64
CHECK_STATUS     object
FOLLOW-UP        object
dtype: object

In [5]:
dataset.describe()

Unnamed: 0,PART_1023,PART_99,PART_02,OIL
count,31.0,31.0,31.0,31.0
mean,1.580645,33.387097,1.0,0.180645
std,0.672022,179.210617,0.0,0.16004
min,1.0,1.0,1.0,0.05
25%,1.0,1.0,1.0,0.1
50%,1.0,1.0,1.0,0.15
75%,2.0,1.0,1.0,0.2
max,3.0,999.0,1.0,0.8


In [6]:
dataset["PART_99"]

0       2
1       1
2       2
3       1
4       1
5       1
6       1
7       1
8       1
9     999
10      2
11      1
12      2
13      1
14      1
15      1
16      1
17      1
18      1
19      2
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      2
30      1
Name: PART_99, dtype: int64

Remove outliers, threshold = 3 (most values are between 1 and 2)

In [7]:
# thr = dataset["PART_99"].quantile(0.999)
thr = 3
dataset = dataset[dataset["PART_99"] < thr]
dataset.describe()

Unnamed: 0,PART_1023,PART_99,PART_02,OIL
count,30.0,30.0,30.0,30.0
mean,1.566667,1.2,1.0,0.181667
std,0.678911,0.406838,0.0,0.162673
min,1.0,1.0,1.0,0.05
25%,1.0,1.0,1.0,0.1
50%,1.0,1.0,1.0,0.15
75%,2.0,1.0,1.0,0.225
max,3.0,2.0,1.0,0.8


In [8]:
dataset = dataset.drop(columns=["PART_02"])
dataset.head()

Unnamed: 0,PART_1023,PART_99,OIL,CHECK_STATUS,FOLLOW-UP
0,1,2,0.25,acc,yes
1,2,1,0.15,unacc,no
2,1,2,0.25,acc,yes
3,1,1,0.15,good,yes
4,2,1,0.15,vgood,no


## 2. Analytics

### Split data for modelling

In [16]:
dataset["CHECK_STATUS"] = dataset["CHECK_STATUS"].astype("category")
dataset["CHECK_STATUS"] = dataset["CHECK_STATUS"].cat.codes

In [17]:
X = dataset.loc[:, "PART_1023":"CHECK_STATUS"]
Y = dataset.loc[:, "FOLLOW-UP"]

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

### Simple decision tree

In [19]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)

In [20]:
dataset.head()

Unnamed: 0,PART_1023,PART_99,OIL,CHECK_STATUS,FOLLOW-UP
0,1,2,0.25,0,yes
1,2,1,0.15,2,no
2,1,2,0.25,0,yes
3,1,1,0.15,1,yes
4,2,1,0.15,3,no


In [23]:
prediction = clf.predict(X_test)
prediction

array(['no', 'no', 'yes', 'yes', 'no', 'no', 'no', 'no', 'no'],
      dtype=object)

In [24]:
X_test

Unnamed: 0,PART_1023,PART_99,OIL,CHECK_STATUS
15,1,1,0.1,1
13,1,1,0.15,2
11,2,1,0.8,2
3,1,1,0.15,1
2,1,2,0.25,0
10,1,2,0.15,0
6,2,1,0.1,2
4,2,1,0.15,3
0,1,2,0.25,0


## Evaluation

In [27]:
confusion_matrix(Y_test, prediction)

array([[4, 0],
       [3, 2]], dtype=int64)

In [28]:
accuracy_score(Y_test, prediction)

0.6666666666666666

In [38]:
rnd_clf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=20)
rnd_clf = rnd_clf.fit(X_train, Y_train)
prediction = rnd_clf.predict(X_test)

In [39]:
prediction

array(['no', 'no', 'yes', 'yes', 'no', 'no', 'no', 'no', 'no'],
      dtype=object)

In [40]:
accuracy_score(Y_test, prediction)

0.6666666666666666

In [None]:
plot_tree(clf)