In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import os

import sys
sys.path.append("..")
from utils.serialization import save_model, load_model

### Dataset

In [2]:
df = pd.read_csv("car.data")
print(df.shape)
df.head(5)

(1728, 7)


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
predict = 'class'
features, target = df.drop(columns=predict), df[predict]
X = np.array(
    list(map(LabelEncoder().fit_transform, (features[col] for col in features)))
).T
y = LabelEncoder().fit_transform(target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train.shape, y_train.shape

((1382, 6), (1382,))

### Grid Search on Random Forest

In [4]:
pipe_rf = Pipeline([('pca', PCA()), ('rf', RandomForestClassifier())])
param_grid = [
    {
        'pca__n_components': range(3, X_train.shape[1] + 1, 2),
        'rf__n_estimators': range(10, 20),
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_depth': range(5, 15),
    }
]
gs = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring='accuracy'
)

gs.fit(X_train, y_train)
# print(pd.DataFrame(gs.cv_results_).head(10))
print(f"best_params: {gs.best_params_}\nbest_acc: {gs.best_score_:.4f} %")

best_params: {'pca__n_components': 5, 'rf__criterion': 'entropy', 'rf__max_depth': 13, 'rf__n_estimators': 17}
best_acc: 0.8893 %


### Evaluation

In [5]:
prediction = gs.predict(X_test)
acc = accuracy_score(y_test, prediction)
recall = recall_score(y_test, prediction, average='macro')
precision = precision_score(y_test, prediction, average='macro')
f1 = f1_score(y_test, prediction, average='macro')
print(f"accuracy: {acc * 100:.1f} %")
print(f"recall: {recall * 100:.1f} %")
print(f"precision: {precision * 100:.1f} %")
print(f"f1_score: {f1 * 100:.1f} %")

accuracy: 86.1 %
recall: 64.9 %
precision: 68.8 %
f1_score: 66.6 %


### Save Model

In [6]:
path = os.path.join(os.getcwd(), "trained_models/model_Random_Forest.pickle")
save_model(path, gs)

### Load Model

In [7]:
path = os.path.join(os.getcwd(), "trained_models/model_Random_Forest.pickle")
model = load_model(path)
model.predict(X_test)

array([2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 3, 2, 2, 0, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 1, 2, 0, 1, 2, 2, 2, 0, 2, 2, 2, 3, 2, 2,
       1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 3, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 3, 2, 2, 2, 0,
       2, 2, 0, 1, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, 2, 2, 1, 0, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 1, 2, 0, 2, 0,
       2, 3, 3, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 3, 2, 2, 0, 2, 0,
       2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0,
       2, 0, 2, 3, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
       2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0,
       0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0,