In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd

<IPython.core.display.Javascript object>

# Import dataset

In [13]:
car_data = pd.read_csv("data/car.data", header=None)
car_data.columns = [
    "buying",
    "maint",
    "doors",
    "persons",
    "lug_boot",
    "safety",
    "class",
]

<IPython.core.display.Javascript object>

In [14]:
car_data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


<IPython.core.display.Javascript object>

# Data Exploration

In [16]:
# Check missing values
car_data.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

<IPython.core.display.Javascript object>

In [91]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


<IPython.core.display.Javascript object>

In [147]:
# Check target distribution
car_data["buying"].value_counts()

vhigh    432
med      432
high     432
low      432
Name: buying, dtype: int64

<IPython.core.display.Javascript object>

In [42]:
# All the columns are categorical -> One-hot encoding
enc_car_data = pd.get_dummies(car_data.drop("buying", axis=1))

<IPython.core.display.Javascript object>

In [148]:
enc_car_data.head()

Unnamed: 0,maint_high,maint_low,maint_med,maint_vhigh,doors_2,doors_3,doors_4,doors_5more,persons_2,persons_4,...,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med,class_acc,class_good,class_unacc,class_vgood
0,0,0,0,1,1,0,0,0,1,0,...,0,0,1,0,1,0,0,0,1,0
1,0,0,0,1,1,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0
2,0,0,0,1,1,0,0,0,1,0,...,0,0,1,1,0,0,0,0,1,0
3,0,0,0,1,1,0,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
4,0,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0


<IPython.core.display.Javascript object>

In [153]:
# Encoding target variable using Label Encoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(car_data["buying"])
buying_price = le.transform(car_data["buying"])

<IPython.core.display.Javascript object>

In [155]:
le.classes_

array(['high', 'low', 'med', 'vhigh'], dtype=object)

<IPython.core.display.Javascript object>

In [154]:
buying_price

array([3, 3, 3, ..., 1, 1, 1])

<IPython.core.display.Javascript object>

# Splitting dataset

In [44]:
from sklearn.model_selection import train_test_split

<IPython.core.display.Javascript object>

In [170]:
x_train, x_test, y_train, y_test = train_test_split(
    enc_car_data, car_data["buying"], test_size=0.2
)

<IPython.core.display.Javascript object>

In [184]:
x_train.shape, y_train.shape

((1382, 21), (1382,))

<IPython.core.display.Javascript object>

In [185]:
x_test.shape, y_test.shape

((346, 21), (346,))

<IPython.core.display.Javascript object>

# Training models

In [186]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix

<IPython.core.display.Javascript object>

## Decision Tree

In [187]:
from sklearn.tree import DecisionTreeClassifier

<IPython.core.display.Javascript object>

In [221]:
# Hyperparameter tuning
for depth in [2, 3, 4, 5]:
    dt_model = DecisionTreeClassifier(criterion="gini", max_depth=depth)
    # Cross-validation
    cv_results = cross_validate(dt_model, x_train, y_train, cv=5)
    print(
        f"Max depth: {depth} --- Accuracy results: {round(cv_results['test_score'].mean(),4)}"
    )

Max depth: 2 --- Accuracy results: 0.296
Max depth: 3 --- Accuracy results: 0.3082
Max depth: 4 --- Accuracy results: 0.309
Max depth: 5 --- Accuracy results: 0.2757


<IPython.core.display.Javascript object>

In [224]:
# Best model
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=3)
dt_model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

<IPython.core.display.Javascript object>

In [248]:
# Accuracy on test set
y_pred = dt_model.predict(x_test)
accuracy = accuracy_score(y_pred, y_test)
print("Model accuracy score: {0:0.4f}".format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.3150


<IPython.core.display.Javascript object>

In [249]:
columns = y_test.unique()
pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=columns),
    columns=columns + "_pred",
    index=columns + "_true",
)

Unnamed: 0,vhigh_pred,low_pred,high_pred,med_pred
vhigh_true,72,0,0,13
low_true,57,9,0,26
high_true,63,0,0,21
med_true,56,1,0,28


<IPython.core.display.Javascript object>

## Gradient-boosted decision tree

### Training the model

In [226]:
import lightgbm as lgb

<IPython.core.display.Javascript object>

In [230]:
# Hyperparameter tuning
best_model = {"accuracy": 0}
for leaves in [2, 3, 4, 6, 8]:
    for lr in [0.5, 0.1, 0.5, 0.01]:
        for n_est in [100, 200, 500]:
            lgb_model = lgb.LGBMClassifier(
                objective="multiclass",
                num_leaves=leaves,
                boosting_type="dart",
                learning_rate=lr,
                n_estimators=n_est,
                num_threads=4,
            )
            cv_results = cross_validate(lgb_model, x_train, y_train, cv=5)
            if cv_results["test_score"].mean() > best_model["accuracy"]:
                best_model["accuracy"] = cv_results["test_score"].mean()
                best_model["leaves"] = leaves
                best_model["lr"] = lr
                best_model["n_est"] = n_est
                print(
                    f"Leaves: {leaves}, lr: {lr}, n_est: {n_est} --- Accuracy results: {round(cv_results['test_score'].mean(),4)}"
                )

Leaves: 2, lr: 0.5, n_est: 100 --- Accuracy results: 0.3025
Leaves: 2, lr: 0.5, n_est: 200 --- Accuracy results: 0.309
Leaves: 2, lr: 0.5, n_est: 500 --- Accuracy results: 0.3119
Leaves: 2, lr: 0.1, n_est: 100 --- Accuracy results: 0.3148
Leaves: 2, lr: 0.01, n_est: 100 --- Accuracy results: 0.3163
Leaves: 2, lr: 0.01, n_est: 200 --- Accuracy results: 0.317
Leaves: 2, lr: 0.01, n_est: 500 --- Accuracy results: 0.3192
Leaves: 3, lr: 0.01, n_est: 100 --- Accuracy results: 0.3221
Leaves: 3, lr: 0.01, n_est: 200 --- Accuracy results: 0.3228
Leaves: 3, lr: 0.01, n_est: 500 --- Accuracy results: 0.3257


<IPython.core.display.Javascript object>

In [250]:
# Best model
lgb_model = lgb.LGBMClassifier(
    objective="multiclass",
    num_leaves=best_model["leaves"],
    boosting_type="dart",
    learning_rate=best_model["lr"],
    n_estimators=best_model["n_est"],
    num_threads=4,
)
lgb_model.fit(x_train, y_train)

LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.01, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=3, num_threads=4,
               objective='multiclass', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

<IPython.core.display.Javascript object>

In [252]:
# Final accuracy on test set
y_pred = lgb_model.predict(x_test)
accuracy = accuracy_score(y_pred, y_test)
print("Model accuracy score: {0:0.4f}".format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.3266


<IPython.core.display.Javascript object>

In [253]:
pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=columns),
    columns=columns + "_pred",
    index=columns + "_true",
)

Unnamed: 0,vhigh_pred,low_pred,high_pred,med_pred
vhigh_true,72,0,0,13
low_true,57,18,0,17
high_true,63,0,0,21
med_true,56,6,0,23


<IPython.core.display.Javascript object>

# Predicting based on the parameters given
- Maintenance = High
- Number of doors = 4
- Lug Boot Size = Big
- Safety = High
- Class Value = Good
- persons = ??

In [259]:
sample = {
    "maint_high": 1,
    "maint_low": 0,
    "maint_med": 0,
    "maint_vhigh": 0,
    "doors_2": 0,
    "doors_3": 0,
    "doors_4": 1,
    "doors_5more": 0,
    "persons_2": 0,
    "persons_4": 0,
    "persons_more": 0,
    "lug_boot_big": 1,
    "lug_boot_med": 0,
    "lug_boot_small": 0,
    "safety_high": 1,
    "safety_low": 0,
    "safety_med": 0,
    "class_acc": 0,
    "class_good": 1,
    "class_unacc": 0,
    "class_vgood": 0,
}

<IPython.core.display.Javascript object>

In [263]:
import numpy as np

<IPython.core.display.Javascript object>

In [271]:
prediction = lgb_model.predict([list(sample.values())])

<IPython.core.display.Javascript object>

In [273]:
print(f"Predicted buying price: {prediction[0]}")

Predicted buying price: low


<IPython.core.display.Javascript object>