## Install and Update category_encoders library

In [None]:
!pip install --upgrade category_encoders

## Import Required Libraries

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import category_encoders as ce 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score, plot_confusion_matrix, classification_report, f1_score

# Import Dataset

In [51]:
data = pd.read_csv("data/car-evaluation.csv",header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


## Data Exploration and Data Processing

In [52]:
# Add column names into the dataframe
col_names = ['buying','maint','doors','persons','lug_boot','safety','class']
data.columns = col_names
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [53]:
# check for metadata of columns
# no null data in all the columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [54]:
# show unique categories in each column
def show(data):
  for i in data.columns[1:]:
    print("Feature: {} with {} Levels".format(i,data[i].unique()))

show(data)

Feature: maint with ['vhigh' 'high' 'med' 'low'] Levels
Feature: doors with ['2' '3' '4' '5more'] Levels
Feature: persons with ['2' '4' 'more'] Levels
Feature: lug_boot with ['small' 'med' 'big'] Levels
Feature: safety with ['low' 'med' 'high'] Levels
Feature: class with ['unacc' 'acc' 'vgood' 'good'] Levels


In [55]:
# encode data to numeric format for creating the classification model

encoder = ce.OrdinalEncoder(cols = ['buying','maint','doors','persons','lug_boot','safety','class'])
data = encoder.fit_transform(data)
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,1,1,1,1,1,1,1
1,1,1,1,1,1,2,1
2,1,1,1,1,1,3,1
3,1,1,1,1,2,1,1
4,1,1,1,1,2,2,1


## Preparing Data for the Predictive Model

In [56]:
# Split Data into Train Dataset and Test Dataset
x = data.drop(['buying'], axis = 1)
y = data['buying']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)
print("X_train: {}".format(x_train.shape))
print("X_test: {}".format(x_test.shape))
print("Y_train: {}".format(y_train.shape))
print("Y_test: {}".format(y_test.shape))

X_train: (1209, 6)
X_test: (519, 6)
Y_train: (1209,)
Y_test: (519,)


In [57]:
# Create a function to evaluate various algorithms
def evaluation_parametrics(y_train,yp_train,y_test,yp_test):
  print("--------------------------------------------------------------------------")
  print("Classification Report for Train Data")
  print(classification_report(y_train, yp_train))
  print("Classification Report for Test Data")
  print(classification_report(y_test, yp_test))
  print("--------------------------------------------------------------------------")
  # Accuracy
  print("Accuracy on Train Data is: {}".format(round(accuracy_score(y_train,yp_train),2)))
  print("Accuracy on Test Data is: {}".format(round(accuracy_score(y_test,yp_test),2)))
  print("--------------------------------------------------------------------------")
  # Precision
  print("Precision on Train Data is: {}".format(round(precision_score(y_train,yp_train,average = "weighted"),2)))
  print("Precision on Test Data is: {}".format(round(precision_score(y_test,yp_test,average = "weighted"),2)))
  print("--------------------------------------------------------------------------")
  # Recall 
  print("Recall on Train Data is: {}".format(round(recall_score(y_train,yp_train,average = "weighted"),2)))
  print("Recall on Test Data is: {}".format(round(recall_score(y_test,yp_test,average = "weighted"),2)))
  print("--------------------------------------------------------------------------")
  # F1 Score
  print("F1 Score on Train Data is: {}".format(round(f1_score(y_train,yp_train,average = "weighted"),2)))
  print("F1 Score on Test Data is: {}".format(round(f1_score(y_test,yp_test,average = "weighted"),2)))
  print("--------------------------------------------------------------------------")

## Evaluating Various Predictive Models

In [71]:
lr = LogisticRegression(max_iter = 1000,random_state = 48)
lr.fit(x_train,y_train)

yp_train = lr.predict(x_train)
yp_test = lr.predict(x_test)

evaluation_parametrics(y_train,yp_train,y_test,yp_test)

--------------------------------------------------------------------------
Classification Report for Train Data
              precision    recall  f1-score   support

           1       0.34      0.53      0.42       305
           2       0.34      0.17      0.23       299
           3       0.31      0.31      0.31       308
           4       0.40      0.37      0.39       297

    accuracy                           0.35      1209
   macro avg       0.35      0.35      0.34      1209
weighted avg       0.35      0.35      0.34      1209

Classification Report for Test Data
              precision    recall  f1-score   support

           1       0.31      0.47      0.37       127
           2       0.33      0.17      0.23       133
           3       0.25      0.26      0.25       124
           4       0.39      0.36      0.37       135

    accuracy                           0.31       519
   macro avg       0.32      0.31      0.31       519
weighted avg       0.32      0.31    

In [59]:
dt = DecisionTreeClassifier(max_depth = 7,random_state = 48) # Keeping max_depth = 7 to avoid overfitting
dt.fit(x_train,y_train)

yp_train = dt.predict(x_train)
yp_test = dt.predict(x_test)

evaluation_parametrics(y_train,yp_train,y_test,yp_test)

--------------------------------------------------------------------------
Classification Report for Train Data
              precision    recall  f1-score   support

           1       0.43      0.39      0.41       305
           2       0.43      0.38      0.40       299
           3       0.42      0.52      0.46       308
           4       0.46      0.45      0.46       297

    accuracy                           0.44      1209
   macro avg       0.44      0.44      0.43      1209
weighted avg       0.44      0.44      0.43      1209

Classification Report for Test Data
              precision    recall  f1-score   support

           1       0.25      0.19      0.21       127
           2       0.14      0.11      0.12       133
           3       0.10      0.15      0.12       124
           4       0.20      0.21      0.20       135

    accuracy                           0.16       519
   macro avg       0.17      0.16      0.17       519
weighted avg       0.17      0.16    

In [60]:
rf = RandomForestClassifier(max_depth = 7,random_state = 48) # Keeping max_depth = 7 same as DT
rf.fit(x_train,y_train)

yp_train = rf.predict(x_train)
yp_test = rf.predict(x_test)

evaluation_parametrics(y_train,yp_train,y_test,yp_test)

--------------------------------------------------------------------------
Classification Report for Train Data
              precision    recall  f1-score   support

           1       0.41      0.57      0.48       305
           2       0.43      0.39      0.41       299
           3       0.49      0.40      0.44       308
           4       0.52      0.46      0.49       297

    accuracy                           0.46      1209
   macro avg       0.46      0.46      0.45      1209
weighted avg       0.46      0.46      0.45      1209

Classification Report for Test Data
              precision    recall  f1-score   support

           1       0.17      0.20      0.19       127
           2       0.06      0.05      0.06       133
           3       0.07      0.08      0.08       124
           4       0.17      0.16      0.16       135

    accuracy                           0.12       519
   macro avg       0.12      0.12      0.12       519
weighted avg       0.12      0.12    

## Model Evaluation Summary

After running various predictive models, we are able to see the performance of each models summarized below.
<br>
As shown in the table below, both Decision Tree and Random Forest seem to be overfitting based on the scores in the Train vs Test dataset.
<br>
As such, we will pick the logistic regression because it performs the best as compared to the other two models.
<br>
However, the model performance is still low, so there is an opportunity to further fine tune the model.

![Model Summary](image/model_summary.PNG)

## Predicting the Buying-Price Class

Now, we are going to predict the buying-price class with the following parameters:
- Maintenance = High --> encoded into 1
- Number of doors = 4 --> encoded into 3
- Lug Boot Size = Big --> encoded into 3
- Safety = High --> encoded into 3
- Class Value = Good --> encoded into 4

In [70]:
# Reading from excel sheet that contains a dataframe with the same value as the given parameter above
evaluation_df = pd.read_excel('data/car_data_assignment.xlsx')
evaluation_df.drop('buying', inplace=True, axis=1)

evaluation_df_result = lr.predict(evaluation_df)
print(f'Buying-price class predicted for the given parameters above: {str(evaluation_df_result[0])} (low)')


Buying-price class predicted for the given parameters above: 4 (low)
