# Machine learning
To process information of data through machine learning method to predict if buying based on the following information

-Maintenance = High\
-Number of doors = 4\
-Lug Boot Size = Big\
-Safety = High\
-Class Value = Good

In [1]:
import pandas as pd
import numpy as np
from numpy import random
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import math

# Import Dataset

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data',
                 names= ['buying','maint','doors','persons','lug_boot','safety','Class'])

df.head

<bound method NDFrame.head of      buying  maint  doors persons lug_boot safety  Class
0     vhigh  vhigh      2       2    small    low  unacc
1     vhigh  vhigh      2       2    small    med  unacc
2     vhigh  vhigh      2       2    small   high  unacc
3     vhigh  vhigh      2       2      med    low  unacc
4     vhigh  vhigh      2       2      med    med  unacc
...     ...    ...    ...     ...      ...    ...    ...
1723    low    low  5more    more      med    med   good
1724    low    low  5more    more      med   high  vgood
1725    low    low  5more    more      big    low  unacc
1726    low    low  5more    more      big    med   good
1727    low    low  5more    more      big   high  vgood

[1728 rows x 7 columns]>

In [3]:
df.columns # seeing the columns that we have

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'Class'], dtype='object')

# Cleaning data to only contain information we need

In [4]:
df.drop(['persons'],axis = 1 ,inplace = True)

In [5]:
 df.columns # check if column is drop properly

Index(['buying', 'maint', 'doors', 'lug_boot', 'safety', 'Class'], dtype='object')

# Preparing X and Y data

In [6]:
x_data = df.drop(['buying'], axis = 1).copy() #all other independent variable

y_data = df['buying'].copy() # the dependant variable

In [7]:
x_data # view x data

Unnamed: 0,maint,doors,lug_boot,safety,Class
0,vhigh,2,small,low,unacc
1,vhigh,2,small,med,unacc
2,vhigh,2,small,high,unacc
3,vhigh,2,med,low,unacc
4,vhigh,2,med,med,unacc
...,...,...,...,...,...
1723,low,5more,med,med,good
1724,low,5more,med,high,vgood
1725,low,5more,big,low,unacc
1726,low,5more,big,med,good


In [8]:
y_data.to_frame() # view y data

Unnamed: 0,buying
0,vhigh
1,vhigh
2,vhigh
3,vhigh
4,vhigh
...,...
1723,low
1724,low
1725,low
1726,low


# Data Preprocessing

In [9]:
# setting up categories for ordinal categorical variable
maint_cat = ['low', 'med', 'high', 'vhigh']
doors_cat = ['2', '3', '4', '5more']
lug_boot_cat = ['small', 'med', 'big']
safety_cat = ['low', 'med', 'high']
class_cat = ['unacc', 'acc', 'good', 'vgood']
Ord_encoder = OrdinalEncoder(categories = [maint_cat,doors_cat,lug_boot_cat,safety_cat,class_cat])


In [10]:
ordinal_data = Ord_encoder.fit_transform(x_data,Ord_encoder)

In [11]:
# For dummy variable encoding using Hot
Hot_encoder = OneHotEncoder(sparse=False)

In [12]:
Hot_encoder.fit(x_data)
onehot_data = Hot_encoder.transform(x_data)

In [13]:
# ordinal encode target variable
label_encoder = LabelEncoder()
y_data = label_encoder.fit_transform(y_data)

# Seperating the data

In [14]:
x_train, x_test, y_train, y_test = train_test_split(ordinal_data, y_data, test_size=0.3, random_state=2022) # setting seed

# Trying out different models to find best accuracies to use

In [15]:
# Running with Ordinal category data with Random Forest
random.seed(2022) # set seed to make it reproducible
O_rf = RandomForestClassifier()
O_rf.fit(x_train,y_train)

y_pred=O_rf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

[[11 21 31 54]
 [35 19 53 24]
 [44 36 17 31]
 [63 22 32 26]]
Accuracy: 0.14065510597302505


In [16]:
# Running with Ordinal category data with Decision Tree
O_dTree = DecisionTreeClassifier()
O_dTree.fit(x_train,y_train)
y_pred=O_dTree.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

[[24 31 24 38]
 [51 29 38 13]
 [66 39  3 20]
 [96 17 18 12]]
Accuracy: 0.13102119460500963


In [17]:
# Running with Ordinal category data with K nearest-neighbour
O_neigh = KNeighborsClassifier(n_neighbors=int(math.sqrt((len(x_data)))))
O_neigh.fit(x_train,y_train)
y_pred=O_neigh.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

[[38 11 29 39]
 [43 36 33 19]
 [51 30 23 24]
 [76 14 18 35]]
Accuracy: 0.2543352601156069


# Trying the 3 models with One hot Encoding Method

In [18]:
# setting seed to 2022 to match seperation from Ordinal
x_train_hot, x_test_hot, y_train_hot, y_test_hot = train_test_split(onehot_data, y_data, test_size=0.3, random_state=2022)

In [19]:
# Running with Dummy(Hot) category data with Random Forest
random.seed(2022) # set seed to make it reproducible
H_rf = RandomForestClassifier()
H_rf.fit(x_train_hot,y_train_hot)

y_pred_hot =H_rf.predict(x_test_hot)
print(confusion_matrix(y_test_hot, y_pred_hot))
print("Accuracy:",metrics.accuracy_score(y_test_hot, y_pred_hot))

[[10 22 31 54]
 [31 23 51 26]
 [41 41 14 32]
 [60 26 31 26]]
Accuracy: 0.14065510597302505


In [20]:
# Running with Dummy(Hot) category data with Decision Tree
H_dTree = DecisionTreeClassifier()
H_dTree.fit(x_train_hot,y_train_hot)

y_pred_hot=H_dTree.predict(x_test_hot)
print(confusion_matrix(y_test_hot, y_pred_hot))
print("Accuracy:",metrics.accuracy_score(y_test_hot, y_pred_hot))

[[24 31 24 38]
 [51 29 38 13]
 [66 39  3 20]
 [96 17 18 12]]
Accuracy: 0.13102119460500963


In [21]:
# Running with Dummy(Hot) category data with K nearest-neighbour
H_neigh = KNeighborsClassifier(n_neighbors=int(math.sqrt((len(x_data)))))
H_neigh.fit(x_train_hot,y_train_hot)

y_pred_hot=H_neigh.predict(x_test_hot)
print(confusion_matrix(y_test_hot, y_pred_hot))
print("Accuracy:",metrics.accuracy_score(y_test_hot, y_pred_hot))

[[50 15 18 34]
 [34 40 37 20]
 [46 28 31 23]
 [68 23 16 36]]
Accuracy: 0.302504816955684


# Selecting the best model with highest accuracy to predict

Based on the accuracy from the previous data, KNN with Hot encoding produce the best result therefore we will use it to predict the question with it

In [22]:
"""
-Maintenance = High
-Number of doors = 4
-Lug Boot Size = Big
-Safety = High
-Class Value = Good
"""
question = [{'maint':'high','doors':'4','lug_boot':'big','safety':'high','Class':'good'}]
question_df =  pd.DataFrame(question) # placing the question to a dataframe to do HOT encoding

question_onehot = Hot_encoder.transform(question_df) # encoding of question

In [23]:
H_neigh.predict(question_onehot)

array([1])

With the result of the highest accuracy model producing 1 which represents low, we can conclude that the buying price of the car with the given specification is low with an accuracy of 30%.