# Assignment A5-1 Car Rental Data Classification

# Imports and setup

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Get Data

In [2]:
# importing and preparing our dataset
COLUMNS = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'values']
df = pd.read_csv('https://datsoftlyngby.github.io/soft2020spring/resources/504b3eff-car.data', names=COLUMNS)

# Exploratory Data Analysis

In [3]:
#print the head
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,values
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [4]:
df.shape

(1728, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   values    1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [6]:
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,values
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,more,small,high,unacc
freq,432,432,432,576,576,576,1210


In [7]:
df.groupby('values').describe()

Unnamed: 0_level_0,buying,buying,buying,buying,maint,maint,maint,maint,doors,doors,...,persons,persons,lug_boot,lug_boot,lug_boot,lug_boot,safety,safety,safety,safety
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,...,top,freq,count,unique,top,freq,count,unique,top,freq
values,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
acc,384,4,med,115,384,4,med,115,384,4,...,4,198,384,3,big,144,384,2,high,204
good,69,2,low,46,69,2,low,46,69,4,...,4,36,69,3,big,24,69,2,med,39
unacc,1210,4,vhigh,360,1210,4,vhigh,360,1210,4,...,2,576,1210,3,small,450,1210,3,low,576
vgood,65,2,low,39,65,3,low,26,65,4,...,more,35,65,2,big,40,65,1,high,65


# Pre-process data

In [8]:
df.columns[:-1]

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'], dtype='object')

In [9]:
# Encode to get labels as numbers
le = preprocessing.LabelEncoder()
dfEncoded = df.apply(le.fit_transform)

In [10]:
dataset = df.values

In [11]:
features, labels = dataset[:, :-1], dataset[:, -1]

# Train Model

In [12]:
# Split data
X = dfEncoded.loc[:,'buying':'safety'] #Gets all the rows in the dataset from column 'buying' to column 'safety'
y = df.loc[:,'values'] #Gets all of the rows in the dataset from column 'values'

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 0)

In [14]:
tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [15]:
rf = RandomForestClassifier(n_estimators = 100, max_depth = 6)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=6, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Final prediction

### Each attribute/feature described:
* buying (buying price): vhigh (4), high (3), med (2), low (1)
* main (maintenance price): vhigh (4), high (3), med (2), low (1)
* doors (number of doors): 2, 3, 4, 5-more (5)
* persons (number of passengers fit in a car): 2, 4, more (6)
* lug_boot (size of luggage capacity): small (1), med (2), big (3)
* safety: low (1), med (2), high (3)
* values: unacc = unaccepted 2, acc = accepted 0, good = good 1, vgood = very good 3

In [16]:
df.iloc[1690:1694]

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,values
1690,low,low,4,4,big,med,good
1691,low,low,4,4,big,high,vgood
1692,low,low,4,more,small,low,unacc
1693,low,low,4,more,small,med,acc


In [17]:
dfEncoded.iloc[1690:1694]

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,values
1690,1,1,2,1,0,2,1
1691,1,1,2,1,0,0,3
1692,1,1,2,2,2,1,2
1693,1,1,2,2,2,2,0


In [18]:
prediction1690 = [[1,1,2,1,0,2]]
prediction1691 = [[1,1,2,1,0,0]]
prediction1692 = [[1,1,2,2,2,1]]
prediction1693 = [[1,2,2,2,2,2]]

In [19]:
#Print the prediction
print('Predictions with DecisionTree: ')
print(tree.predict(prediction1690))
print(tree.predict(prediction1691))
print(tree.predict(prediction1692))
print(tree.predict(prediction1693))
print()
print('Predictions with RandomForest: ')
print(rf.predict(prediction1690))
print(rf.predict(prediction1691))
print(rf.predict(prediction1692))
print(rf.predict(prediction1693))
print()
print('Actual: ')
actual = [['good'], ['vgood'], ['unacc'], ['acc']]
print(actual[0])
print(actual[1])
print(actual[2])
print(actual[3])

Predictions with DecisionTree: 
['good']
['vgood']
['unacc']
['acc']

Predictions with RandomForest: 
['acc']
['vgood']
['unacc']
['unacc']

Actual: 
['good']
['vgood']
['unacc']
['acc']


In [20]:
# DecisionTree accuracy
accuracy_score(y, tree.predict(X))

0.9953703703703703

In [21]:
# RandomForest accuracy
accuracy_score(y, rf.predict(X))

0.9074074074074074