In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the extended car sales dataset and preview its first five rows
car_sales = pd.read_csv(filepath_or_buffer="data/car-sales-extended.csv")
car_sales.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [3]:
# Number of rows (cars) in the dataset
car_sales.shape[0]

1000

In [4]:
# Inspect the column dtypes: "Make" and "Colour" are object (text) columns,
# the remaining columns are integers.
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

#### Trying to fit a model without converting the data to numbers

In [5]:
# Separate the target column (y) from the feature columns (X)
y = car_sales["Price"]
X = car_sales.drop(columns=["Price"])

# Show both shapes to check that the row counts agree
(X.shape, y.shape)

((1000, 4), (1000,))

In [6]:
# Split X and y into training and testing sets (20% of the rows held out for testing)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [7]:
# Building the machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# Fitting on the raw features is expected to fail: the model deals only with
# numbers, and X_train still contains text columns ("Make", "Colour").
# The error is the point of this cell, so it is caught and displayed instead
# of being left to halt "Restart & Run All"; later cells reuse `model`.
try:
    model.fit(X_train, y_train)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: could not convert string to float: 'Toyota'

#### Converting to numerical (numbers)

In [8]:
#Importing required modules

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [9]:
# Preview the data again to spot which columns are categorical
car_sales.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [10]:
# Columns to one-hot encode.
# Why "Doors"? It is stored as an integer, but it only takes a handful of
# distinct values (see the value counts that follow), so each door count is
# treated as its own category.
categorical_features = [
    "Make",
    "Colour",
    "Doors",
]

In [11]:
# "Doors" only takes three distinct values, so the cars fall into groups:
# 856 cars have 4 doors, 79 have 5 doors and 65 have 3 doors.
car_sales.loc[:, "Doors"].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [12]:
# Create the encoder, then wrap it in a ColumnTransformer that applies it to
# the categorical columns only and passes every other column through as-is.
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

In [13]:
# Features before encoding
X.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [14]:
# Fit the transformer on X and encode it in one step: the columns listed in
# categorical_features are one-hot encoded and the remaining column
# ("Odometer (KM)") is passed through unchanged.
transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [15]:
# View the first rows of the encoded matrix as a DataFrame for readability
pd.DataFrame(data=transformed_X).iloc[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [16]:
# Original (un-encoded) features again, for comparison with the encoded frame
X.iloc[:5]

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


##### What OneHotEncoder does
<img src="data/onehot.png"/>

In [17]:
# How many cars there are per make
car_sales.loc[:, "Make"].value_counts()

Toyota    398
Honda     304
Nissan    198
BMW       100
Name: Make, dtype: int64

In [18]:
# How many cars there are per colour
car_sales.loc[:, "Colour"].value_counts()

White    407
Blue     321
Black     99
Red       94
Green     79
Name: Colour, dtype: int64

In [19]:
# Odometer readings are almost all distinct (998 different values), so this
# column is left as a plain number rather than treated as a category.
car_sales.loc[:, "Odometer (KM)"].value_counts()

73869     2
129188    2
35431     1
84787     1
213077    1
         ..
187663    1
65435     1
80664     1
119439    1
248360    1
Name: Odometer (KM), Length: 998, dtype: int64

In [20]:
# Door counts once more: only three distinct values
car_sales.Doors.value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [28]:
# Option 2 for converting to numbers: pandas.get_dummies.
# NOTE: only the text columns ("Make", "Colour") are expanded into indicator
# columns here; "Doors" is an integer column and comes through unchanged,
# unlike the OneHotEncoder approach above where it was listed as categorical.
dummies = pd.get_dummies(
    data=car_sales[["Make", "Colour", "Doors", "Odometer (KM)"]],
)
dummies

Unnamed: 0,Doors,Odometer (KM),Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,35431,0,1,0,0,0,0,0,0,1
1,5,192714,1,0,0,0,0,1,0,0,0
2,4,84714,0,1,0,0,0,0,0,0,1
3,4,154365,0,0,0,1,0,0,0,0,1
4,3,181577,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,4,35820,0,0,0,1,1,0,0,0,0
996,3,155144,0,0,1,0,0,0,0,0,1
997,4,66604,0,0,1,0,0,1,0,0,0
998,4,215883,0,1,0,0,0,0,0,0,1


In [22]:
## Refitting the model with the numerical values

In [29]:
# Seed NumPy's global RNG before the split and the fit. Neither call below is
# given a random_state, so they presumably draw from this global state
# (confirm against the scikit-learn docs); keep the statement order as is.
np.random.seed(42)

# Re-split using the numeric `dummies` frame, then refit the `model`
# estimator that was created in the earlier cell.
X_train, X_test, y_train, y_test = train_test_split(dummies, y, test_size = 0.2)
model.fit(X_train, y_train)

RandomForestRegressor()

In [41]:
# Evaluate on the held-out test set. For scikit-learn regressors, score() is
# the coefficient of determination (R^2), not a classification accuracy.
model.score(X=X_test, y=y_test)

0.3218004711970015

In [55]:
# Express the test-set score as a percentage with two decimal places
score = 100 * model.score(X_test, y_test)
print("%.2f" % (score,))

32.18
