# <font color='green'>Car Price Prediction</font>

In [1]:
import pandas as pd
import numpy as np

### <font color='red'>Project Steps</font>

<img src="processes.png" width=600 height=60 />

<img src="cars.jpg" width=600 height=60 />

### <font color='blue'>Table of Contents</font>

### Read the Data File

In [2]:
df = pd.read_csv('datasets_383055_741735_CarPrice_Assignment.csv')

In [3]:
df.head() # take a look at the first few samples

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


### Data Wrangling

In [4]:
#Check for missing data

df.isna().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

* From above, this shows that there is no missing data

In [5]:
# Data Columns
df.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [6]:
df.describe()

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,59.322565,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,52.0,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,103.0,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,154.0,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


* Really don't know the meaning of symboling, but a negative minimum value looks suspicious

In [7]:
# Check the data type of each columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

* The car_ID will be dropped because it's irrelevant

In [8]:
df.drop(['car_ID'], axis = 1, inplace = True)

In [9]:
# Validate
df.head()

Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


### Exploratory Data Analysis

In [10]:
# Answer bunch of questions here
# Make graphs here

In [11]:
import data_preprocessing

In [12]:
# Create the Make and Model columns from CarName
from data_preprocessing import Make_Model as MM
mm = MM()
df1 = mm.fit_transform(df)

In [13]:
df1.head()

Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,Make,Model
0,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero,giulia
1,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero,stelvio
2,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero,Quadrifoglio
3,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,3.19,3.4,10.0,102,5500,24,30,13950.0,audi,100 ls
4,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,3.19,3.4,8.0,115,5500,18,22,17450.0,audi,100ls


In [14]:
# Write a class that can do all your visualization

In [15]:
df.head()

Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,Make,Model
0,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero,giulia
1,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero,stelvio
2,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero,Quadrifoglio
3,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,3.19,3.4,10.0,102,5500,24,30,13950.0,audi,100 ls
4,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,3.19,3.4,8.0,115,5500,18,22,17450.0,audi,100ls


### Build the Model

In [16]:
# Drop CarName and Model columns
df.drop(['CarName', 'Model'], axis = 1, inplace = True)

In [35]:
one = []
label = []
for col in df.columns:
    if len(df[col].unique()) > 2:
        label.append(col)
    else:
        one.append(col)

In [36]:
one

['fueltype', 'aspiration', 'doornumber', 'enginelocation']

In [37]:
label

['symboling',
 'carbody',
 'drivewheel',
 'wheelbase',
 'carlength',
 'carwidth',
 'carheight',
 'curbweight',
 'enginetype',
 'cylindernumber',
 'enginesize',
 'fuelsystem',
 'boreratio',
 'stroke',
 'compressionratio',
 'horsepower',
 'peakrpm',
 'citympg',
 'highwaympg',
 'price',
 'Make']

##### Generate four datasets
   * Based on One_Hot_Encoding 
   * Based on Label Encoder
   * Based on Embedding layer
   * And mixture of One_Hot_Encoding for Binary Encode and Embedding layer

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   symboling         205 non-null    int64  
 1   fueltype          205 non-null    object 
 2   aspiration        205 non-null    object 
 3   doornumber        205 non-null    object 
 4   carbody           205 non-null    object 
 5   drivewheel        205 non-null    object 
 6   enginelocation    205 non-null    object 
 7   wheelbase         205 non-null    float64
 8   carlength         205 non-null    float64
 9   carwidth          205 non-null    float64
 10  carheight         205 non-null    float64
 11  curbweight        205 non-null    int64  
 12  enginetype        205 non-null    object 
 13  cylindernumber    205 non-null    object 
 14  enginesize        205 non-null    int64  
 15  fuelsystem        205 non-null    object 
 16  boreratio         205 non-null    float64
 1

In [18]:
# Get all the columns to encode
colEncode = []
for col in df.columns:
    if df[col].dtype == object:
        colEncode.append(col)

In [19]:
# Validate
colEncode

['fueltype',
 'aspiration',
 'doornumber',
 'carbody',
 'drivewheel',
 'enginelocation',
 'enginetype',
 'cylindernumber',
 'fuelsystem',
 'Make']

In [20]:
# Based on One_Hot_Encoding
dfOneHot = pd.get_dummies(data = df, columns = colEncode)

In [21]:
dfOneHot.head()

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,...,Make_porsche,Make_renault,Make_saab,Make_subaru,Make_toyota,Make_toyouta,Make_vokswagen,Make_volkswagen,Make_volvo,Make_vw
0,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,0,0,0,0,0,0,0,0
1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,0,0,0,0,0,0,0,0
2,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,...,0,0,0,0,0,0,0,0,0,0
3,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,...,0,0,0,0,0,0,0,0,0,0
4,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Based on Label Encoder
from data_preprocessing import LabelEncode
le = LabelEncode(df, columns = colEncode)
dfLabelEncode = le.LabelEncode()

In [23]:
df.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,Make
0,3,1,0,1,0,2,0,88.6,168.8,64.1,...,5,3.47,2.68,9.0,111,5000,21,27,13495.0,1
1,3,1,0,1,0,2,0,88.6,168.8,64.1,...,5,3.47,2.68,9.0,111,5000,21,27,16500.0,1
2,1,1,0,1,2,2,0,94.5,171.2,65.5,...,5,2.68,3.47,9.0,154,5000,19,26,16500.0,1
3,2,1,0,0,3,1,0,99.8,176.6,66.2,...,5,3.19,3.4,10.0,102,5500,24,30,13950.0,2
4,2,1,0,0,3,0,0,99.4,176.6,66.4,...,5,3.19,3.4,8.0,115,5500,18,22,17450.0,2


In [24]:
df['symboling'].unique()

array([ 3,  1,  2,  0, -1, -2], dtype=int64)

In [25]:
dfLabelEncode.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,Make
0,3,1,0,1,0,2,0,88.6,168.8,64.1,...,5,3.47,2.68,9.0,111,5000,21,27,13495.0,1
1,3,1,0,1,0,2,0,88.6,168.8,64.1,...,5,3.47,2.68,9.0,111,5000,21,27,16500.0,1
2,1,1,0,1,2,2,0,94.5,171.2,65.5,...,5,2.68,3.47,9.0,154,5000,19,26,16500.0,1
3,2,1,0,0,3,1,0,99.8,176.6,66.2,...,5,3.19,3.4,10.0,102,5500,24,30,13950.0,2
4,2,1,0,0,3,0,0,99.4,176.6,66.4,...,5,3.19,3.4,8.0,115,5500,18,22,17450.0,2


In [26]:
# Based on Embedding layer

In [27]:
# Mixture of One_Hot_Encoding for Binary Encode and Embedding layer

##### Scale the DataSets

In [28]:
# Scale with Normalization, StandardScaler 

In [29]:
from data_preprocessing import NormalizationScaler, StandardScaler
ns = NormalizationScaler()
sc = StandardScaler()

In [30]:
# for One_Hot_Encode data
for col in dfOneHot:
    dfOneHot[col] = ns.fit_transform(dfOneHot[col])
    
#for col in dfOneHot:
#    dfOneHot[col] = sc.fit_transform(dfOneHot[col])

In [31]:
# for One_Hot_Encode data
#for col in dfLabelEncode:
#    dfLabelEncode[col] = ns.fit_transform(dfLabelEncode[col])
    
for col in dfLabelEncode:
    dfLabelEncode[col] = sc.fit_transform(dfLabelEncode[col])

In [32]:
dfOneHot

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,...,Make_porsche,Make_renault,Make_saab,Make_subaru,Make_toyota,Make_toyouta,Make_vokswagen,Make_volkswagen,Make_volvo,Make_vw
0,1.0,0.058309,0.413433,0.316667,0.083333,0.411171,0.260377,0.664286,0.290476,0.12500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.058309,0.413433,0.316667,0.083333,0.411171,0.260377,0.664286,0.290476,0.12500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.6,0.230321,0.449254,0.433333,0.383333,0.517843,0.343396,0.100000,0.666667,0.12500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.8,0.384840,0.529851,0.491667,0.541667,0.329325,0.181132,0.464286,0.633333,0.18750,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.8,0.373178,0.529851,0.508333,0.541667,0.518231,0.283019,0.464286,0.633333,0.06250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0.2,0.655977,0.711940,0.716667,0.641667,0.567882,0.301887,0.885714,0.514286,0.15625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
201,0.2,0.655977,0.711940,0.708333,0.641667,0.605508,0.301887,0.885714,0.514286,0.10625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
202,0.2,0.655977,0.711940,0.716667,0.641667,0.591156,0.422642,0.742857,0.380952,0.11250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
203,0.2,0.655977,0.711940,0.716667,0.641667,0.670675,0.316981,0.335714,0.633333,1.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
dfOneHot.describe()

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,...,Make_porsche,Make_renault,Make_saab,Make_subaru,Make_toyota,Make_toyouta,Make_vokswagen,Make_volkswagen,Make_volvo,Make_vw
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,...,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.566829,0.354419,0.49178,0.467317,0.49374,0.414106,0.248707,0.564111,0.564483,0.196409,...,0.019512,0.009756,0.029268,0.058537,0.15122,0.004878,0.004878,0.043902,0.053659,0.009756
std,0.249061,0.175562,0.184139,0.178767,0.203627,0.201971,0.157142,0.19346,0.149332,0.248253,...,0.138655,0.098531,0.16897,0.23533,0.35914,0.069843,0.069843,0.20538,0.225894,0.098531
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.4,0.230321,0.376119,0.316667,0.35,0.254849,0.135849,0.435714,0.495238,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.6,0.303207,0.479104,0.433333,0.525,0.359193,0.222642,0.55,0.580952,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.8,0.460641,0.626866,0.55,0.641667,0.561288,0.301887,0.742857,0.638095,0.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
dfLabelEncode.describe()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,Make
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,...,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,-4.9824640000000004e-17,7.906954e-17,9.531671e-17,-5.957294e-17,-2.919074e-16,-1.64638e-16,-1.371532e-16,-2.013999e-14,-1.01144e-14,1.430401e-14,...,1.635548e-16,-5.910719e-15,1.775138e-14,-5.121919e-16,1.775003e-16,3.103209e-16,1.223953e-16,1.792604e-16,1.353931e-16,3.4660620000000004e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.275862,-3.033954,-0.4681493,-0.8824914,-3.043525,-2.385652,-0.1215691,-2.018771,-2.670706,-2.614113,...,-1.61616,-2.915911,-3.780057,-0.7911643,-1.419099,-2.044342,-1.867815,-2.142067,-1.021262,-1.932473
25%,-0.669832,0.327995,-0.4681493,-0.8824914,-0.7154554,-0.5876422,-0.1215691,-0.7068655,-0.6281176,-0.8427194,...,-1.119439,-0.6636894,-0.463699,-0.3883487,-0.8627587,-0.6816179,-0.9506844,-0.8351509,-0.6870462,-0.9975105
50%,0.133183,0.327995,-0.4681493,-0.8824914,0.4485792,-0.5876422,-0.1215691,-0.2917055,-0.06883752,-0.1901008,...,0.867444,-0.0729428,0.110286,-0.2876448,-0.2305542,0.1569818,-0.1864087,-0.1090867,-0.3732339,-0.06254802
75%,0.9361979,0.327995,-0.4681493,1.127628,0.4485792,1.210367,-0.1215691,0.6050399,0.7336078,0.4625179,...,0.867444,0.9239421,0.4929427,-0.1869408,0.3004976,0.7859315,0.7307221,0.4717647,0.4038489,1.005981
max,1.739213,0.327995,2.125651,1.127628,1.612614,1.210367,8.185651,3.677223,2.759985,2.979761,...,1.860885,2.253122,2.916435,3.236992,4.650065,3.092081,3.63497,3.376022,4.021014,1.673811
