In [62]:
import numpy as np
import pandas as pd
import pickle as pk
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error



In [63]:
file = open('Data/vehicles_new', 'rb')
df = pk.load(file)
file.close()

In [64]:
df.head

<bound method NDFrame.head of         price  year manufacturer                model  condition    cylinders  \
1        8750  2013      hyundai               sonata  excellent  4 cylinders   
2       10900  2013       toyota                prius       good  4 cylinders   
5       13995  2012         ford                f-150       good  6 cylinders   
6        7995  2010    chevrolet              equinox       good  4 cylinders   
7        8995  2011    chevrolet             traverse       good  6 cylinders   
...       ...   ...          ...                  ...        ...          ...   
423843  29500  2015       toyota  tacoma trd off road   like new  6 cylinders   
423845   1600  2004        volvo                 xc70  excellent  5 cylinders   
423852   1600  2006      hyundai               sonata       fair  6 cylinders   
423853   9000  2003       toyota      sequoia limited  excellent  8 cylinders   
423854    700  1994         ford                f-150       fair  6 cylinders  

In [65]:
df.columns

Index(['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'drive',
       'paint_color'],
      dtype='object')

In [66]:
condition = pd.get_dummies(df['condition'])
df = df.drop('condition',axis=1)
df = df.join(condition)

In [67]:
df['cylinders'] = df['cylinders'].str.strip('cylinders')

In [68]:
paint = pd.get_dummies(df['paint_color'])
df = df.drop('paint_color',axis=1)
df = df.join(paint)

In [69]:
fuel = pd.get_dummies(df['fuel'])
df = df.drop('fuel',axis=1)
df = df.join(fuel)

In [70]:
title = pd.get_dummies(df['title_status'])
df = df.drop('title_status',axis=1)
df = df.join(title)

In [71]:
make = pd.get_dummies(df['manufacturer'])
df = df.drop('manufacturer',axis=1)
df = df.join(make)

In [72]:
model = pd.get_dummies(df['model'])
df = df.drop('model',axis=1)
df = df.join(model)

In [73]:
trans = pd.get_dummies(df['transmission'])
df = df.drop('transmission',axis=1)
df = df.join(trans)

In [74]:
drive = pd.get_dummies(df['drive'])
df = df.drop('drive',axis=1)
df = df.join(drive)

In [75]:
scl = StandardScaler()
odo_scale = df[['odometer']]
odo_scale = scl.fit_transform(odo_scale)
df['odometer'] = odo_scale

In [76]:
cyl_scale = df[['cylinders']]
cyl_scale = scl.fit_transform(cyl_scale)
df['cylinders'] = cyl_scale

In [77]:
df

Unnamed: 0,price,year,cylinders,odometer,excellent,fair,good,like new,new,salvage,...,z4,z4 3.0i,z4 3.0si,zdx,zephyr,automatic,manual,4wd,fwd,rwd
1,8750,2013,-1.106740,-0.220969,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,10900,2013,-1.106740,-0.205092,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,13995,2012,0.133415,0.561928,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
6,7995,2010,-1.106740,-0.082152,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
7,8995,2011,0.133415,0.478877,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423843,29500,2015,0.133415,-0.347896,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
423845,1600,2004,-0.486663,1.395079,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
423852,1600,2006,0.133415,0.333874,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
423853,9000,2003,1.373569,0.334035,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [78]:
y = df[['price']]
X = df.drop(columns=['price'])

In [79]:
train, test, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
train, valid, train_y, valid_y = train_test_split(train, train_y, test_size=0.02, random_state=42)

In [81]:
print(train.shape)
print(train_y.shape)
print(valid.shape)
print(valid_y.shape)
print(test.shape)
print(test_y.shape)

(53975, 2414)
(53975, 1)
(1102, 2414)
(1102, 1)
(13770, 2414)
(13770, 1)


In [82]:
lin_reg = LinearRegression()
lin_reg.fit(train, train_y)

LinearRegression()

In [83]:
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train, train_y)

DecisionTreeRegressor(random_state=42)

In [84]:
predictions = lin_reg.predict(train)
lin_mse = mean_squared_error(train_y, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

10659625.859908745

In [85]:
lin_reg.score(train, train_y)

0.013630962397797597

In [87]:
tree_reg.score(valid, valid_y)

0.048106687192195796

In [93]:
predictions = tree_reg.predict(train)
lin_mse = mean_squared_error(train_y, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

726.3901179220035