# Classifiying Car Prices
## Chris Meehan
## 11/15/2020
## COMP740 Machine Learning - Fall 2020

```project_classification.ipynb``` - Data Encoding, Scaling, and Classification

Original Data Source: https://www.kaggle.com/austinreese/craigslist-carstrucks-data

In [1]:
import numpy as np
import pandas as pd
import pickle as pk
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier

In [2]:
file = open('Data/vehicles_new', 'rb')
df = pk.load(file)
file.close()

In [3]:
df.head

<bound method NDFrame.head of         price  year manufacturer     model  condition cylinders    fuel  \
1        8750  2013      hyundai    sonata  excellent        4      gas   
2       10900  2013       toyota     prius       good        4   hybrid   
5       13995  2012         ford     f-150       good        6      gas   
6        7995  2010    chevrolet   equinox       good        4      gas   
7        8995  2011    chevrolet  traverse       good        6      gas   
...       ...   ...          ...       ...        ...       ...     ...   
423823   9584  2012       toyota     camry  excellent        4      gas   
423824   1000  2004         ford     f-150       fair        8      gas   
423825  11750  2013        honda  civic ex  excellent        4      gas   
423852   1600  2006      hyundai    sonata       fair        6      gas   
423854    700  1994         ford     f-150       fair        6      gas   

        odometer title_status transmission drive paint_color  
1     

In [4]:
df.columns

Index(['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'drive',
       'paint_color'],
      dtype='object')

## One Hot Encoding

In [5]:
condition = pd.get_dummies(df['condition'])
df = df.drop('condition',axis=1)
df = df.join(condition)

In [6]:
paint = pd.get_dummies(df['paint_color'])
df = df.drop('paint_color',axis=1)
df = df.join(paint)

In [7]:
fuel = pd.get_dummies(df['fuel'])
df = df.drop('fuel',axis=1)
df = df.join(fuel)

In [8]:
title = pd.get_dummies(df['title_status'])
df = df.drop('title_status',axis=1)
df = df.join(title)

In [9]:
make = pd.get_dummies(df['manufacturer'])
df = df.drop('manufacturer',axis=1)
df = df.join(make)

In [10]:
model = pd.get_dummies(df['model'])
df = df.drop('model',axis=1)
df = df.join(model)

In [11]:
trans = pd.get_dummies(df['transmission'])
df = df.drop('transmission',axis=1)
df = df.join(trans)

In [12]:
drive = pd.get_dummies(df['drive'])
df = df.drop('drive',axis=1)
df = df.join(drive)

## Scaling

In [13]:
scl = StandardScaler()
odo_scale = df[['odometer']]
odo_scale = scl.fit_transform(odo_scale)
df['odometer'] = odo_scale

In [14]:
cyl_scale = df[['cylinders']]
cyl_scale = scl.fit_transform(cyl_scale)
df['cylinders'] = cyl_scale

In [15]:
df

Unnamed: 0,price,year,cylinders,odometer,excellent,fair,good,like new,new,black,...,x3,x5,xterra,yukon,yukon xl,automatic,manual,4wd,fwd,rwd
1,8750,2013,-1.071751,-0.228601,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,10900,2013,-1.071751,-0.214288,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,13995,2012,0.163814,0.477191,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
6,7995,2010,-1.071751,-0.103456,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
7,8995,2011,0.163814,0.402320,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423823,9584,2012,-1.071751,0.163253,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
423824,1000,2004,1.399380,0.394696,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
423825,11750,2013,-1.071751,-0.468875,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
423852,1600,2006,0.163814,0.271598,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


## Splitting Target

In [16]:
y = df[['price']]
X = df.drop(columns=['price'])

## Train/Test Split

In [17]:
train, test, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
print(train.shape)
print(train_y.shape)
print(test.shape)
print(test_y.shape)

(29053, 218)
(29053, 1)
(7264, 218)
(7264, 1)


## Model 1

In [19]:
lin_reg = LinearRegression()
lin_reg.fit(train, train_y)

LinearRegression()

In [20]:
predictions = lin_reg.predict(train)
lin_mse = mean_squared_error(train_y, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

5420.980512595291

In [21]:
lin_reg.score(train, train_y)

0.625656871393679

## Model 2

In [22]:
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train, train_y)

DecisionTreeRegressor(random_state=42)

In [23]:
tree_reg.score(train, train_y)

0.9967437266932734

In [24]:
tree_reg.score(test, test_y)

0.8394390401668732

In [25]:
predictions = tree_reg.predict(train)
lin_mse = mean_squared_error(train_y, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

505.59566052944575

## Model 3

In [26]:
clf = RandomForestClassifier(max_depth=12, random_state=42)
clf.fit(train, train_y)

RandomForestClassifier(max_depth=12, random_state=42)

In [27]:
clf.score(train, train_y)

0.3959315733314976

Model 2 using a Decision Tree Regressor, seems to have had the best reults. Now I will try to fine tune this model and get better results. 

## Grid Search

In [28]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {   
#         'pca__n_components': [1, 2],     
#         'clf__n_estimators': [1, 3, 5, 10],
# }
# grid_search = GridSearchCV(tree_reg, param_grid, cv=3) 

# # fine-tune the hyperparameters
# grid_search.fit(train, train_y)

# # get the best model
# final_model = grid_search.best_estimator_