# Classifiying Car Prices
## Chris Meehan
## 11/15/2020
## COMP740 Machine Learning - Fall 2020

```project_classification.ipynb``` - Data Encoding, Scaling, and Classification

Original Data Source: https://www.kaggle.com/austinreese/craigslist-carstrucks-data

In [66]:
import numpy as np
import pandas as pd
import pickle as pk
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import randint

In [67]:
file = open('Data/vehicles_new', 'rb')
df = pk.load(file)
file.close()

In [68]:
df.head

<bound method NDFrame.head of         price  year manufacturer     model  condition cylinders    fuel  \
1        8750  2013      hyundai    sonata  excellent        4      gas   
2       10900  2013       toyota     prius       good        4   hybrid   
5       13995  2012         ford     f-150       good        6      gas   
6        7995  2010    chevrolet   equinox       good        4      gas   
7        8995  2011    chevrolet  traverse       good        6      gas   
...       ...   ...          ...       ...        ...       ...     ...   
423823   9584  2012       toyota     camry  excellent        4      gas   
423824   1000  2004         ford     f-150       fair        8      gas   
423825  11750  2013        honda  civic ex  excellent        4      gas   
423852   1600  2006      hyundai    sonata       fair        6      gas   
423854    700  1994         ford     f-150       fair        6      gas   

        odometer title_status transmission drive paint_color  
1     

In [69]:
df.columns

Index(['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'drive',
       'paint_color'],
      dtype='object')

## One Hot Encoding

In [70]:
df

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,paint_color
1,8750,2013,hyundai,sonata,excellent,4,gas,90821.0,clean,automatic,fwd,grey
2,10900,2013,toyota,prius,good,4,hybrid,92800.0,clean,automatic,fwd,blue
5,13995,2012,ford,f-150,good,6,gas,188406.0,clean,automatic,4wd,grey
6,7995,2010,chevrolet,equinox,good,4,gas,108124.0,clean,automatic,4wd,grey
7,8995,2011,chevrolet,traverse,good,6,gas,178054.0,clean,automatic,4wd,white
...,...,...,...,...,...,...,...,...,...,...,...,...
423823,9584,2012,toyota,camry,excellent,4,gas,145000.0,clean,automatic,fwd,grey
423824,1000,2004,ford,f-150,fair,8,gas,177000.0,clean,automatic,4wd,blue
423825,11750,2013,honda,civic ex,excellent,4,gas,57600.0,clean,automatic,fwd,silver
423852,1600,2006,hyundai,sonata,fair,6,gas,159980.0,clean,automatic,fwd,blue


In [71]:
condition = pd.get_dummies(df['condition'])
df = df.drop('condition',axis=1)
df = df.join(condition)

In [72]:
paint = pd.get_dummies(df['paint_color'])
df = df.drop('paint_color',axis=1)
df = df.join(paint)

In [73]:
fuel = pd.get_dummies(df['fuel'])
df = df.drop('fuel',axis=1)
df = df.join(fuel)

In [74]:
title = pd.get_dummies(df['title_status'])
df = df.drop('title_status',axis=1)
df = df.join(title)

In [75]:
make = pd.get_dummies(df['manufacturer'])
df = df.drop('manufacturer',axis=1)
df = df.join(make)

In [76]:
model = pd.get_dummies(df['model'])
df = df.drop('model',axis=1)
df = df.join(model)

In [77]:
trans = pd.get_dummies(df['transmission'])
df = df.drop('transmission',axis=1)
df = df.join(trans)

In [78]:
drive = pd.get_dummies(df['drive'])
df = df.drop('drive',axis=1)
df = df.join(drive)

In [79]:
df

Unnamed: 0,price,year,cylinders,odometer,excellent,fair,good,like new,new,black,...,x3,x5,xterra,yukon,yukon xl,automatic,manual,4wd,fwd,rwd
1,8750,2013,4,90821.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,10900,2013,4,92800.0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,13995,2012,6,188406.0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
6,7995,2010,4,108124.0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
7,8995,2011,6,178054.0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423823,9584,2012,4,145000.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
423824,1000,2004,8,177000.0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
423825,11750,2013,4,57600.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
423852,1600,2006,6,159980.0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


## Scaling

In [80]:
scl = StandardScaler()
odo_scale = df[['odometer']]
odo_scale = scl.fit_transform(odo_scale)
df['odometer'] = odo_scale

In [81]:
cyl_scale = df[['cylinders']]
cyl_scale = scl.fit_transform(cyl_scale)
df['cylinders'] = cyl_scale

In [82]:
df

Unnamed: 0,price,year,cylinders,odometer,excellent,fair,good,like new,new,black,...,x3,x5,xterra,yukon,yukon xl,automatic,manual,4wd,fwd,rwd
1,8750,2013,-1.071751,-0.228601,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,10900,2013,-1.071751,-0.214288,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,13995,2012,0.163814,0.477191,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
6,7995,2010,-1.071751,-0.103456,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
7,8995,2011,0.163814,0.402320,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423823,9584,2012,-1.071751,0.163253,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
423824,1000,2004,1.399380,0.394696,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
423825,11750,2013,-1.071751,-0.468875,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
423852,1600,2006,0.163814,0.271598,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


## Splitting Target

In [83]:
y = df[['price']]
X = df.drop(columns=['price'])

## Train/Test Split

In [84]:
train, test, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
print(train.shape)
print(train_y.shape)
print(test.shape)
print(test_y.shape)

(29053, 218)
(29053, 1)
(7264, 218)
(7264, 1)


## Model 1

In [86]:
lin_reg = LinearRegression()
lin_reg.fit(train, train_y)

LinearRegression()

In [87]:
predictions = lin_reg.predict(train)
lin_mse = mean_squared_error(train_y, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

5420.980512595291

In [88]:
lin_reg.score(train, train_y)

0.625656871393679

## Model 2

In [89]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(train, train_y)

DecisionTreeRegressor()

In [90]:
tree_reg.score(train, train_y)

0.9967437266932734

In [91]:
tree_reg.score(test, test_y)

0.845564470006375

In [92]:
predictions = tree_reg.predict(train)
lin_mse = mean_squared_error(train_y, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

505.59566052944575

## Model 3

In [93]:
clf = RandomForestClassifier(max_depth=12, random_state=42)
clf.fit(train, train_y)

RandomForestClassifier(max_depth=12, random_state=42)

In [94]:
clf.score(train, train_y)

0.3959315733314976

Model 2 using a Decision Tree Regressor, seems to have had the best reults. Now I will try to fine tune this model and get better results. 

## Grid Search

In [95]:
param_grid = {   
        'max_depth': [16, 32, 64],     
        'min_samples_split': [5, 10, 15],
        'min_samples_leaf' : [1, 2, 3], 
}
grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train, train_y)
final_model = grid_search.best_estimator_

In [96]:
final_model

DecisionTreeRegressor(max_depth=32, min_samples_leaf=2, min_samples_split=15)

In [97]:
final_model.score(test, test_y)

0.850873700687546

In [98]:
scores = cross_val_score(final_model, test, test_y,
                         scoring="neg_mean_squared_error", cv=10)
clf_rmse_scores = np.sqrt(-scores)

In [99]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [100]:
display_scores(clf_rmse_scores)

Scores: [4433.88805934 4226.49209585 4271.6402489  4316.20419788 4248.78603207
 4458.72230499 4452.71161883 4482.97362977 4502.1960544  3922.75140215]
Mean: 4331.636564417373
Standard deviation: 167.9890267721334


## Random Search

In [101]:
param_distribs = {
        'max_depth': randint(low=1, high=200),
        'min_samples_split': randint(low=1, high=8),
        'min_samples_leaf' : randint(low=1, high=8)
    }

tree_reg = DecisionTreeRegressor(random_state=42)
rnd_search = RandomizedSearchCV(tree_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, random_state=42)
rnd_search.fit(train, train_y)

RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B2409A7730>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B2407DEB20>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001B2407E4970>},
                   random_state=42)

In [102]:
rnd_search.score(test, test_y)

0.8356933136105672

In [103]:
rnd_search.best_estimator_

DecisionTreeRegressor(max_depth=15, min_samples_leaf=3, min_samples_split=5,
                      random_state=42)

## Throw it all together into a pipeline

In [104]:
numeric_features = ['odometer', 'cylinders']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [105]:
categorical_features = ['manufacturer', 'model', 'condition', 'fuel', 'title_status', 'transmission', 'drive', 'paint_color']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder())])

In [118]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [119]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeRegressor(max_depth=32, min_samples_leaf=2, min_samples_split=15,
                      random_state=42))])

## Grab clean data from the pickle file

In [108]:
file = open('Data/vehicles_new', 'rb')
data = pk.load(file)
file.close()

In [109]:
data

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,paint_color
1,8750,2013,hyundai,sonata,excellent,4,gas,90821.0,clean,automatic,fwd,grey
2,10900,2013,toyota,prius,good,4,hybrid,92800.0,clean,automatic,fwd,blue
5,13995,2012,ford,f-150,good,6,gas,188406.0,clean,automatic,4wd,grey
6,7995,2010,chevrolet,equinox,good,4,gas,108124.0,clean,automatic,4wd,grey
7,8995,2011,chevrolet,traverse,good,6,gas,178054.0,clean,automatic,4wd,white
...,...,...,...,...,...,...,...,...,...,...,...,...
423823,9584,2012,toyota,camry,excellent,4,gas,145000.0,clean,automatic,fwd,grey
423824,1000,2004,ford,f-150,fair,8,gas,177000.0,clean,automatic,4wd,blue
423825,11750,2013,honda,civic ex,excellent,4,gas,57600.0,clean,automatic,fwd,silver
423852,1600,2006,hyundai,sonata,fair,6,gas,159980.0,clean,automatic,fwd,blue


In [110]:
y = data[['price']]
X = data.drop(columns=['price'])

In [111]:
train, test, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [120]:
clf.fit(train, train_y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['odometer', 'cylinders']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   One

In [121]:
clf.score(train,train_y)

0.8728292777429425

In [122]:
clf.score(test,test_y)

0.7134572842715283

In [115]:
scores = cross_val_score(clf, train, train_y,
                         scoring="neg_mean_squared_error", cv=10)
clf_rmse_scores = np.sqrt(-scores)

In [116]:
display_scores(clf_rmse_scores)

Scores: [4946.00717097 5117.67314542 4795.67320131 5122.21472662 5047.08115023
 4822.5015245  4938.80748139 4861.83120753 5069.2617497  5122.95291804]
Mean: 4984.400427571452
Standard deviation: 121.44410289962335
