model
---

This notebook creates a linear regression model from the training data and applies it to the test data.

In [46]:
#import libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score#, train_test_split

In [47]:
train_data_path = '../data/train_processed.csv'
test_data_path = '../data/test_processed.csv'

traindf = pd.read_csv(train_data_path)
testdf = pd.read_csv(test_data_path)
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5971 entries, 0 to 5970
Columns: 244 entries, SellerIsPriv to VehYear_2016
dtypes: float64(244)
memory usage: 11.1 MB


In [61]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 244 entries, SellerIsPriv to Dealer_Listing_Price
dtypes: float64(244)
memory usage: 1.9 MB


In [48]:
#just to double-check, the training and testing data columns are the same.

traincols = traindf.columns.to_list()
traincols.remove('Dealer_Listing_Price')

traincols == testdf.columns.to_list()

True

# predicting dealer listing price

In [49]:
###### train-test split the training data to check
X = traindf.drop(columns = 'Dealer_Listing_Price')
y = traindf['Dealer_Listing_Price']

# actually, no need to tts. Just use cross_val_score. The linear fit is the same.
# Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state = 1916)

In [50]:
#instantiate and fit a linear model to the training data using 5 cv folds

lr = LinearRegression()
cross_val_score(lr, X, y, cv = 5)

array([0.80326272, 0.80303177, 0.82342441, 0.81447254, 0.77250683])

In [51]:
# the fit is O.K. Use the linear regression on the test set and save.

In [52]:
lr.fit(X, y)
preds = lr.predict(testdf)

In [56]:
#the mean and std of the predicted values for the test set are similar to the dealer listing prices in the training set.
preds.mean(), preds.std(), y.mean(), y.std()

(32426.302066692955, 6912.640352068063, 32038.09613130129, 7524.55733995572)

In [58]:
# look at some predictions
testdf['Dealer_Listing_Price'] = preds
testdf['Dealer_Listing_Price']

0      43402.905300
1      21655.212581
2      22108.147940
3      24701.152495
4      40854.385239
           ...     
995    33344.387246
996    35429.419922
997    42498.278455
998    46828.779463
999    39383.038540
Name: Dealer_Listing_Price, Length: 1000, dtype: float64

In [59]:
testdf

Unnamed: 0,SellerIsPriv,SellerRating,SellerRevCnt,VehCertified,VehListdays,VehMileage,SellerCity_Chicago,SellerCity_Battle Creek,SellerCity_Columbus,SellerCity_Louisville,...,VehMake_Jeep,VehPriceLabel_Fair Price,VehPriceLabel_Good Deal,VehPriceLabel_Great Deal,VehYear_2015,VehYear_2017,VehYear_2018,VehYear_2019,VehYear_2016,Dealer_Listing_Price
0,0.0,2.5,59.0,0.0,143.991262,13625.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,43402.905300
1,0.0,4.7,2116.0,0.0,138.770486,42553.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,21655.212581
2,0.0,3.9,46.0,1.0,31.951088,48951.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,22108.147940
3,0.0,4.5,1075.0,0.0,5.950127,44179.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,24701.152495
4,0.0,4.6,162.0,0.0,24.672986,22269.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,40854.385239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,4.8,1081.0,0.0,18.091597,24744.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,33344.387246
996,0.0,4.0,240.0,0.0,167.799676,5699.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,35429.419922
997,0.0,5.0,134.0,0.0,46.215625,17985.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,42498.278455
998,0.0,3.8,7.0,0.0,14.907535,27.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,46828.779463


# predicting trim

In [98]:
#imports
from keras.layers import Dense
from keras.models import Sequential

In [66]:
#load the trim data
trimdata = pd.read_csv('../data/train_trim_data.csv', index_col='Unnamed: 0')
trimdata.head()

Unnamed: 0,Vehicle_Trim
0,High Altitude
1,
2,Laredo
3,Limited
4,Luxury


In [67]:
#slap the trim back on
traindf = traindf.join(trimdata)

In [71]:
#remove nulls. A lot of the trims are missing.
traindf.dropna(inplace=True)

In [78]:
# #get unique trims
# trims = list(traindf['Vehicle_Trim'].unique())

In [133]:
# trims.index('Limited')

In [135]:
# #turn trim strings into a vector and vice versa.

# def encode_trim(some_trim_string, unique_trim_list):
#     encoded_trim_vector = np.full(len(unique_trim_list), 0)
#     trim_index = list(unique_trim_list).index(some_trim_string)
#     encoded_trim_vector[trim_index] = 1
#     return encoded_trim_vector

# def decode_trim(trim_vector, unique_trim_list):
#     trim_index = list(trim_vector).index(1)
#     decoded_trim_string = unique_trim_list[trim_index]
#     return decoded_trim_string


In [134]:
# #test them out
# test_decode = np.full(len(trims), 0)
# test_decode[5] = 1

# test_encode = decode_trim(test_decode, trims)
# test_encode

In [136]:
encode_trim(test_encode, trims)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [137]:
#turn all training trims into vectors

train_trims = pd.get_dummies(traindf['Vehicle_Trim'])

In [150]:
trims = train_trims.columns

In [151]:
num_inputs = len(testdf.columns)
num_outputs = len(trims)

#create a model
model = Sequential()
model.add(Dense(num_inputs, activation='relu'))
model.add(Dense(num_inputs, activation='relu'))
model.add(Dense(num_outputs, activation = 'softmax'))

model.compile(optimizer = 'adam', loss='categorical_crossentropy', metrics = ['accuracy'])

In [152]:
#train it!

X = traindf.drop(columns = ['Dealer_Listing_Price','Vehicle_Trim'])
y = train_trims

In [154]:
epochs = 256
batch_size = 256
history = model.fit(
    X,
    y,
    batch_size = batch_size,
    epochs=epochs,
    validation_split = 0.1,
    verbose = False
)