In [1]:
import pandas as pd
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
import time

# import data
df = pd.read_csv('../../data/aggregated/processed_data/data-4-7-18-full.csv')
df = df.sample(frac=0.2)

# replace missing values with mean/mode
# Continuous
df['mileage'] = df['mileage'].fillna((df['mileage'].mean()))
df['year'] = df['year'].fillna((df['year'].mean()))
# Categorical
df['make'] = df['make'].fillna(df['make'].value_counts().index[0])
df['model'] = df['model'].fillna(df['model'].value_counts().index[0])
df['state'] = df['state'].fillna(df['state'].value_counts().index[0])
df['transmission'] = df['transmission'].fillna(df['transmission'].value_counts().index[0])

# choose features and train/test split
features = df[['make', 'model', 'mileage', 'state', 'transmission', 'year']]
labels = df[['price']]
features_encoded = pd.get_dummies(features, columns=['make', 'model', 'state', 'transmission'])
X_train, X_test, Y_train, Y_test = train_test_split(features_encoded, labels, test_size=0.2, train_size=0.8)

df.head(5)

Unnamed: 0,listing_id,vin,make,model,year,mileage,transmission,exterior_color,state,price,source
14433,728760404,KMHD35LH1EU234022,Hyundai,Elantra,2014,41874.0,Automatic,White,NV,12200,Cars.com
13133,730092496,KNDPBCA27B7049572,Kia,Sportage,2011,65241.0,Automatic,Blue,OH,10499,Cars.com
28578,728686244,JTDKDTB36D1051010,Toyota,Prius C,2013,87145.0,Automatic,Blue,NH,9995,Cars.com
9034,729490176,1FA6P0H70G5124411,Ford,Fusion,2016,21085.0,Automatic,Gray,TN,15998,Cars.com
39209,5801eeb2-1d37-4afa-b8e2-49ec9acfd4ef,1C4RDJEG0EC464564,Dodge,Durango,2014,77767.0,Automatic,Brilliant Black Crystal Pearlcoat,UT,22493,KSL.com


## First Run

In [2]:
model = KernelRidge()
stime = time.time()
model.fit(X_train, Y_train)
print("Time to fit: %.3fs" % (time.time() - stime))

Time to fit: 14.314s


In [3]:
model.score(X_test, Y_test)

0.6840281774055557

## After Normalization

In [4]:
df['mileage'] = (df['mileage']-df['mileage'].mean())/df['mileage'].std() # This approach puts between 0 and 1, is that an issue?
df['year'] = (df['year']-df['year'].mean())/df['year'].std()
model = KernelRidge()
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.6840281774055557