In [259]:
import pandas as pd
import numpy as np

In [260]:
car_prices = pd.read_csv('./car_prices.csv')
car_prices_train = pd.read_csv('./car_prices_train.csv')
car_prices_test = pd.read_csv('./car_prices_test.csv')

car_prices.describe()

Unnamed: 0,year,mileage,price
count,10000.0,10000.0,10000.0
mean,2013.828,55798.059,29376.222
std,2.627376,25128.85138,11660.361084
min,2010.0,10284.0,10022.0
25%,2011.0,35137.5,19071.25
50%,2014.0,55739.5,29418.5
75%,2016.0,77193.75,39133.5
max,2018.0,99783.0,49998.0


In [261]:
car_prices.dtypes

make       object
model      object
year        int64
mileage     int64
price       int64
color      object
state      object
date       object
dtype: object

In [262]:
categorical_columns = [col for col in car_prices.columns if car_prices[col].dtype == 'object']
numerical_columns= [col for col in car_prices.columns if car_prices[col].dtype in ['int64', 'float64']]

categorical_columns, numerical_columns

(['make', 'model', 'color', 'state', 'date'], ['year', 'mileage', 'price'])

In [263]:
## compute the # of nan values

nan_columns = [col for col in car_prices.columns  if car_prices[col].hasnans]

nan_columns # No columns have nan values?

[]

In [264]:
num_unique = {}


for each_col in categorical_columns:
    num_unique[each_col] = car_prices[each_col].nunique()


In [265]:
## format the date column

car_prices_clone = car_prices.copy()

def dateitize_column(dataframe, column_name):
    dataframe[column_name] = dataframe[column_name].astype("datetime64")
    dataframe['{}_year'.format(column_name)] =  dataframe[column_name].map(lambda x: x.year)
    dataframe['{}_month'.format(column_name)] = dataframe[column_name].map(lambda x: x.month)
    dataframe['{}_day'.format(column_name)] = dataframe[column_name].map(lambda x: x.day)
    dataframe['{}_hour'.format(column_name)] = dataframe[column_name].map(lambda x: x.hour)
    dataframe['{}_minute'.format(column_name)] = dataframe[column_name].map(lambda x: x.minute)
    dataframe['{}_second'.format(column_name)] = dataframe[column_name].map(lambda x: x.second)
    dataframe.drop(columns=[column_name], inplace=True)


dateitize_column(car_prices_clone, 'date')

car_prices_clone.dtypes

make           object
model          object
year            int64
mileage         int64
price           int64
color          object
state          object
date_year       int64
date_month      int64
date_day        int64
date_hour       int64
date_minute     int64
date_second     int64
dtype: object

In [266]:
car_prices_clone['mileage_price_ratio'] = car_prices_clone['mileage'] / car_prices_clone['price']

In [267]:
car_prices_clone.dtypes

make                    object
model                   object
year                     int64
mileage                  int64
price                    int64
color                   object
state                   object
date_year                int64
date_month               int64
date_day                 int64
date_hour                int64
date_minute              int64
date_second              int64
mileage_price_ratio    float64
dtype: object

## since the # of unique values is reasonably low, aka low cardinality among the categorical columns, we can use one-hot-encoding

In [268]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [269]:
categorical_columns = [col for col in car_prices_clone.columns if car_prices_clone[col].dtype == 'object']

categorical_columns

['make', 'model', 'color', 'state']

In [270]:
categorical_transformer = ColumnTransformer(transformers=[('categorical', OrdinalEncoder(), categorical_columns)])
categorical_transformer_v2 = ColumnTransformer(transformers=[('categorical', OneHotEncoder(), categorical_columns)])

pipeline = Pipeline(steps=[('transform', categorical_transformer), ('model', RandomForestRegressor(n_estimators=155))])
pipeline_v2 = Pipeline(steps=[('transform', categorical_transformer_v2), ('model', RandomForestRegressor(n_estimators=50))])
pipeline_v3_oe = Pipeline(steps=[('transform', categorical_transformer), ('model', RandomForestRegressor(n_estimators=50))])
pipeline_v3_ohe = Pipeline(steps=[('transform', categorical_transformer_v2), ('model', RandomForestRegressor(n_estimators=50))])

fitting_clone = car_prices_clone.copy()
fitting_wo_date = fitting_clone.copy()

fitting_wo_date.drop(columns=[col for col in fitting_wo_date.columns if 'date' in col], inplace=True)

y = fitting_clone.pop('price')
y_wo = fitting_wo_date.pop('price')

train_x, validation_x, train_y, validation_y = train_test_split(fitting_clone, y, random_state=1)
train_x_wo, validation_x_wo, train_y_wo, validation_y_wo = train_test_split(fitting_wo_date, y_wo, random_state=1)

In [271]:
from sklearn.metrics import mean_squared_error

pipeline.fit(train_x, train_y)
pipeline_v2.fit(train_x, train_y)
pipeline_v3_oe.fit(train_x_wo, train_y_wo)
pipeline_v3_ohe.fit(train_x_wo, train_y_wo)

predictions_oe = pipeline.predict(validation_x)
predictions_ohe = pipeline_v2.predict(validation_x)
predictions_wo_oe = pipeline_v3_oe.predict(validation_x_wo)
predictions_wo_ohe = pipeline_v3_ohe.predict(validation_x_wo)

mean_squared_error(predictions_oe, validation_y), mean_squared_error(predictions_ohe, validation_y), mean_squared_error(predictions_wo_oe, validation_y_wo), mean_squared_error(predictions_wo_ohe, validation_y_wo)

(73501166.73370142, 73374288.06837273, 73470313.06628482, 73709609.93011108)