In [1]:
from platform import python_version
import pandas as pd
import numpy as np

import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
print('Python version:', python_version())

%reload_ext watermark
%watermark -a "Fernanda J. Dellajustina" --iversions

Python version: 3.10.4
Author: Fernanda J. Dellajustina

numpy : 1.23.0
sys   : 3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]
joblib: 1.1.0
pandas: 1.4.3



In [3]:
# load dataset
data = pd.read_csv('dataset/cars.csv')

data['MSRP'] = data['MSRP'].map(lambda x: x.lstrip('$').replace(',',''))
data['MSRP'] = pd.to_numeric(data['MSRP'])

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(428, 15)


Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945,"$33,337",3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820,"$21,761",2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,26990,"$24,647",2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,33195,"$30,299",3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755,"$39,014",3.5,6.0,225,18,24,3880,115,197


# Separate dataset into train and test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Invoice','Model', 'MSRP'], axis=1), # predictive variables
    data['MSRP'], # target
    test_size=0.2, # portion of dataset to allocate to test set
    random_state=42, # we are setting the seed here
)

X_train.shape, X_test.shape

((342, 12), (86, 12))

# Feature Engineering

## Target

In [5]:
y_train = np.log(y_train)
y_test = np.log(y_test)

## Missing values

In [6]:
cylinder_mean = dict(round(X_train.groupby(['Type'])['Cylinders'].mean(), 0))

X_train.loc[X_train[X_train['Cylinders'].isnull()].index, 'Cylinders'] = [
        cylinder_mean[cyl] for cyl in X_train[X_train['Cylinders'].isnull()]['Type']]

X_test.loc[X_test[X_test['Cylinders'].isnull()].index, 'Cylinders'] = [
        cylinder_mean[cyl] for cyl in X_test[X_test['Cylinders'].isnull()]['Type']]

## Encoding categorical variables

In [7]:
cat_vars = [var for var in X_train.columns if X_train[var].dtype == 'O']
cat_vars

['Make', 'Type', 'Origin', 'DriveTrain']

In [8]:
encode_labels_dict = {
    'Make': {},
    'Type': {},
    'Origin': {},
    'DriveTrain':{}
}
for var in encode_labels_dict.keys():
    for i, label in enumerate(list(set(X_train[var]))):
        encode_labels_dict[var][label] = i

    X_train[var] = X_train[var].map(encode_labels_dict[var])
    X_test[var] = X_test[var].map(encode_labels_dict[var])
    
X_train.head(10)

Unnamed: 0,Make,Type,Origin,DriveTrain,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
66,23,3,1,0,2.5,6.0,165,19,22,2866,98,163
132,12,4,1,0,3.9,6.0,193,17,23,4275,121,201
223,6,4,2,1,3.0,6.0,215,18,24,3285,105,177
31,28,4,0,2,2.5,6.0,184,19,27,3461,107,176
84,23,1,1,2,5.3,8.0,295,14,18,5678,130,222
357,37,4,2,2,2.5,4.0,165,20,27,3495,104,184
168,21,4,2,0,1.6,4.0,103,29,33,2255,96,167
218,6,3,2,2,4.7,8.0,235,15,19,4740,110,188
414,16,2,0,0,1.8,4.0,170,22,31,3338,106,184
298,31,2,2,0,2.4,4.0,160,25,31,3020,102,181


In [9]:
X_train.shape, X_test.shape

((342, 12), (86, 12))

In [10]:
X_train.columns

Index(['Make', 'Type', 'Origin', 'DriveTrain', 'EngineSize', 'Cylinders',
       'Horsepower', 'MPG_City', 'MPG_Highway', 'Weight', 'Wheelbase',
       'Length'],
      dtype='object')

In [11]:
set(data['Cylinders'])

{3.0, 4.0, 5.0, 6.0, 8.0, 10.0, 12.0, nan, nan}

# Building the model

In [12]:
model = GradientBoostingRegressor(n_estimators = 5000, 
                                   learning_rate = 0.1,
                                   max_depth = 10,
                                   min_samples_leaf = 3,
                                   max_features = 0.1,
                                   loss = 'lad',
                                   random_state = 42)

model.fit(X_train, y_train)                                 



In [13]:
# evaluate the model:
# ====================

# remember that we log transformed the output (MSRP) in our feature engineering.

# In order to get the true performance of the model we need to transform both the target and the predictions
# back to the original cars prices values.

# We will evaluate performance using the mean squared error and the root of the mean squared error and r2

# make predictions for train set
pred = model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(mean_squared_error(np.exp(y_train), np.exp(pred)))))
print('train rmse: {}'.format(int(mean_squared_error(np.exp(y_train), np.exp(pred), squared=False))))
print('train r2: {}'.format(r2_score(np.exp(y_train), np.exp(pred))))
print()

# make predictions for test set
pred = model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(mean_squared_error(np.exp(y_test), np.exp(pred)))))
print('test rmse: {}'.format(int(mean_squared_error(np.exp(y_test), np.exp(pred), squared=False))))
print('test r2: {}'.format(r2_score(np.exp(y_test), np.exp(pred))))
print()

print('Average car price: ', int(np.exp(y_train).median()))

train mse: 6901812
train rmse: 2627
train r2: 0.9821316496480916

test mse: 23686011
test rmse: 4866
test r2: 0.9295586021987212

Average car price:  27269


In [14]:
pickle_out = open("../model/model.pkl", "wb")
pickle.dump(model, pickle_out)
pickle_out.close()