## Data preparation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import time
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras import Sequential, layers, datasets
from tensorflow.keras.utils import to_categorical
import random
random_state=42
random.seed(random_state)
np.random.seed(random_state)

In [2]:
data=pd.read_csv('/datasets/car_data.csv')

In [3]:
data

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Mileage,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,24/03/2016 11:52,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,24/03/2016 00:00,0,70435,07/04/2016 03:16
1,24/03/2016 10:58,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,24/03/2016 00:00,0,66954,07/04/2016 01:46
2,14/03/2016 12:52,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,14/03/2016 00:00,0,90480,05/04/2016 12:47
3,17/03/2016 16:54,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,17/03/2016 00:00,0,91074,17/03/2016 17:40
4,31/03/2016 17:25,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,31/03/2016 00:00,0,60437,06/04/2016 10:17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354364,21/03/2016 09:50,0,,2005,manual,0,colt,150000,7,petrol,mitsubishi,yes,21/03/2016 00:00,0,2694,21/03/2016 10:42
354365,14/03/2016 17:48,2200,,2005,,0,,20000,1,,sonstige_autos,,14/03/2016 00:00,0,39576,06/04/2016 00:46
354366,05/03/2016 19:56,1199,convertible,2000,auto,101,fortwo,125000,3,petrol,smart,no,05/03/2016 00:00,0,26135,11/03/2016 18:17
354367,19/03/2016 18:57,9200,bus,1996,manual,102,transporter,150000,3,gasoline,volkswagen,no,19/03/2016 00:00,0,87439,07/04/2016 07:15


In [4]:
data.columns=data.columns.str.lower()
data=data.rename(columns={'datecrawled': 'date_crawled', 'price': 'price', 'vehicleType': 'vehicle_type', 
                        'registrationyear': 'registration_year', 'gearbox': 'gearbox', 'power': 'power', 
                        'model': 'model','mileage': 'milage', 'registrationmonth': 'registration_month', 
                        'FuelType': 'fuel_type', 'Brand': 'brand','NotRepaired': 'not_repaired', 
                        'datecreated': 'date_created', 'numberofpictures': 'num_pictures', 
                        'postalcode': 'postal_code', 'lastSeen': 'last_seen', })



In [5]:
print(data.shape)
data.isna().sum()
data["registration_year"].value_counts()
data[data["registration_year"] > 2022]
data[data["registration_year"] < 1960 ]




(354369, 16)


Unnamed: 0,date_crawled,price,vehicletype,registration_year,gearbox,power,model,milage,registration_month,fueltype,brand,notrepaired,date_created,num_pictures,postal_code,lastseen
15,11/03/2016 21:39,450,small,1910,,0,ka,5000,0,petrol,ford,,11/03/2016 00:00,0,24148,19/03/2016 08:46
622,16/03/2016 16:55,0,,1111,,0,,5000,0,,opel,,16/03/2016 00:00,0,44628,20/03/2016 16:44
1928,25/03/2016 15:58,7000,suv,1945,manual,48,other,150000,2,petrol,volkswagen,no,25/03/2016 00:00,0,58135,25/03/2016 15:58
2273,15/03/2016 21:44,1800,convertible,1925,,0,,5000,1,,sonstige_autos,no,15/03/2016 00:00,0,79288,07/04/2016 05:15
3333,15/03/2016 21:36,10500,sedan,1955,manual,30,other,60000,0,petrol,ford,,15/03/2016 00:00,0,53498,07/04/2016 08:16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351299,09/03/2016 21:56,5500,bus,1956,manual,37,,60000,4,petrol,sonstige_autos,no,09/03/2016 00:00,0,1900,06/04/2016 02:17
351682,12/03/2016 00:57,11500,,1800,,16,other,5000,6,petrol,fiat,,11/03/2016 00:00,0,16515,05/04/2016 19:47
353531,16/03/2016 21:56,6000,sedan,1937,manual,38,other,5000,0,petrol,mercedes_benz,,16/03/2016 00:00,0,23936,30/03/2016 18:47
353961,17/03/2016 13:54,200,,1910,,0,,5000,0,petrol,sonstige_autos,,17/03/2016 00:00,0,42289,31/03/2016 22:46


In [6]:
data.duplicated().sum()

262

In [7]:
data= data.drop_duplicates()
data.duplicated().sum()

0

In [8]:
data.info()
data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 354107 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   date_crawled        354107 non-null  object
 1   price               354107 non-null  int64 
 2   vehicletype         316623 non-null  object
 3   registration_year   354107 non-null  int64 
 4   gearbox             334277 non-null  object
 5   power               354107 non-null  int64 
 6   model               334406 non-null  object
 7   milage              354107 non-null  int64 
 8   registration_month  354107 non-null  int64 
 9   fueltype            321218 non-null  object
 10  brand               354107 non-null  object
 11  notrepaired         282962 non-null  object
 12  date_created        354107 non-null  object
 13  num_pictures        354107 non-null  int64 
 14  postal_code         354107 non-null  int64 
 15  lastseen            354107 non-null  object
dtypes:

date_crawled              0
price                     0
vehicletype           37484
registration_year         0
gearbox               19830
power                     0
model                 19701
milage                    0
registration_month        0
fueltype              32889
brand                     0
notrepaired           71145
date_created              0
num_pictures              0
postal_code               0
lastseen                  0
dtype: int64

In [9]:
data.isna().sum()/len(data)

date_crawled          0.000000
price                 0.000000
vehicletype           0.105855
registration_year     0.000000
gearbox               0.056000
power                 0.000000
model                 0.055636
milage                0.000000
registration_month    0.000000
fueltype              0.092879
brand                 0.000000
notrepaired           0.200914
date_created          0.000000
num_pictures          0.000000
postal_code           0.000000
lastseen              0.000000
dtype: float64

In [10]:
print(.10+.056+.056+.09+.20)

0.502


* The portion of missing data is pretty significant overall with it being about 50% of data missing. I will investigate each column further

In [11]:
data["num_pictures"].value_counts()
data=data.drop(["num_pictures"], axis=1)

In [12]:
data['notrepaired'].value_counts()


no     246927
yes     36035
Name: notrepaired, dtype: int64

* The num_pictures column does not contain any other value besides '0', which makes this column useless in model training, so I will remove this entire column in preparation for training. 
* In the ['notrepaired'] column, 20% of the data is missing. I chose to replace the missing values with 'no' since the overwhelming majority value is 'no'. 

In [13]:
data['notrepaired'].value_counts()
data["notrepaired"].fillna('no', inplace=True)
data['notrepaired'].isna().sum()

0

In [14]:
data=data.query('registration_year < 2019 and registration_year > 1960')
data.isna().sum() / len(data)

date_crawled          0.000000
price                 0.000000
vehicletype           0.104973
registration_year     0.000000
gearbox               0.055016
power                 0.000000
model                 0.054816
milage                0.000000
registration_month    0.000000
fueltype              0.092057
brand                 0.000000
notrepaired           0.000000
date_created          0.000000
postal_code           0.000000
lastseen              0.000000
dtype: float64

* I removed rows which contained values in registration_year column that were either irrelevant/didn't make sense which included the above query.

In [15]:
data["fueltype"].value_counts()

petrol      215822
gasoline     98627
lpg           5306
cng            563
hybrid         232
other          197
electric        90
Name: fueltype, dtype: int64

In [16]:
data["postal_code"].value_counts()
inaccurate_postal= data.query('postal_code < 10000')
len(inaccurate_postal)/len(data)

0.05115078657599606

* There are about 5% of rows in the data that contain postal codes that are not 5-digits, and thus inaccurate.

In [17]:
data.corr()

Unnamed: 0,price,registration_year,power,milage,registration_month,postal_code
price,1.0,0.389955,0.159506,-0.335997,0.110227,0.075899
registration_year,0.389955,1.0,0.034142,-0.200124,0.042043,0.023928
power,0.159506,0.034142,1.0,0.023,0.042721,0.021489
milage,-0.335997,-0.200124,0.023,1.0,0.005819,-0.008513
registration_month,0.110227,0.042043,0.042721,0.005819,1.0,0.013642
postal_code,0.075899,0.023928,0.021489,-0.008513,0.013642,1.0


* It also looks like postal code doesn't seem to correlate with price (has the lowest corr value compared to other factors), which means it might be best to just eliminate the entire column. 

In [18]:
data.drop(["postal_code"], axis=1, inplace=True)
data.columns

Index(['date_crawled', 'price', 'vehicletype', 'registration_year', 'gearbox',
       'power', 'model', 'milage', 'registration_month', 'fueltype', 'brand',
       'notrepaired', 'date_created', 'lastseen'],
      dtype='object')

In [19]:
data.drop(["date_crawled"], axis=1, inplace=True)
data.drop(["date_created"], axis=1, inplace=True)


In [20]:
data.drop(["lastseen"], axis=1, inplace=True)

* The columns including dates such as [date_crawled], [date_created] and [last_seen] will not be purposeful in model training, so they have also been removed.

In [21]:
data.columns

Index(['price', 'vehicletype', 'registration_year', 'gearbox', 'power',
       'model', 'milage', 'registration_month', 'fueltype', 'brand',
       'notrepaired'],
      dtype='object')

In [22]:
data.isna().sum()
data[data["price"] == 0]

Unnamed: 0,price,vehicletype,registration_year,gearbox,power,model,milage,registration_month,fueltype,brand,notrepaired
7,0,sedan,1980,manual,50,other,40000,7,petrol,volkswagen,no
40,0,,1990,,0,corsa,150000,1,petrol,opel,no
111,0,,2017,manual,0,golf,5000,12,petrol,volkswagen,no
115,0,small,1999,,0,,5000,0,petrol,volkswagen,no
152,0,bus,2004,manual,101,meriva,150000,10,lpg,opel,yes
...,...,...,...,...,...,...,...,...,...,...,...
354205,0,,2000,manual,65,corsa,150000,0,,opel,yes
354238,0,small,2002,manual,60,fiesta,150000,3,petrol,ford,no
354248,0,small,1999,manual,53,swift,150000,3,petrol,suzuki,no
354277,0,small,1999,manual,37,arosa,150000,7,petrol,seat,yes


* There are 10608 rows that have a price value of '0'. While the reasons may vary as to why the price of these cars are listed as 0 (i.e., repo, error, etc.), it would not be meaningful to use these rows for model training/analysis. 

In [23]:
data= data.loc[data["price"] != 0]

In [24]:
data.loc[data["price"] == 0]

Unnamed: 0,price,vehicletype,registration_year,gearbox,power,model,milage,registration_month,fueltype,brand,notrepaired


In [25]:
data['model'].isna().sum() / len(data)


0.050434853643522126

In [26]:
data['model'].nunique()

250

Since there are hundreds of different unique values for [model] and only 5% of this column is missing data, I will drop these missing values. (Guessing the model type of each type would not be helpful here)

In [27]:
data.dropna(subset=['model'], inplace=True)
data["model"].isna().sum()

0

In [28]:
data.isna().sum()

price                     0
vehicletype           28002
registration_year         0
gearbox               14122
power                     0
model                     0
milage                    0
registration_month        0
fueltype              23407
brand                     0
notrepaired               0
dtype: int64

In [29]:
data.isnull().sum() * 100 / len(data)

price                 0.000000
vehicletype           8.603505
registration_year     0.000000
gearbox               4.338929
power                 0.000000
model                 0.000000
milage                0.000000
registration_month    0.000000
fueltype              7.191709
brand                 0.000000
notrepaired           0.000000
dtype: float64

In [30]:
data['gearbox'].value_counts(dropna=False)

manual    249596
auto       61754
NaN        14122
Name: gearbox, dtype: int64

In [31]:
data['vehicletype'].value_counts(dropna=False)

sedan          86207
small          74807
wagon          61646
NaN            28002
bus            27400
convertible    18998
coupe          14744
suv            11039
other           2629
Name: vehicletype, dtype: int64

In [32]:
data["fueltype"].value_counts(dropna=False)

petrol      201885
gasoline     94419
NaN          23407
lpg           4842
cng            532
hybrid         204
other          118
electric        65
Name: fueltype, dtype: int64

In [33]:
data['vehicletype'].fillna(data.groupby('model')['vehicletype'].
                          transform(lambda x:x.value_counts().index[0]), inplace=True)


The [vehicletype] column is missing 8.6% of data. I will fill missing values for vehicle type based on model since the model can 

In [34]:
data['vehicletype'].value_counts(dropna=False)

sedan          99170
small          82913
wagon          64670
bus            30317
convertible    19226
coupe          15081
suv            11464
other           2631
Name: vehicletype, dtype: int64

In [35]:
data.isna().sum()

price                     0
vehicletype               0
registration_year         0
gearbox               14122
power                     0
model                     0
milage                    0
registration_month        0
fueltype              23407
brand                     0
notrepaired               0
dtype: int64

Since gearbox and fueltype is also related to car model, I will also fill in missing values for [gearbox] and [fueltype] based on model.

In [36]:
data['gearbox'].fillna(data.groupby('model')['gearbox'].
                          transform(lambda x:x.value_counts().index[0]), inplace=True)


In [37]:
data.isna().sum()

price                     0
vehicletype               0
registration_year         0
gearbox                   0
power                     0
model                     0
milage                    0
registration_month        0
fueltype              23407
brand                     0
notrepaired               0
dtype: int64

In [38]:
data['fueltype'].value_counts(dropna=False)

petrol      201885
gasoline     94419
NaN          23407
lpg           4842
cng            532
hybrid         204
other          118
electric        65
Name: fueltype, dtype: int64

In [39]:
data['fueltype'].fillna(data.groupby('model')['fueltype'].
                          transform(lambda x:x.value_counts().index[0]), inplace=True)


In [40]:
data.isna().sum()

price                 0
vehicletype           0
registration_year     0
gearbox               0
power                 0
model                 0
milage                0
registration_month    0
fueltype              0
brand                 0
notrepaired           0
dtype: int64

* There are no more missing values in the dataset. 

* Summary:
    * We dropped several columns, 'date_crawled', 'date_created', 'last_seen', 'num_pictures', 'postal_code', that will not be useful for our analysis.
    * We eliminated rows where the registration_year > 2016 (the year of the data) or < 1960 (unlikely).
    * We eliminated rows where the price = 0.
    * We eliminated the rows where a value for model was missing.
    * We filled in missing rows in vehicle_type, gearbox, fuel_type, power based on model.
    * We changed the zero values in registration_month to a randomly assigned # (1-12).
    * We replaced the NaN values in not_repaired with 'no'.
    * We verified that no missing values remain.
    * We note our preparation eliminated almost 12% of the data, but feel confident in the deletion choices.

## Model training

In [41]:
data['vehicletype']

0               sedan
2                 suv
3               small
4               small
5               sedan
             ...     
354362          sedan
354363            bus
354366    convertible
354367            bus
354368          wagon
Name: vehicletype, Length: 325472, dtype: object

In [42]:
#change into categories for lightBGM and CatBoost
list_cat = ['vehicletype', 'gearbox', 'model', 'fueltype', 'brand', 'notrepaired']
for col in list_cat:
    data[col] = data[col].astype('category')

In [43]:
# define target and features
target= data["price"]
features=data.drop(['price'], axis=1)

In [44]:
#convert features to numeric 
feat_ohe=pd.get_dummies(features, drop_first=True)

In [45]:
#split data for training with OHE
features_train, features_valid, target_train, target_valid=train_test_split(feat_ohe, target, test_size=.4, random_state=12345)
features_valid, features_test, target_valid, target_test=train_test_split(features_valid, target_valid, test_size=.5, shuffle=False)


In [46]:
print('Size of Features Train', len(features_train)/len(data))
print('Size of Target Train', len(target_train)/len(data)) 
print('Size of Features Valid', len(features_valid)/len(data))
print('Size of Target Valid', len(target_valid)/len(data))
print('Size of Features Test', len(features_test)/ len(data))
print('Size of Target Test', len(target_test)/len(data))

Size of Features Train 0.5999993855078163
Size of Target Train 0.5999993855078163
Size of Features Valid 0.19999877101563268
Size of Target Valid 0.19999877101563268
Size of Features Test 0.20000184347655098
Size of Target Test 0.20000184347655098


In [47]:
#split data for training without OHE
features_train1, features_valid1, target_train1, target_valid1=train_test_split(features, target, test_size=.4, random_state=12345)
features_valid1, features_test1, target_valid1, target_test1 = train_test_split(
    features_valid1, target_valid1, test_size=0.5, shuffle = False)

In [48]:
print('Size of Features Train', len(features_train1)/len(data))
print('Size of Target Train', len(target_train1)/len(data)) 
print('Size of Features Valid', len(features_valid1)/len(data))
print('Size of Target Valid', len(target_valid1)/len(data))
print('Size of Features Test', len(features_test1)/ len(data))
print('Size of Target Test', len(target_test1)/len(data))

Size of Features Train 0.5999993855078163
Size of Target Train 0.5999993855078163
Size of Features Valid 0.19999877101563268
Size of Target Valid 0.19999877101563268
Size of Features Test 0.20000184347655098
Size of Target Test 0.20000184347655098


In [49]:
def find_rmse(target_test, predictions):
    return round(mean_squared_error(target_test, predictions) ** 0.5, 2)
rmse = make_scorer(find_rmse, greater_is_better=False)

In [50]:
%%time
# linear regression with default parameters
lr_model = LinearRegression()

start = time.time()
lr_model.fit(features_train, target_train)
end = time.time()
lrtt = end - start

start = time.time()
predicted_valid = lr_model.predict(features_valid)
end = time.time()
lrpt = end - start

lr_rmse_calc = mean_squared_error(target_valid, predicted_valid)**0.5

print('Linear Regression - Sanity Check')
print('RMSE:', lr_rmse_calc, 'Training time:', lrtt, 'Prediction time:', lrpt)

Linear Regression - Sanity Check
RMSE: 2895.0483751307856 Training time: 21.69653344154358 Prediction time: 0.3407108783721924
CPU times: user 13 s, sys: 9 s, total: 22 s
Wall time: 22.1 s


* When it comes to timing the cells, "%%time" tells us the total cell time. However, to find the time for training and prediction, we just found those by subtracting the start and end times of each respective variable. 

In [51]:
%%time
# random forest regressor with default parameters
rf_model = RandomForestRegressor(random_state=42)

start = time.time()
rf_model.fit(features_train, target_train)
end = time.time()
rftt = end - start

start = time.time()
predicted_valid = rf_model.predict(features_valid)
end = time.time()
rfpt = end - start

rf_rmse_calc = mean_squared_error(target_valid, predicted_valid)**0.5

print('Random Forest Regressor')
print('RMSE:', rf_rmse_calc, 'Training time:', rftt, 'Prediction time:', rfpt)

Random Forest Regressor
RMSE: 1637.1681637261606 Training time: 446.7893626689911 Prediction time: 4.471585512161255
CPU times: user 7min 28s, sys: 2.2 s, total: 7min 30s
Wall time: 7min 31s


In [52]:
%%time
# lightGBM with OHE 
lg_model = lgb.LGBMRegressor(random_state=42)

start = time.time()
lg_model.fit(features_train, target_train)
end = time.time()
lgohett = end - start

start = time.time()
predicted_valid = lg_model.predict(features_valid)
end = time.time()
lgohept = end - start

lgohe_rmse_calc = mean_squared_error(target_valid, predicted_valid)**0.5

print('LightGBM with OHE')
print('RMSE:', lgohe_rmse_calc, 'Training time:', lgohett, 'Prediction time:', lgohept)

LightGBM with OHE
RMSE: 1726.6530504000457 Training time: 322.7245354652405 Prediction time: 0.7187302112579346
CPU times: user 5min 15s, sys: 5.82 s, total: 5min 20s
Wall time: 5min 23s


In [53]:
%%time
# lightGBM without OHE 
lg_model_2 = lgb.LGBMRegressor(random_state=42)

start = time.time()
lg_model_2.fit(features_train1, target_train1, categorical_feature=list_cat)
end = time.time()
lgtt = end - start

start = time.time()
predicted_valid = lg_model_2.predict(features_valid1)
end = time.time()
lgpt = end - start

lg_rmse_calc = mean_squared_error(target_valid1, predicted_valid)**0.5

print('LightGBM without OHE')
print('RMSE:', lg_rmse_calc, 'Training time:', lgtt, 'Prediction time:', lgpt)



LightGBM without OHE
RMSE: 1661.1798804914488 Training time: 304.41242814064026 Prediction time: 0.6013398170471191
CPU times: user 4min 57s, sys: 4.99 s, total: 5min 2s
Wall time: 5min 5s


In [54]:
%%time
# CatBoost with OHE 
cb_model = CatBoostRegressor(random_state=42)

start = time.time()
cb_model.fit(features_train, target_train)
end = time.time()
cbohett = end - start

start = time.time()
predicted_valid = cb_model.predict(features_valid)
end = time.time()
cbohept = end - start

cbohe_rmse_calc = mean_squared_error(target_valid, predicted_valid)**0.5

print('CatBoost with OHE')
print('RMSE:', cbohe_rmse_calc, 'Training time:', cbohett, 'Prediction time:', cbohept)

Learning rate set to 0.094212
0:	learn: 4254.1960856	total: 99.7ms	remaining: 1m 39s
1:	learn: 4017.7986012	total: 145ms	remaining: 1m 12s
2:	learn: 3808.3650311	total: 186ms	remaining: 1m 1s
3:	learn: 3623.3637312	total: 229ms	remaining: 57.1s
4:	learn: 3464.5604542	total: 275ms	remaining: 54.7s
5:	learn: 3309.7265803	total: 320ms	remaining: 53s
6:	learn: 3172.7539333	total: 364ms	remaining: 51.7s
7:	learn: 3053.0986452	total: 412ms	remaining: 51.1s
8:	learn: 2945.4418364	total: 455ms	remaining: 50s
9:	learn: 2858.1406499	total: 498ms	remaining: 49.3s
10:	learn: 2780.3721119	total: 541ms	remaining: 48.6s
11:	learn: 2705.1779340	total: 583ms	remaining: 48s
12:	learn: 2638.5194748	total: 627ms	remaining: 47.6s
13:	learn: 2582.7812396	total: 671ms	remaining: 47.2s
14:	learn: 2536.6122981	total: 708ms	remaining: 46.5s
15:	learn: 2494.4995530	total: 745ms	remaining: 45.8s
16:	learn: 2450.1961232	total: 782ms	remaining: 45.2s
17:	learn: 2416.5854836	total: 820ms	remaining: 44.7s
18:	learn: 

In [55]:
%%time
# CatBoost without OHE 
cb_model = CatBoostRegressor(random_state=42)

start = time.time()
cb_model.fit(features_train1, target_train1, cat_features=list_cat)
end = time.time()
cbtt = end - start

start = time.time()
predicted_valid = cb_model.predict(features_valid1)
end = time.time()
cbpt = end - start

cb_rmse_calc = mean_squared_error(target_valid1, predicted_valid)**0.5

print('CatBoost without OHE')
print('RMSE:', cb_rmse_calc, 'Training time:', cbtt, 'Prediction time:', cbpt)

Learning rate set to 0.094212
0:	learn: 4262.7670608	total: 236ms	remaining: 3m 55s
1:	learn: 4028.6188093	total: 424ms	remaining: 3m 31s
2:	learn: 3816.5030718	total: 598ms	remaining: 3m 18s
3:	learn: 3617.6850434	total: 790ms	remaining: 3m 16s
4:	learn: 3449.0523481	total: 1.01s	remaining: 3m 20s
5:	learn: 3296.9608510	total: 1.16s	remaining: 3m 12s
6:	learn: 3163.3646527	total: 1.33s	remaining: 3m 9s
7:	learn: 3033.1659313	total: 1.52s	remaining: 3m 8s
8:	learn: 2921.8177580	total: 1.62s	remaining: 2m 58s
9:	learn: 2824.1457560	total: 1.79s	remaining: 2m 57s
10:	learn: 2740.7603936	total: 1.99s	remaining: 2m 58s
11:	learn: 2664.4771051	total: 2.14s	remaining: 2m 56s
12:	learn: 2597.2588642	total: 2.3s	remaining: 2m 54s
13:	learn: 2537.7390073	total: 2.44s	remaining: 2m 51s
14:	learn: 2481.1122667	total: 2.58s	remaining: 2m 49s
15:	learn: 2432.4523037	total: 2.75s	remaining: 2m 49s
16:	learn: 2383.4630254	total: 2.93s	remaining: 2m 49s
17:	learn: 2342.0381238	total: 3.06s	remaining: 

In [56]:
print('Linear Regression - Sanity Check')
print('RMSE:', lr_rmse_calc, 'Training time:', lrtt, 'Prediction time:', lrpt)
print('\nRandom Forest Regressor')
print('RMSE:', rf_rmse_calc, 'Training time:', rftt, 'Prediction time:', rfpt)
print('\nLightGBM with OHE')
print('RMSE:', lgohe_rmse_calc, 'Training time:', lgohett, 'Prediction time:', lgohept)
print('\nLightGBM without OHE')
print('RMSE:', lg_rmse_calc, 'Training time:', lgtt, 'Prediction time:', lgpt)
print('\nCatBoost with OHE')
print('RMSE:', cbohe_rmse_calc, 'Training time:', cbohett, 'Prediction time:', cbohept)
print('\nCatBoost without OHE')
print('RMSE:', cb_rmse_calc, 'Training time:', cbtt, 'Prediction time:', cbpt)

Linear Regression - Sanity Check
RMSE: 2895.0483751307856 Training time: 21.69653344154358 Prediction time: 0.3407108783721924

Random Forest Regressor
RMSE: 1637.1681637261606 Training time: 446.7893626689911 Prediction time: 4.471585512161255

LightGBM with OHE
RMSE: 1726.6530504000457 Training time: 322.7245354652405 Prediction time: 0.7187302112579346

LightGBM without OHE
RMSE: 1661.1798804914488 Training time: 304.41242814064026 Prediction time: 0.6013398170471191

CatBoost with OHE
RMSE: 1639.9786314398411 Training time: 32.960219860076904 Prediction time: 0.16595196723937988

CatBoost without OHE
RMSE: 1635.5431243365203 Training time: 149.42547154426575 Prediction time: 0.37122368812561035


* A few points to highlight:
    * Training times of the Linear regression models approximate from 20 seconds with LightGBM to 500 seconds with CatBoost.
    * When One Hot Encoding is not applied in LightGBM, it results in the lowest RMSE value.
    * All models resulted in a better RMSE than the Linear regression model (which we used as a sanity check), which is to be expected.


# Hyperparameter Tuning for RandomForestRegressor

In [57]:
%%time
rf_model = RandomForestRegressor(random_state=42)
params = { 'n_estimators': range(10, 30, 5) }

best_model = RandomizedSearchCV(rf_model, params, scoring=rmse, cv=5, verbose=10)
best_model.fit(features_train, target_train)  
print('Best parameters:', best_model.best_params_)

predictions = best_model.best_estimator_.predict(features_valid)
print('RMSE:', round(mean_squared_error(target_valid, predictions) ** 0.5, 2))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START n_estimators=10.............................................




[CV 1/5; 1/4] END ...........................n_estimators=10; total time=  40.3s
[CV 2/5; 1/4] START n_estimators=10.............................................
[CV 2/5; 1/4] END ...........................n_estimators=10; total time=  39.4s
[CV 3/5; 1/4] START n_estimators=10.............................................
[CV 3/5; 1/4] END ...........................n_estimators=10; total time=  39.3s
[CV 4/5; 1/4] START n_estimators=10.............................................
[CV 4/5; 1/4] END ...........................n_estimators=10; total time=  39.8s
[CV 5/5; 1/4] START n_estimators=10.............................................
[CV 5/5; 1/4] END ...........................n_estimators=10; total time=  39.9s
[CV 1/5; 2/4] START n_estimators=15.............................................
[CV 1/5; 2/4] END ...........................n_estimators=15; total time=  58.8s
[CV 2/5; 2/4] START n_estimators=15.............................................
[CV 2/5; 2/4] END ..........

# Hyperparameter Tuning for Catboost without applying One Hot Encoding

In [58]:
%%time
# CatBoost without OHE
cb_model_2 = CatBoostRegressor(random_state=42)
params = { 'n_estimators': range(10, 30, 5), 'learning_rate': [.25, .5, .75] }

best_model = RandomizedSearchCV(cb_model_2, params, scoring=rmse, cv=5, verbose=10)
best_model.fit(features_train1, target_train1, cat_features=list_cat) 
predictions = best_model.best_estimator_.predict(features_valid1)

print('CatBoost without OHE')
print('Best parameters:', best_model.best_params_)
print('RMSE:', round(mean_squared_error(target_valid1, predictions) ** 0.5, 2))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.75, n_estimators=20........................
0:	learn: 2888.2671829	total: 57.1ms	remaining: 1.08s
1:	learn: 2385.4416503	total: 120ms	remaining: 1.08s
2:	learn: 2226.4461303	total: 176ms	remaining: 997ms
3:	learn: 2136.5069824	total: 224ms	remaining: 895ms
4:	learn: 2095.6710101	total: 291ms	remaining: 872ms
5:	learn: 2059.3928623	total: 364ms	remaining: 850ms
6:	learn: 2014.1355608	total: 432ms	remaining: 803ms
7:	learn: 1986.4103283	total: 481ms	remaining: 721ms
8:	learn: 1972.8356151	total: 529ms	remaining: 646ms
9:	learn: 1955.0087871	total: 576ms	remaining: 576ms
10:	learn: 1936.6437669	total: 622ms	remaining: 509ms
11:	learn: 1919.0751472	total: 678ms	remaining: 452ms
12:	learn: 1903.2939466	total: 721ms	remaining: 388ms
13:	learn: 1897.2769533	total: 767ms	remaining: 329ms
14:	learn: 1887.7630354	total: 812ms	remaining: 271ms
15:	learn: 1873.9166381	total: 859ms	remaining: 215ms
16:

Hyperparameter Tuning for Catboost with applying One Hot Encoding

In [59]:
%%time
# CatBoost with OHE
cb_model = CatBoostRegressor(random_state=42)
params = { 'n_estimators': range(10, 30, 5), 'learning_rate': [.25, .5, .75] }

best_model = RandomizedSearchCV(cb_model, params, scoring=rmse, cv=5, verbose=10)
best_model.fit(features_train, target_train) 
predictions = best_model.best_estimator_.predict(features_valid)

print('CatBoost with OHE')
print('Best parameters:', best_model.best_params_)
print('RMSE:', round(mean_squared_error(target_valid, predictions) ** 0.5, 2))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.75, n_estimators=20........................
0:	learn: 2841.2899011	total: 21.7ms	remaining: 413ms
1:	learn: 2436.5709283	total: 44ms	remaining: 396ms
2:	learn: 2265.3575526	total: 65.2ms	remaining: 369ms
3:	learn: 2162.6983144	total: 86.4ms	remaining: 345ms
4:	learn: 2108.8099482	total: 107ms	remaining: 320ms
5:	learn: 2070.5866320	total: 136ms	remaining: 318ms
6:	learn: 2026.5392276	total: 159ms	remaining: 295ms
7:	learn: 1995.6288396	total: 180ms	remaining: 269ms
8:	learn: 1972.7736628	total: 202ms	remaining: 247ms
9:	learn: 1961.3991442	total: 223ms	remaining: 223ms
10:	learn: 1946.2171143	total: 244ms	remaining: 200ms
11:	learn: 1931.8543611	total: 271ms	remaining: 180ms
12:	learn: 1917.7677734	total: 299ms	remaining: 161ms
13:	learn: 1906.2539652	total: 330ms	remaining: 141ms
14:	learn: 1890.6916539	total: 362ms	remaining: 121ms
15:	learn: 1880.9367389	total: 387ms	remaining: 96.7ms
1

# Hyperparameter Tuning for LightGBM with One Hot Encoding

In [60]:
%%time
# LIghtGBM with OHE
lg_model = lgb.LGBMRegressor(random_state=42)
params = { 'n_estimators': range(10, 30, 5), 'learning_rate': [.25, .5, .75] }

best_model = RandomizedSearchCV(lg_model, params, scoring=rmse, cv=5, verbose=10)
best_model.fit(features_train, target_train) 
predictions = best_model.best_estimator_.predict(features_valid)

print('LightGBM with OHE')
print('Best parameters:', best_model.best_params_)
print('RMSE:', round(mean_squared_error(target_valid, predictions) ** 0.5, 2))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.5, n_estimators=25.........................
[CV 1/5; 1/10] END .......learning_rate=0.5, n_estimators=25; total time= 1.8min
[CV 2/5; 1/10] START learning_rate=0.5, n_estimators=25.........................
[CV 2/5; 1/10] END .......learning_rate=0.5, n_estimators=25; total time= 1.4min
[CV 3/5; 1/10] START learning_rate=0.5, n_estimators=25.........................
[CV 3/5; 1/10] END .......learning_rate=0.5, n_estimators=25; total time= 1.5min
[CV 4/5; 1/10] START learning_rate=0.5, n_estimators=25.........................
[CV 4/5; 1/10] END .......learning_rate=0.5, n_estimators=25; total time= 1.5min
[CV 5/5; 1/10] START learning_rate=0.5, n_estimators=25.........................
[CV 5/5; 1/10] END .......learning_rate=0.5, n_estimators=25; total time= 1.4min
[CV 1/5; 2/10] START learning_rate=0.5, n_estimators=10.........................
[CV 1/5; 2/10] END .......learning_rate=0.5, n_e

# Hyperparameter Tuning for LightGBM without applying One Hot Encoding 

In [61]:
%%time
# LightGBM without OHE
lg_model_2 = lgb.LGBMRegressor(random_state=42)
params = { 'n_estimators': range(10, 30, 5), 'learning_rate': [.25, .5, .75] }

best_model = RandomizedSearchCV(lg_model_2, params, scoring=rmse, cv=5, verbose=10)
best_model.fit(features_train1, target_train1, categorical_feature=list_cat)  
predictions = best_model.best_estimator_.predict(features_valid1)

print('LightGBM without OHE')
print('Best parameters:', best_model.best_params_)
print('RMSE:', round(mean_squared_error(target_valid1, predictions) ** 0.5, 2))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.75, n_estimators=15........................




[CV 1/5; 1/10] END ......learning_rate=0.75, n_estimators=15; total time= 1.2min
[CV 2/5; 1/10] START learning_rate=0.75, n_estimators=15........................




[CV 2/5; 1/10] END ......learning_rate=0.75, n_estimators=15; total time= 1.0min
[CV 3/5; 1/10] START learning_rate=0.75, n_estimators=15........................




[CV 3/5; 1/10] END ......learning_rate=0.75, n_estimators=15; total time=  49.7s
[CV 4/5; 1/10] START learning_rate=0.75, n_estimators=15........................




[CV 4/5; 1/10] END ......learning_rate=0.75, n_estimators=15; total time= 1.1min
[CV 5/5; 1/10] START learning_rate=0.75, n_estimators=15........................




[CV 5/5; 1/10] END ......learning_rate=0.75, n_estimators=15; total time=  52.1s
[CV 1/5; 2/10] START learning_rate=0.25, n_estimators=15........................




[CV 1/5; 2/10] END ......learning_rate=0.25, n_estimators=15; total time=  51.6s
[CV 2/5; 2/10] START learning_rate=0.25, n_estimators=15........................




[CV 2/5; 2/10] END ......learning_rate=0.25, n_estimators=15; total time= 1.1min
[CV 3/5; 2/10] START learning_rate=0.25, n_estimators=15........................




[CV 3/5; 2/10] END ......learning_rate=0.25, n_estimators=15; total time=  47.3s
[CV 4/5; 2/10] START learning_rate=0.25, n_estimators=15........................




[CV 4/5; 2/10] END ......learning_rate=0.25, n_estimators=15; total time= 1.1min
[CV 5/5; 2/10] START learning_rate=0.25, n_estimators=15........................




[CV 5/5; 2/10] END ......learning_rate=0.25, n_estimators=15; total time=  52.6s
[CV 1/5; 3/10] START learning_rate=0.5, n_estimators=15.........................




[CV 1/5; 3/10] END .......learning_rate=0.5, n_estimators=15; total time=  53.0s
[CV 2/5; 3/10] START learning_rate=0.5, n_estimators=15.........................




[CV 2/5; 3/10] END .......learning_rate=0.5, n_estimators=15; total time=  53.4s
[CV 3/5; 3/10] START learning_rate=0.5, n_estimators=15.........................




[CV 3/5; 3/10] END .......learning_rate=0.5, n_estimators=15; total time= 1.1min
[CV 4/5; 3/10] START learning_rate=0.5, n_estimators=15.........................




[CV 4/5; 3/10] END .......learning_rate=0.5, n_estimators=15; total time=  44.2s
[CV 5/5; 3/10] START learning_rate=0.5, n_estimators=15.........................




[CV 5/5; 3/10] END .......learning_rate=0.5, n_estimators=15; total time=  52.8s
[CV 1/5; 4/10] START learning_rate=0.25, n_estimators=10........................




[CV 1/5; 4/10] END ......learning_rate=0.25, n_estimators=10; total time=  30.3s
[CV 2/5; 4/10] START learning_rate=0.25, n_estimators=10........................




[CV 2/5; 4/10] END ......learning_rate=0.25, n_estimators=10; total time=  35.1s
[CV 3/5; 4/10] START learning_rate=0.25, n_estimators=10........................




[CV 3/5; 4/10] END ......learning_rate=0.25, n_estimators=10; total time=  30.6s
[CV 4/5; 4/10] START learning_rate=0.25, n_estimators=10........................




[CV 4/5; 4/10] END ......learning_rate=0.25, n_estimators=10; total time=  30.1s
[CV 5/5; 4/10] START learning_rate=0.25, n_estimators=10........................




[CV 5/5; 4/10] END ......learning_rate=0.25, n_estimators=10; total time=  35.7s
[CV 1/5; 5/10] START learning_rate=0.75, n_estimators=20........................




[CV 1/5; 5/10] END ......learning_rate=0.75, n_estimators=20; total time= 1.0min
[CV 2/5; 5/10] START learning_rate=0.75, n_estimators=20........................




[CV 2/5; 5/10] END ......learning_rate=0.75, n_estimators=20; total time=   1.9s
[CV 3/5; 5/10] START learning_rate=0.75, n_estimators=20........................




[CV 3/5; 5/10] END ......learning_rate=0.75, n_estimators=20; total time=   1.4s
[CV 4/5; 5/10] START learning_rate=0.75, n_estimators=20........................




[CV 4/5; 5/10] END ......learning_rate=0.75, n_estimators=20; total time=   1.4s
[CV 5/5; 5/10] START learning_rate=0.75, n_estimators=20........................




[CV 5/5; 5/10] END ......learning_rate=0.75, n_estimators=20; total time=   1.1s
[CV 1/5; 6/10] START learning_rate=0.5, n_estimators=25.........................




[CV 1/5; 6/10] END .......learning_rate=0.5, n_estimators=25; total time=   1.6s
[CV 2/5; 6/10] START learning_rate=0.5, n_estimators=25.........................




[CV 2/5; 6/10] END .......learning_rate=0.5, n_estimators=25; total time=   1.5s
[CV 3/5; 6/10] START learning_rate=0.5, n_estimators=25.........................




[CV 3/5; 6/10] END .......learning_rate=0.5, n_estimators=25; total time=   1.3s
[CV 4/5; 6/10] START learning_rate=0.5, n_estimators=25.........................




[CV 4/5; 6/10] END .......learning_rate=0.5, n_estimators=25; total time=   1.4s
[CV 5/5; 6/10] START learning_rate=0.5, n_estimators=25.........................




[CV 5/5; 6/10] END .......learning_rate=0.5, n_estimators=25; total time=   1.3s
[CV 1/5; 7/10] START learning_rate=0.5, n_estimators=10.........................




[CV 1/5; 7/10] END .......learning_rate=0.5, n_estimators=10; total time=   1.0s
[CV 2/5; 7/10] START learning_rate=0.5, n_estimators=10.........................




[CV 2/5; 7/10] END .......learning_rate=0.5, n_estimators=10; total time=   1.1s
[CV 3/5; 7/10] START learning_rate=0.5, n_estimators=10.........................




[CV 3/5; 7/10] END .......learning_rate=0.5, n_estimators=10; total time=   0.9s
[CV 4/5; 7/10] START learning_rate=0.5, n_estimators=10.........................




[CV 4/5; 7/10] END .......learning_rate=0.5, n_estimators=10; total time=   0.9s
[CV 5/5; 7/10] START learning_rate=0.5, n_estimators=10.........................




[CV 5/5; 7/10] END .......learning_rate=0.5, n_estimators=10; total time=   1.0s
[CV 1/5; 8/10] START learning_rate=0.75, n_estimators=25........................




[CV 1/5; 8/10] END ......learning_rate=0.75, n_estimators=25; total time=   1.5s
[CV 2/5; 8/10] START learning_rate=0.75, n_estimators=25........................




[CV 2/5; 8/10] END ......learning_rate=0.75, n_estimators=25; total time=   1.4s
[CV 3/5; 8/10] START learning_rate=0.75, n_estimators=25........................




[CV 3/5; 8/10] END ......learning_rate=0.75, n_estimators=25; total time=   1.3s
[CV 4/5; 8/10] START learning_rate=0.75, n_estimators=25........................




[CV 4/5; 8/10] END ......learning_rate=0.75, n_estimators=25; total time=   1.8s
[CV 5/5; 8/10] START learning_rate=0.75, n_estimators=25........................




[CV 5/5; 8/10] END ......learning_rate=0.75, n_estimators=25; total time=   2.8s
[CV 1/5; 9/10] START learning_rate=0.75, n_estimators=10........................




[CV 1/5; 9/10] END ......learning_rate=0.75, n_estimators=10; total time=   0.9s
[CV 2/5; 9/10] START learning_rate=0.75, n_estimators=10........................




[CV 2/5; 9/10] END ......learning_rate=0.75, n_estimators=10; total time=   2.8s
[CV 3/5; 9/10] START learning_rate=0.75, n_estimators=10........................




[CV 3/5; 9/10] END ......learning_rate=0.75, n_estimators=10; total time=   0.9s
[CV 4/5; 9/10] START learning_rate=0.75, n_estimators=10........................




[CV 4/5; 9/10] END ......learning_rate=0.75, n_estimators=10; total time=   2.8s
[CV 5/5; 9/10] START learning_rate=0.75, n_estimators=10........................




[CV 5/5; 9/10] END ......learning_rate=0.75, n_estimators=10; total time=   1.4s
[CV 1/5; 10/10] START learning_rate=0.25, n_estimators=25.......................




[CV 1/5; 10/10] END .....learning_rate=0.25, n_estimators=25; total time=   1.4s
[CV 2/5; 10/10] START learning_rate=0.25, n_estimators=25.......................




[CV 2/5; 10/10] END .....learning_rate=0.25, n_estimators=25; total time=   1.4s
[CV 3/5; 10/10] START learning_rate=0.25, n_estimators=25.......................




[CV 3/5; 10/10] END .....learning_rate=0.25, n_estimators=25; total time=   1.5s
[CV 4/5; 10/10] START learning_rate=0.25, n_estimators=25.......................




[CV 4/5; 10/10] END .....learning_rate=0.25, n_estimators=25; total time=   3.9s
[CV 5/5; 10/10] START learning_rate=0.25, n_estimators=25.......................




[CV 5/5; 10/10] END .....learning_rate=0.25, n_estimators=25; total time=  12.6s




LightGBM without OHE
Best parameters: {'n_estimators': 25, 'learning_rate': 0.5}
RMSE: 1711.65
CPU times: user 18min 33s, sys: 10.5 s, total: 18min 43s
Wall time: 18min 53s


* Fitting the Model with the best selected parameters:

In [62]:
%%time
# random forest regressor with test data
# Best parameters: {'n_estimators': 25}
rf_test = RandomForestRegressor(random_state=42, n_estimators = 25)

start = time.time()
# fit with train and valid data
rf_test.fit(pd.concat([features_train, features_valid]), pd.concat([target_train, target_valid]))
end = time.time()
test_rftt = end - start

start = time.time()
test_pred = rf_test.predict(features_test)
end = time.time()
test_rfpt = end - start

test_rf_rmse_calc = mean_squared_error(target_test, test_pred)**0.5

print('Random Forest Regressor with test data')
print('RMSE:', test_rf_rmse_calc, 'Training time:', test_rftt, 'Prediction time:', test_rfpt)

Random Forest Regressor with test data
RMSE: 1610.9908923566247 Training time: 158.23585867881775 Prediction time: 1.2599332332611084
CPU times: user 2min 38s, sys: 913 ms, total: 2min 39s
Wall time: 2min 39s


In [63]:
%%time
# CatBoost with OHE with test data
# Best parameters: {'n_estimators': 25, 'learning_rate': 0.75}
cb_test = CatBoostRegressor(random_state=42, n_estimators=25, learning_rate=0.75)

start = time.time()
# use combined train and valid datasets to fit on
cb_test.fit(pd.concat([features_train, features_valid]), pd.concat([target_train, target_valid]))
end = time.time()
test_cbohett = end - start

start = time.time()
test_pred = cb_test.predict(features_test)
end = time.time()
test_cbohept = end - start

test_cbohe_rmse_calc = mean_squared_error(target_test, test_pred)**0.5

print('CatBoost with OHE with test data')
print('RMSE:', test_cbohe_rmse_calc, 'Training time:', test_cbohett, 'Prediction time:', test_cbohept)

0:	learn: 2898.0883795	total: 63ms	remaining: 1.51s
1:	learn: 2460.8946742	total: 102ms	remaining: 1.17s
2:	learn: 2297.6516730	total: 138ms	remaining: 1.01s
3:	learn: 2188.2078311	total: 175ms	remaining: 919ms
4:	learn: 2112.4989418	total: 214ms	remaining: 857ms
5:	learn: 2069.9129869	total: 250ms	remaining: 792ms
6:	learn: 2028.5056940	total: 289ms	remaining: 743ms
7:	learn: 2005.1229248	total: 321ms	remaining: 681ms
8:	learn: 1979.9296293	total: 360ms	remaining: 639ms
9:	learn: 1961.2899268	total: 397ms	remaining: 595ms
10:	learn: 1945.3311414	total: 435ms	remaining: 554ms
11:	learn: 1935.7377314	total: 468ms	remaining: 507ms
12:	learn: 1924.6830356	total: 507ms	remaining: 468ms
13:	learn: 1912.4148157	total: 549ms	remaining: 432ms
14:	learn: 1903.6343195	total: 591ms	remaining: 394ms
15:	learn: 1896.6559678	total: 627ms	remaining: 353ms
16:	learn: 1888.3898749	total: 668ms	remaining: 314ms
17:	learn: 1877.8195824	total: 709ms	remaining: 276ms
18:	learn: 1866.8376075	total: 751ms	re

In [64]:
%%time
# CatBoost without initial one hot encoding
# Best parameters: {'n_estimators': 25, 'learning_rate': 0.75}

start = time.time()
# use combined train and valid datasets to fit on
cb_test_2 = CatBoostRegressor(random_state=42, n_estimators=25, learning_rate=0.75)
cb_test_2.fit(pd.concat([features_train1, features_valid1]), pd.concat([target_train1, target_valid1]), cat_features=list_cat)
end = time.time()
test_cbtt = end - start

start = time.time()
test_pred = cb_test_2.predict(features_test1)
end = time.time()
test_cbpt = end - start

test_cb_rmse=mean_squared_error(target_test, test_pred)**0.5

print('CatBoost without OHE with test data')
print('RMSE:',test_cb_rmse, 'Training time:', test_cbtt, 'Prediction time:', test_cbpt) 

0:	learn: 2893.8572149	total: 112ms	remaining: 2.68s
1:	learn: 2425.4754092	total: 202ms	remaining: 2.32s
2:	learn: 2255.0814460	total: 298ms	remaining: 2.18s
3:	learn: 2172.5958363	total: 392ms	remaining: 2.06s
4:	learn: 2105.9718029	total: 470ms	remaining: 1.88s
5:	learn: 2061.8437014	total: 551ms	remaining: 1.74s
6:	learn: 2038.6008391	total: 637ms	remaining: 1.64s
7:	learn: 2001.3142850	total: 716ms	remaining: 1.52s
8:	learn: 1974.3535316	total: 798ms	remaining: 1.42s
9:	learn: 1955.5065248	total: 877ms	remaining: 1.31s
10:	learn: 1937.6088286	total: 961ms	remaining: 1.22s
11:	learn: 1917.9064598	total: 1.04s	remaining: 1.13s
12:	learn: 1894.4823853	total: 1.12s	remaining: 1.03s
13:	learn: 1882.8858174	total: 1.19s	remaining: 938ms
14:	learn: 1876.4799495	total: 1.27s	remaining: 848ms
15:	learn: 1865.7520487	total: 1.35s	remaining: 759ms
16:	learn: 1854.4658514	total: 1.42s	remaining: 670ms
17:	learn: 1848.1821082	total: 1.5s	remaining: 583ms
18:	learn: 1839.6111779	total: 1.57s	re

In [65]:
%%time
# lightGBM with OHE with test data
# Best parameters: {'n_estimators': 25, 'learning_rate': 0.5}
lgohe_test = lgb.LGBMRegressor(random_state=42, n_estimators=25, learning_rate=0.5)

start = time.time()
# fit model on combined data from train and valid
lgohe_test.fit(pd.concat([features_train, features_valid]), pd.concat([target_train, target_valid]))
end = time.time()
test_lgohett = end - start

start = time.time()
test_pred = lgohe_test.predict(features_test)
end = time.time()
test_lgohept = end - start

test_lgohe_rmse_calc = mean_squared_error(target_test, test_pred)**0.5

print('LightGBM with OHE with test data')
print('RMSE:', test_lgohe_rmse_calc, 'Training time:', test_lgohett, 'Prediction time:', test_lgohept)

LightGBM with OHE with test data
RMSE: 1733.4845830811937 Training time: 87.95521759986877 Prediction time: 0.30660486221313477
CPU times: user 1min 25s, sys: 1.7 s, total: 1min 27s
Wall time: 1min 28s


In [66]:
%%time
# lightGBM without OHE with test data
# Best parameters: {'n_estimators': 25, 'learning_rate': 0.25}
lg_test = lgb.LGBMRegressor(random_state=42, n_estimators=25, learning_rate=0.25)

start = time.time()
# fit on combined data from train and valid
lg_test.fit(pd.concat([features_train1, features_valid1]), pd.concat([target_train1, target_valid1]), categorical_feature=list_cat)
end = time.time()
test_lgtt = end - start

start = time.time()
test_pred = lg_test.predict(features_test1)
end = time.time()
test_lgpt = end - start

test_lg_rmse_calc = mean_squared_error(target_test1, test_pred)**0.5

print('LightGBM without OHE')
print('RMSE:', test_lg_rmse_calc, 'Training time:', test_lgtt, 'Prediction time:', test_lgpt)



LightGBM without OHE
RMSE: 1694.6792063955672 Training time: 86.89981365203857 Prediction time: 0.2788047790527344
CPU times: user 1min 25s, sys: 1.03 s, total: 1min 26s
Wall time: 1min 27s


In [67]:
# linear regression base model with test data
start = time.time()
predicted_test = lr_model.predict(features_test)
end = time.time()
test_lrpt = end - start

test_lr_rmse_calc = mean_squared_error(target_test, predicted_test)**0.5

print('Linear Regression - Sanity Check on test data')
print('RMSE:', test_lr_rmse_calc, 'Prediction time:', test_lrpt)

Linear Regression - Sanity Check on test data
RMSE: 2911.5822236815206 Prediction time: 0.40154361724853516


## Model analysis

In [68]:
print('Results with test data of base models\n')

print('Random Forest Regressor')
print('RMSE:', rf_rmse_calc, 'Training time:', rftt, 'Prediction time:', rfpt)
print('\nLightGBM with OHE')
print('RMSE:', lgohe_rmse_calc, 'Training time:', lgohett, 'Prediction time:', lgohept)
print('\nLightGBM without OHE')
print('RMSE:', lg_rmse_calc, 'Training time:', lgtt, 'Prediction time:', lgpt)
print('\nCatBoost with OHE')
print('RMSE:', cbohe_rmse_calc, 'Training time:', cbohett, 'Prediction time:', cbohept)
print('\nCatBoost without OHE')
print('RMSE:', cb_rmse_calc, 'Training time:', cbtt, 'Prediction time:', cbpt)
print('\nLinear Regression - Sanity Check')
print('RMSE:', lr_rmse_calc, 'Training time:', lrtt, 'Prediction time:', lrpt)


Results with test data of base models

Random Forest Regressor
RMSE: 1637.1681637261606 Training time: 446.7893626689911 Prediction time: 4.471585512161255

LightGBM with OHE
RMSE: 1726.6530504000457 Training time: 322.7245354652405 Prediction time: 0.7187302112579346

LightGBM without OHE
RMSE: 1661.1798804914488 Training time: 304.41242814064026 Prediction time: 0.6013398170471191

CatBoost with OHE
RMSE: 1639.9786314398411 Training time: 32.960219860076904 Prediction time: 0.16595196723937988

CatBoost without OHE
RMSE: 1635.5431243365203 Training time: 149.42547154426575 Prediction time: 0.37122368812561035

Linear Regression - Sanity Check
RMSE: 2895.0483751307856 Training time: 21.69653344154358 Prediction time: 0.3407108783721924


In [69]:

print('Results with test data of tuned models\n')

print('Random Forest Regressor with test data')
print('RMSE:', test_rf_rmse_calc, 'Training time:', test_rftt, 'Prediction time:', test_rfpt)

print('\nCatBoost with OHE with test data')
print('RMSE:', test_cbohe_rmse_calc, 'Training time:', test_cbohett, 'Prediction time:', test_cbohept)

print('\nCatBoost without OHE with test data')
print('RMSE:',test_cb_rmse, 'Training time:', test_cbtt, 'Prediction time:', test_cbpt)

print('\nLightGBM with OHE with test data')
print('RMSE:', test_lgohe_rmse_calc, 'Training time:', test_lgohett, 'Prediction time:', test_lgohept)

print('\nLightGBM without OHE')
print('RMSE:', test_lg_rmse_calc, 'Training time:', test_lgtt, 'Prediction time:', test_lgpt)

print('\nLinear Regression - Sanity Check on test data')
print('RMSE:', test_lr_rmse_calc, 'Prediction time:', test_lrpt)

Results with test data of tuned models

Random Forest Regressor with test data
RMSE: 1610.9908923566247 Training time: 158.23585867881775 Prediction time: 1.2599332332611084

CatBoost with OHE with test data
RMSE: 1822.563361305562 Training time: 3.082162857055664 Prediction time: 0.02696704864501953

CatBoost without OHE with test data
RMSE: 1793.5606078501316 Training time: 3.0702319145202637 Prediction time: 0.03883481025695801

LightGBM with OHE with test data
RMSE: 1733.4845830811937 Training time: 87.95521759986877 Prediction time: 0.30660486221313477

LightGBM without OHE
RMSE: 1694.6792063955672 Training time: 86.89981365203857 Prediction time: 0.2788047790527344

Linear Regression - Sanity Check on test data
RMSE: 2911.5822236815206 Prediction time: 0.40154361724853516


# Conclusion 

* The objective of this project was to ultimately find the speed required for both training and prediction of different models as well as to evaluate each model based on RMSE. 
* We initially used the linear regression model as a sanity check to compare with the other main models we trained. Hyperparameter Tuning did improve the Random Forest Regressor model in terms of resulting a lower RMSE. However, hyperparameter tuning also increase the time for predictions and training. 
* The LightGBM models that were not applied with One Hot Encoding resulted in the best RMSE values. Regardless of using One Hot Encoding, we observed that LighGBM resulted in the highest speed for prediction, while Catboost had the lowest speed. 
* Although tuning hyperparameters for the Random Forest Regressor model resulted in the lowest RMSE value, the prediction time was very high. 
* Overall, we conclude that the LightGBM without the application of One Hot encoding would be the best fit model for this purpose. It is the quickest and are still considerably low enough to be considered good quality.

# Checklist

Type 'x' to check. Then press Shift+Enter.

- [x]  Jupyter Notebook is open
- [ ]  Code is error free
- [ ]  The cells with the code have been arranged in order of execution
- [ ]  The data has been downloaded and prepared
- [ ]  The models have been trained
- [ ]  The analysis of speed and quality of the models has been performed