# 4. Modelling

## 4.1 Importing useful libraries & functions

In [38]:
import os
import pandas as pd
import numpy as np
import pickle

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# processing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# modelling
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.regularizers import L2
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
# evaluating
from sklearn.metrics import mean_squared_error, r2_score

## 4.2 Reading data

In [39]:
cars = pd.read_csv('../data/clean_cars_id.csv')

In [40]:
cars

Unnamed: 0,year,cv,km,fuel,gearbox,color,id,brand,price,cmixto,class,location,area
0,6,326,94000,Gasoline,Automatic,White,6830673,BMW,35900,6.6,Sport,La Rioja,84912
1,2,252,29187,Gasoline,Manual,White,7477502,Other,65450,6.4,Sport,La Rioja,75240
2,4,150,40012,Gasoline,Manual,Red,7550291,Renault,20950,5.6,4x4,La Rioja,73336
3,14,224,329000,Diesel,Automatic,Black,7593673,Mercedes,10900,7.9,4x4,La Rioja,84258
4,1,90,5450,Gasoline,Manual,Orange,7594227,Renault,23450,5.3,4x4,La Rioja,76140
...,...,...,...,...,...,...,...,...,...,...,...,...,...
285217,6,180,51000,Diesel,Automatic,Other,7381869,Jaguar,36100,4.6,Standard,Valencia,98704
285218,5,150,112399,Diesel,Automatic,Other,7381109,Audi,28700,4.2,Standard,Valencia,87032
285219,6,116,62241,Gasoline,Automatic,Other,7381085,Audi,18400,4.6,Standard,Valencia,80280
285220,7,69,49137,Gasoline,Manual,Other,7381063,Toyota,11155,4.3,Standard,Valencia,66130


## 4.3 Train, Test split

We will divide our data into three datasets. The **training** dataset will teach the model how to operate andthrough **cross-validation (CV)** dataset will help us optimize the parameters of the models, and finally the **test** dataset will be used to evaluate our final mode and estimate its error and accuracy.

In [41]:
# For the test-train split I can't use a predefined function randomly, because then I would be 
# using some duplicates in both the the test and train dataset. I'll use the following code instead.

# Choosing 5000 unique cars for the test dataset
random_ids = list(np.random.permutation(len(cars.id.unique())))
test_id_index = random_ids[0:5000]
test_ids = cars.id.unique()[test_id_index]

In [42]:
train = cars[~cars.id.isin(test_ids)].reset_index(drop=True)
test = cars[cars.id.isin(test_ids)].reset_index(drop=True)

In [43]:
X_train = train.drop(['price','id'],axis=1)
y_train = train['price']
X_test = test.drop(['price','id'],axis=1)
y_test = test['price']

I split the features into numerical and categorical, so I can later normalize and one-hot encode.

In [44]:
# 1. Training dataset
X_num = X_train.select_dtypes(np.number)
X_cat = X_train.select_dtypes(object)

# 2. Test dataset
X_num_test = X_test.select_dtypes(np.number)
X_cat_test = X_test.select_dtypes(object)

## 4.4 Normalizing numerical features

In [46]:
def std_scale(numericals, transformer):
    normalized = []
    for X_numerical in numericals:
        X_normalized = transformer.transform(X_numerical)
        X_normalized = pd.DataFrame(X_normalized, columns=X_numerical.columns)
        normalized.append(X_normalized)
    return normalized

In [49]:
# Defining normalizer
transformer = StandardScaler().fit(X_num)

# Applying normalizer
X_norm, X_norm_test = std_scale([X_num,X_num_test], transformer)

## 4.5 Encoding categorical features

In [51]:
def one_hot(categoricals, encoder):
    onehots = []
    for X_categorical in categoricals:
        encoded = encoder.transform(X_categorical).toarray()
        onehot_encode = pd.DataFrame(encoded,columns=encoder.get_feature_names_out(X_categorical.columns))
        onehots.append(onehot_encode)
    return onehots

In [52]:
# Creating encoder
encoder = OneHotEncoder(handle_unknown='error',drop='first').fit(X_cat)

# Applying onehot-encode
X_oh, X_oh_test = one_hot([X_cat,X_cat_test], encoder)

## 4.6 Concatenating back

In [53]:
X_train_scaled = pd.concat([X_norm, X_oh], axis=1)
X_test_scaled = pd.concat([X_norm_test, X_oh_test], axis=1)

## 4.7 First-glance at models

#### Linear regression, KNN, Random Forest, XGBoost

In [None]:
model1 = LinearRegression()
model2 = KNeighborsRegressor()
model3 = RandomForestRegressor()
model4 = XGBRegressor()

model_pipeline = [model1, model2, model3, model4]
model_names = ['Linear Regression','KNN','Random Forest', 'XGBoost']

scores = {}

for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_scaled, y_train, cv=5))
    scores[model_name] = mean_score
    
print(scores)

{'Linear Regression': 0.798, 'KNN': 0.932, 'Random Forest': 0.964, 'XGBoost': 0.9414194378541989}

#### Neural Network

In [None]:
# Define the K-fold Cross Validator
kfold = KFold(n_splits=2, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1
scores = []

for cv_train, cv_test in kfold.split(X_train_scaled, y_train):
    model = Sequential(
    [
        tf.keras.layers.Dense(64, activation = 'relu', name='L4'),
        tf.keras.layers.Dense(1, activation = 'relu', name='L5'),

    ])
    model.compile(loss = MeanSquaredError(), optimizer = tf.keras.optimizers.Adam(learning_rate=0.01))
        
    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')
    
    model.fit(X_train_scaled.loc[cv_train,:], y_train.loc[cv_train], epochs = 5)

    # Generate generalization metrics
    y_pred = model.predict(X_train_scaled.loc[cv_test,:])
    training_error = r2_score(y_pred,y_train.loc[cv_test])
    
    print(f'R2 for fold {fold_no}: {training_error:.3f}')
    scores.append(training_error)

    # Increase fold number
    fold_no = fold_no + 1
print(f'The CV R2 score is {np.mean(scores):.3f}.')

In [None]:
# Define the K-fold Cross Validator
kfold = KFold(n_splits=2, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1
scores = []

for cv_train, cv_test in kfold.split(X_train_scaled, y_train):
    model = Sequential(
    [
        tf.keras.layers.Dense(128, activation = 'relu', name='L1'),
        tf.keras.layers.Dense(64, activation = 'relu', name='L4'),
        tf.keras.layers.Dense(1, activation = 'relu', name='L5'),

    ])
    model.compile(loss = MeanSquaredError(), optimizer = tf.keras.optimizers.Adam(learning_rate=0.01))
        
    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')
    
    model.fit(X_train_scaled.loc[cv_train,:], y_train.loc[cv_train], epochs = 5)

    # Generate generalization metrics
    y_pred = model.predict(X_train_scaled.loc[cv_test,:])
    training_error = r2_score(y_pred,y_train.loc[cv_test])
    
    print(f'R2 for fold {fold_no}: {training_error:.3f}')
    scores.append(training_error)

    # Increase fold number
    fold_no = fold_no + 1
print(f'The CV R2 score is {np.mean(scores):.3f}.')

In [None]:
# Define the K-fold Cross Validator
kfold = KFold(n_splits=3, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1
scores = []

for cv_train, cv_test in kfold.split(X_train_scaled, y_train):
    model = Sequential(
    [
        tf.keras.layers.Dense(256, activation = 'relu', name='L1'),
        tf.keras.layers.Dense(128, activation = 'relu', name='L2'),
        tf.keras.layers.Dense(64, activation = 'relu', name='L4'),
        tf.keras.layers.Dense(1, activation = 'relu', name='L5'),

    ])
    model.compile(loss = MeanSquaredError(), optimizer = tf.keras.optimizers.Adam(learning_rate=0.01))
        
    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')
    
    model.fit(X_train_scaled.loc[cv_train,:], y_train.loc[cv_train], epochs = 5)

    # Generate generalization metrics
    y_pred = model.predict(X_train_scaled.loc[cv_test,:])
    training_error = r2_score(y_pred,y_train.loc[cv_test])
    
    print(f'R2 for fold {fold_no}: {training_error:.3f}')
    scores.append(training_error)

    # Increase fold number
    fold_no = fold_no + 1
print(f'The CV R2 score is {np.mean(scores):.3f}.')

## 4.8 Hyperparameter optimization

## 4.7 Training models & evaluation

In [None]:
lmbd = 0
model1 = LinearRegression()
model2 = KNeighborsRegressor()
model3 = DecisionTreeRegressor()
model4 = RandomForestRegressor()
model5 = XGBRegressor()
model6 = Sequential(
[
    tf.keras.layers.Dense(2048, activation = 'relu', name='L1', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(2048, activation = 'relu', name='L2', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(1, activation = 'relu', name='L7'),

]
)
model6.compile(loss = MeanSquaredError(),optimizer = tf.keras.optimizers.Adam(learning_rate=0.01))
model_pipeline = [model1,  model4, model6, model5,]
model_names = ['Linear Regression', 'RandomForest', 'NeuralNetwork' ,'XGBoost']

scores = {}
for model, model_name in zip(model_pipeline, model_names):
    print('Working with model '+model_name)
    
    # Fitting the model
    if model_name == 'NeuralNetwork':
        model.fit(X_train_scaled, y_train, epochs = 30)
    else:
        model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_train_scaled)
    training_error = r2_score(y_pred,y_train)
    
    scores[model_name] = [round(training_error,3)]
    
print(scores)
# We can use the result to choose the best performing model

## 4.7 Hyperparameter search

### 4.7.1 Random Forest

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

max_depth_choices= [5,10,None]
min_samples_split_choices = [2,12]
min_samples_leaf_choices = [1,6]

random_grid = {'max_depth': max_depth_choices,
'min_samples_split': min_samples_split_choices,
'min_samples_leaf': min_samples_leaf_choices}

# Instantiate the grid search model object
# estimator -> model to optimize
model = RandomForestRegressor()

random_search = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter=12, cv = 5, n_jobs = 8)

random_search.fit(X_train_scaled,y_train)

### 4.7.2 XGBoost

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

param_grid = {
        'max_depth': [3, 7, 11, 13, 15, 17, None],
        'min_child_weight': np.arange(0.0001, 0.5, 0.05),
        'learning_rate': np.arange(0.0005,0.3,0.025),
        'subsample': np.arange(0.01,1.0,0.005),
        'colsample_bylevel': np.round(np.arange(0.1,1.0,0.05)),
        'colsample_bytree': np.arange(0.1,1.0,0.05),
}

# Instantiate the grid search model object
# estimator -> model to optimize
model = XGBRegressor()

random_search = RandomizedSearchCV(estimator = model, param_distributions = param_grid, n_iter=100, cv = 5, n_jobs = 8)

random_search.fit(X_train_scaled,y_train)
#random_search.best_params_
#random_search.best_score_

# 4.8 Model training & testing

In [54]:
model = XGBRegressor(**{'subsample': 0.935,'min_child_weight': 0.2001,'max_depth': 17,
        'learning_rate': 0.0755,'colsample_bytree': 0.7,'colsample_bylevel': 1.0})
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
test_error = r2_score(y_pred,y_test)
test_error

0.9365562837039167

In [57]:
rmse= np.sqrt(mean_squared_error(y_test,y_pred))
error =int((abs(np.round(y_pred,0).flatten()-y_test)/np.round(y_pred,0).flatten()*100).astype(int).values.mean())
std = int((abs(np.round(y_pred,0).flatten()-y_test)/np.round(y_pred,0).flatten()*100).astype(int).values.std())
print('The RMSE is',round(rmse,1))
print('The error for 90% of the test samples is equal or lower than',abs(error)+abs(std)*2,'% of the price.')

The RMSE is 2577.9
The error for 90% of the test samples is equal or lower than 25 % of the price.


In [22]:
# <!> I could try to use log or sqrt of some qualities like cv and more that seem to over emphasize 
# the price of high values.

In [61]:
pickle.dump(encoder, open('../machine-learning/encoder.p','wb'))

In [62]:
pickle.dump(transformer, open('../machine-learning/normalizer.p','wb'))

In [63]:
pickle.dump(model, open('../machine-learning/xgboost.p','wb'))