# 3. Modelling

## 3.1 Importing useful libraries & functions

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
import numpy as np

# processing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# modelling
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.regularizers import L2
# evaluating
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## 3.2 Reading data

In [4]:
cars = pd.read_csv('../data/cleaned_cars.csv')

In [5]:
cars

Unnamed: 0,year,cv,km,fuel,doors,gearbox,emissions,color,brand,price,...,seats,max_sp,cmixto,0-100,cilindrada,cilindros,max_par,marchas,class,location
0,6,85,38300,gasolina,5,manual,114,gris,ford,8890,...,5,168,4.9,13.6,1242,4 inline,112,5,common,madrid
1,16,140,224820,diesel,4,manual,148,gris,honda,4990,...,5,212,5.5,9.3,2204,4 inline,340,6,common,madrid
2,4,100,40139,diesel,5,manual,112,blanco,citroen,18990,...,5,175,4.0,12.3,1499,4 inline,250,5,common,madrid
3,14,105,217000,diesel,5,manual,135,gris,renault,4200,...,7,180,5.3,13.4,1461,4 inline,240,6,common,madrid
4,8,130,128211,gasolina,5,manual,123,blanco,peugeot,10990,...,5,199,5.3,10.8,1199,more4 inline,230,6,common,madrid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30283,5,75,170000,diesel,5,manual,123,blanco,renault,8900,...,5,151,4.9,17.5,1461,4 inline,180,5,commercial,sevilla
30284,5,90,114200,diesel,5,manual,112,blanco,renault,15000,...,5,160,4.3,13.3,1461,4 inline,200,5,commercial,sevilla
30285,4,75,102102,diesel,5,manual,123,rojo,renault,15470,...,5,151,4.9,17.5,1461,4 inline,180,5,commercial,sevilla
30286,8,75,86000,diesel,5,manual,130,blanco,renault,13270,...,5,151,4.9,17.5,1461,4 inline,180,5,commercial,sevilla


## 3.3 Train, CV, Test split

We will divide our data into three datasets. The **training** dataset will teach the model how to operate, the **cross-validation (CV)** dataset will help us optimize the parameters of the models, and finally the **test** dataset will be used to evaluate our final mode and estimate its error and accuracy.

In [5]:
X = cars.drop('price',axis=1)
y = cars['price']

# Creating train dataset
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.4)
X_train.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)

# Creating test & cv dataset
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size=0.5)
X_cv.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)
y_cv.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

I split the features into numerical and categorical, so I can later normalize and one-hot encode.

In [6]:
# 1. Training dataset
X_num = X_train.select_dtypes(np.number)
X_cat = X_train.select_dtypes(object)

# 2. CV dataset
X_num_cv = X_cv.select_dtypes(np.number)
X_cat_cv = X_cv.select_dtypes(object)

# 3. Test dataset
X_num_test = X_test.select_dtypes(np.number)
X_cat_test = X_test.select_dtypes(object)

## 3.4 Normalizing numerical features

In [7]:
def std_scale(numericals, transformer):
    normalized = []
    for X_numerical in numericals:
        X_normalized = transformer.transform(X_numerical)
        X_normalized = pd.DataFrame(X_normalized, columns=X_numerical.columns)
        normalized.append(X_normalized)
    return normalized

In [8]:
# Defining normalizer
transformer = StandardScaler().fit(X_num)

# Applying normalizer
X_norm, X_norm_cv, X_norm_test = std_scale([X_num,X_num_cv,X_num_test], transformer)

## 3.5 Encoding categorical features

In [9]:
def one_hot(categoricals, encoder):
    onehots = []
    for X_categorical in categoricals:
        encoded = encoder.transform(X_categorical).toarray()
        onehot_encode = pd.DataFrame(encoded,columns=encoder.get_feature_names_out(X_categorical.columns))
        onehots.append(onehot_encode)
    return onehots

In [10]:
# Creating encoder
encoder = OneHotEncoder(handle_unknown='error',drop='first').fit(X_cat)

# Applying onehot-encode
X_oh, X_oh_cv, X_oh_test = one_hot([X_cat,X_cat_cv,X_cat_test], encoder)

## 3.6 Concatenating back

In [11]:
X_train_scaled = pd.concat([X_norm, X_oh], axis=1)
X_cv_scaled = pd.concat([X_norm_cv, X_oh_cv], axis=1)
X_test_scaled = pd.concat([X_norm_test, X_oh_test], axis=1)

In [12]:
X_oh

Unnamed: 0,fuel_gasolina,fuel_híbrido,gearbox_manual,color_azul,color_beige,color_blanco,color_gris,color_marron,color_naranja,color_negro,...,cilindros_less4 inline,cilindros_more4 inline,cilindros_more6 V,cilindros_other,class_commercial,class_common,class_sport,location_madrid,location_sevilla,location_valencia
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18167,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18168,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18169,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18170,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


## 3.7 Training models & evaluation

In [37]:
lmbd = 5e3
model1 = LinearRegression()
model2 = KNeighborsRegressor()
model3 = DecisionTreeRegressor()
model4 = RandomForestRegressor()
model5 = XGBRegressor()
model6 = Sequential(
[
    tf.keras.layers.Dense(1024, activation = 'relu', name='L1', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(1024, activation = 'relu', name='L12', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(512, activation = 'relu', name='L13', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(256, activation = 'relu', name='L133', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(256, activation = 'relu', name='L2', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(256, activation = 'relu', name='L23', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(128, activation = 'relu', name='L24', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(1, activation = 'relu', name='L7'),

]
)
model6.compile(loss = MeanSquaredError(),optimizer = tf.keras.optimizers.Adam(learning_rate=0.01))
model_pipeline = [model1, model2, model3, model4, model5, model6]
model_names = ['Linear Regression', 'KNN','Decision Tree Regressor', 'RandomForest', 'XGBoost', 'NeuralNetwork']

scores = {}
for model, model_name in zip(model_pipeline, model_names):
    print('Working with model '+model_name)
    
    # Fitting the model
    if model_name == 'NeuralNetwork':
        model.fit(X_train_scaled, y_train, epochs = 100)
    else:
        model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_train_scaled)
    training_error = r2_score(y_pred,y_train)
    
    y_pred_cv = model.predict(X_cv_scaled)
    cv_error = r2_score(y_pred_cv,y_cv)
    scores[model_name] = [round(training_error,3), round(cv_error,3)]
    
print(scores)
# We can use the result to choose the best performing model

Working with model Linear Regression
Working with model KNN
Working with model Decision Tree Regressor
Working with model RandomForest
Working with model XGBoost
Working with model NeuralNetwork


ValueError: Please provide as model inputs either a single array or a list of arrays. You passed: inputs=           year        cv        km     doors  emissions      boot    length  \
0      0.093801  3.810176  0.979702  0.370750   2.171246  0.836207  0.923184   
1     -1.165561  0.177964 -0.354571  0.370750   0.061594  0.118117  0.252058   
2      0.408641  2.270252 -0.098366  0.370750   2.171246  1.051634  2.009768   
3     -1.165561 -0.290709 -1.077309  0.370750  -0.273271 -0.025501  0.092266   
4      0.723482  0.629898  1.318768  0.370750  -0.474190 -0.090129  0.156183   
...         ...       ...       ...       ...        ...       ...       ...   
18167 -1.480402 -0.324185 -1.388785  0.370750  -0.608137  0.161203  0.763392   
18168  1.353162 -0.491568  1.166394  0.370750   0.162054 -0.240927 -0.131442   
18169 -0.221040 -0.156802 -0.139662  0.370750  -0.139325  0.620780  0.252058   
18170  1.038322 -0.324185  0.047047  0.370750   0.162054 -0.880027 -1.313902   
18171  3.871886 -0.491568  0.799738 -2.588084   1.066190 -1.835086 -2.048944   

         height     width     seats  ...  cilindros_less4 inline  \
0      0.800247  1.634302  0.029011  ...                     0.0   
1     -0.239919  0.406515  0.029011  ...                     0.0   
2      1.273050  2.179986  0.029011  ...                     0.0   
3     -0.050798 -0.139168  0.029011  ...                     0.0   
4     -0.523601 -0.139168  0.029011  ...                     0.0   
...         ...       ...       ...  ...                     ...   
18167  1.273050  0.952198  0.029011  ...                     0.0   
18168  0.705687 -0.412010  0.029011  ...                     0.0   
18169  0.705687  0.406515  0.029011  ...                     0.0   
18170 -0.807282 -0.957693  0.029011  ...                     0.0   
18171  0.705687  0.270094 -1.827294  ...                     0.0   

       cilindros_more4 inline  cilindros_more6 V  cilindros_other  \
0                         0.0                0.0              0.0   
1                         0.0                0.0              0.0   
2                         0.0                0.0              0.0   
3                         0.0                0.0              0.0   
4                         0.0                0.0              0.0   
...                       ...                ...              ...   
18167                     0.0                0.0              0.0   
18168                     0.0                0.0              0.0   
18169                     1.0                0.0              0.0   
18170                     0.0                0.0              0.0   
18171                     0.0                0.0              0.0   

       class_commercial  class_common  class_sport  location_madrid  \
0                   0.0           0.0          0.0              1.0   
1                   0.0           0.0          0.0              1.0   
2                   0.0           0.0          0.0              1.0   
3                   0.0           0.0          0.0              0.0   
4                   0.0           0.0          0.0              0.0   
...                 ...           ...          ...              ...   
18167               0.0           0.0          0.0              1.0   
18168               0.0           0.0          0.0              0.0   
18169               0.0           0.0          0.0              1.0   
18170               0.0           1.0          0.0              0.0   
18171               0.0           1.0          0.0              0.0   

       location_sevilla  location_valencia  
0                   0.0                0.0  
1                   0.0                0.0  
2                   0.0                0.0  
3                   1.0                0.0  
4                   0.0                1.0  
...                 ...                ...  
18167               0.0                0.0  
18168               0.0                0.0  
18169               0.0                0.0  
18170               0.0                1.0  
18171               0.0                0.0  

[18172 rows x 81 columns]

In [None]:
np.round(y_pred_cv,0).flatten()

In [None]:
error =int((abs(np.round(y_pred_cv,0).flatten()-y_cv)/np.round(y_pred_cv,0).flatten()*100).astype(int).values.mean())
std = int((abs(np.round(y_pred_cv,0).flatten()-y_cv)/np.round(y_pred_cv,0).flatten()*100).astype(int).values.std())
print('The error is ',error,'±',4*std)

In [None]:
# <!> I could try to use log or sqrt of some qualities like cv and more that seem to over emphasize 
# the price of high values.

In [None]:
(np.round(y_pred_cv,0).flatten()-y_cv).astype(int).values[abs(np.round(y_pred_cv,0).flatten()-y_cv).astype(int).values>6000].mean()

In [None]:
(X_cv[abs(np.round(y_pred_cv,0).flatten()-y_cv).astype(int).values>6000]).describe().T

In [None]:
(X_cv[abs(np.round(y_pred_cv,0)-y_cv).astype(int).values<6000]).describe().T

In [None]:
abs(np.round(y_pred_cv,0)-y_cv).astype(int).values

In [None]:
a = 'marchas'

In [None]:
round(len(X_cv[X_cv[a]==0.])/X_cv.shape[0]*100,3)

In [None]:
round(len(X_train[X_train[a]==0.])/X_train.shape[0]*100,3)

In [None]:
X_cv[a]==0

array([        1,       100,     10000,    250000,   1000000,   4000000,
         9000000,  16000000,  25000000,  64000000, 100000000])