# 4. Modelling

## 4.1 Importing useful libraries & functions

In [None]:
import os
# Must be set before TensorFlow is imported for the log-level setting to take effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
import numpy as np

# processing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# modelling
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.regularizers import L2

# tuning
from sklearn.model_selection import RandomizedSearchCV

# evaluating
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Show the GPU devices that TensorFlow can see on this machine.
tf.config.list_physical_devices('GPU')

## 4.2 Reading data

In [None]:
# Load the car dataset; the path is relative to the notebook's working directory.
cars = pd.read_csv('../data/clean_cars.csv')

## 4.3 Train, CV, Test split

We will divide our data into three datasets. The **training** dataset will teach the model how to operate, the **cross-validation (CV)** dataset will help us optimize the parameters of the models, and finally the **test** dataset will be used to evaluate our final model and estimate its error and accuracy.

In [None]:
# Separate the features from the target column.
X = cars.drop('price', axis=1)
y = cars['price']

# Fixed seed so that the 60/20/20 partition (and therefore every score computed
# from it) is identical on each Restart & Run All.
SPLIT_SEED = 42

# Creating train dataset: 60% of the rows; the remaining 40% are held out.
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.4, random_state=SPLIT_SEED)

# Creating test & cv dataset: the hold-out is halved, i.e. 20% of the data each.
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size=0.5, random_state=SPLIT_SEED)

# Give every split a fresh 0..n-1 index. Re-assigning (rather than mutating the
# objects returned by train_test_split in place) keeps the cell idempotent.
X_train, X_cv, X_test = (part.reset_index(drop=True) for part in (X_train, X_cv, X_test))
y_train, y_cv, y_test = (part.reset_index(drop=True) for part in (y_train, y_cv, y_test))

I split the features into numerical and categorical, so I can later normalize and one-hot encode.

In [None]:
# 1. Training dataset
X_num = X_train.select_dtypes(np.number)
X_cat = X_train.select_dtypes(object)

# 2. CV dataset
X_num_cv = X_cv.select_dtypes(np.number)
X_cat_cv = X_cv.select_dtypes(object)

# 3. Test dataset
X_num_test = X_test.select_dtypes(np.number)
X_cat_test = X_test.select_dtypes(object)

## 4.4 Normalizing numerical features

In [None]:
def std_scale(numericals, transformer):
    """Scale several numeric frames with one already-fitted transformer.

    Parameters
    ----------
    numericals : iterable of pandas.DataFrame
        Frames to scale. Each is passed to ``transformer.transform`` unchanged.
    transformer : object
        Fitted object exposing ``transform(X)`` and returning one output
        column per input column (e.g. a fitted ``StandardScaler``).

    Returns
    -------
    list of pandas.DataFrame
        One scaled frame per input, in input order. Each keeps the column
        labels *and the row index* of its input, so that a later column-wise
        ``pd.concat`` aligns rows correctly even if the caller did not reset
        the index beforehand.
    """
    return [
        pd.DataFrame(
            transformer.transform(frame),
            columns=frame.columns,
            index=frame.index,
        )
        for frame in numericals
    ]

In [None]:
# Defining normalizer
transformer = StandardScaler().fit(X_num)

# Applying normalizer
X_norm, X_norm_cv, X_norm_test = std_scale([X_num,X_num_cv,X_num_test], transformer)

## 4.5 Encoding categorical features

In [None]:
def one_hot(categoricals, encoder):
    """One-hot encode several categorical frames with one already-fitted encoder.

    Parameters
    ----------
    categoricals : iterable of pandas.DataFrame
        Frames holding the categorical columns the encoder was fitted on.
    encoder : object
        Fitted object exposing ``transform(X)`` and
        ``get_feature_names_out(columns)`` (e.g. a fitted ``OneHotEncoder``).

    Returns
    -------
    list of pandas.DataFrame
        One encoded frame per input, in input order. Columns are named by
        ``encoder.get_feature_names_out`` and the input's row index is kept so
        that a later column-wise ``pd.concat`` aligns rows correctly.
    """
    onehots = []
    for frame in categoricals:
        encoded = encoder.transform(frame)
        # The encoder may return a sparse matrix or, when configured for dense
        # output, a plain array; only the former needs densifying.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        onehots.append(
            pd.DataFrame(
                encoded,
                columns=encoder.get_feature_names_out(frame.columns),
                index=frame.index,
            )
        )
    return onehots

In [None]:
# Creating encoder
encoder = OneHotEncoder(handle_unknown='error',drop='first').fit(X_cat)

# Applying onehot-encode
X_oh, X_oh_cv, X_oh_test = one_hot([X_cat,X_cat_cv,X_cat_test], encoder)

## 4.6 Concatenating back

In [None]:
# Put the scaled numeric block and the one-hot block side by side again,
# one combined feature matrix per split.
X_train_scaled, X_cv_scaled, X_test_scaled = (
    pd.concat(blocks, axis='columns')
    for blocks in (
        [X_norm, X_oh],
        [X_norm_cv, X_oh_cv],
        [X_norm_test, X_oh_test],
    )
)

In [None]:
# Inspect the one-hot encoded training features.
X_oh

## 4.7 Training models & evaluation

In [None]:
# L2 regularisation strength applied to the hidden layers' kernels
# (0 means the penalty term contributes nothing).
lmbd = 0
model1 = LinearRegression()
model2 = KNeighborsRegressor()
model3 = DecisionTreeRegressor()
model4 = RandomForestRegressor()
model5 = XGBRegressor()
# Fully connected network: two 2048-unit ReLU hidden layers and one output unit.
# NOTE(review): the output unit also uses ReLU, so it cannot predict negative
# values; a linear output is the more common choice for regression - confirm
# this is intended.
model6 = Sequential(
[
    tf.keras.layers.Dense(2048, activation = 'relu', name='L1', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(2048, activation = 'relu', name='L2', kernel_regularizer=L2(lmbd)),
    tf.keras.layers.Dense(1, activation = 'relu', name='L7'),

]
)
model6.compile(loss = MeanSquaredError(),optimizer = tf.keras.optimizers.Adam(learning_rate=0.01))

# model2 (k-NN) and model3 (single tree) are instantiated above but are not
# part of this comparison.
model_pipeline = [model1,  model4, model6, model5,]
model_names = ['Linear Regression', 'RandomForest', 'NeuralNetwork' ,'XGBoost']

# scores[model_name] = [R2 on the training split, R2 on the CV split]
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    print('Working with model '+model_name)

    # Fitting the model (only the Keras network takes an `epochs` argument)
    if model_name == 'NeuralNetwork':
        model.fit(X_train_scaled, y_train, epochs = 30)
    else:
        model.fit(X_train_scaled, y_train)

    # r2_score is NOT symmetric: its signature is (y_true, y_pred). Passing the
    # predictions first normalises by the variance of the predictions and
    # reports a different number.
    # Despite their names, training_error / cv_error hold R2 scores (higher is better).
    y_pred = model.predict(X_train_scaled)
    training_error = r2_score(y_train, y_pred)

    y_pred_cv = model.predict(X_cv_scaled)
    cv_error = r2_score(y_cv, y_pred_cv)
    scores[model_name] = [round(training_error,3), round(cv_error,3)]

# After the loop, y_pred / y_pred_cv hold the predictions of the LAST model in
# model_pipeline (XGBoost).
print(scores)
# We can use the result to choose the best performing model

## 4.8 Hyperparameter search

### 4.8.1 Random Forest

In [None]:
# Candidate values for each hyper-parameter explored by the randomized search.
max_depth_choices = [5, 10, None]
criterion_choices = ['squared_error', 'absolute_error']
min_samples_split_choices = [4, 10]
min_samples_leaf_choices = [4, 10]

random_grid = {
    'max_depth': max_depth_choices,
    'criterion': criterion_choices,
    'min_samples_split': min_samples_split_choices,
    'min_samples_leaf': min_samples_leaf_choices,
}

# Estimator to optimize. A single decision tree is what this cell has always
# searched over (a RandomForestRegressor assignment was immediately overwritten
# by this one), so that behaviour is kept.
model = DecisionTreeRegressor()

# Instantiate the randomized search: 12 of the 3*2*2*2 = 24 combinations are
# sampled, each scored with 5-fold cross-validation on the training split.
# random_state pins which 12 are drawn so that the result is reproducible.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=12,
    cv=5,
    n_jobs=8,
    random_state=42,
)

random_search.fit(X_train_scaled,y_train)

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score.
random_search.best_params_

In [None]:
# Report the best cross-validated score together with the parameters that produced it.
print(f'The best score (R2={random_search.best_score_:.2f}) corresponds to the parameters {random_search.best_params_}.')

The best score (R2=0.94) corresponds to the parameters {'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': None, 'criterion': 'squared_error'}.

In [None]:
# and the winner is...
grid_search.best_params_

In [None]:
# In an exhaustive grid search you are more likely to get really good results
# on your training set, even with CV. The object fitted in this notebook is the
# randomized search, so its best cross-validated score is shown here.
random_search.best_score_

In [None]:
# CV predictions rounded to whole units and flattened to a 1-D array.
np.round(y_pred_cv,0).flatten()

In [None]:
# Absolute percentage error of every CV prediction, computed once and reused.
# The error is measured relative to the TRUE price (the usual MAPE definition):
# dividing by the prediction instead yields negative or infinite "absolute"
# percentages whenever a model predicts a value <= 0.
cv_pred_rounded = np.round(np.asarray(y_pred_cv), 0).flatten()
cv_actual = np.asarray(y_cv)
cv_pct_error = np.abs(cv_pred_rounded - cv_actual) / np.abs(cv_actual) * 100

# Round only the final statistics; truncating every row to an int first
# biases the mean downwards.
error = int(round(cv_pct_error.mean()))
std = int(round(cv_pct_error.std()))
print('The error is ',error,'±',2*std)

In [None]:
# TODO: try a log or sqrt transform of some features (e.g. `cv`, and others)
# that seem to over-emphasize the price at high values.

In [None]:
# Mean signed residual (rounded prediction minus actual, truncated to int)
# over the CV rows whose absolute residual is above 6000.
signed_residuals = (np.round(y_pred_cv, 0).flatten() - y_cv).astype(int).values
signed_residuals[np.abs(signed_residuals) > 6000].mean()

In [None]:
# Summary statistics of the CV rows whose absolute residual is above 6000.
abs_residuals = abs(np.round(y_pred_cv, 0).flatten() - y_cv).astype(int).values
large_error_rows = X_cv[abs_residuals > 6000]
large_error_rows.describe().T

In [None]:
# Summary statistics of the CV rows whose absolute residual is at most 6000,
# i.e. the complement of the "> 6000" group, so that the two tables together
# cover every CV row. The predictions are flattened first so that the
# subtraction also works when y_pred_cv has shape (n, 1), as Keras returns.
abs_residuals = abs(np.round(y_pred_cv, 0).flatten() - y_cv).astype(int).values
X_cv[abs_residuals <= 6000].describe().T

In [None]:
# Absolute residual of every CV prediction, truncated to int. The predictions
# are flattened first so that this also works for (n, 1)-shaped predictions.
abs(np.round(y_pred_cv,0).flatten()-y_cv).astype(int).values

In [None]:
# The bare, undefined name `ca` raised a NameError and stopped Run All.
# NOTE(review): presumably an unfinished `cars` - showing a preview of the
# loaded dataset instead; confirm or delete this cell.
cars.head()

In [None]:
# Name of the column examined by the cells that follow.
a = 'marchas'

In [None]:
# Percentage of CV rows whose column `a` equals 0, rounded to 3 decimals.
zero_rows_cv = int((X_cv[a] == 0.).sum())
round(zero_rows_cv / len(X_cv) * 100, 3)

In [None]:
# Percentage of training rows whose column `a` equals 0, rounded to 3 decimals.
zero_rows_train = int((X_train[a] == 0.).sum())
round(zero_rows_train / len(X_train) * 100, 3)

In [None]:
# Boolean mask marking the CV rows where column `a` equals 0.
X_cv[a]==0