In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os

from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.feature_extraction import FeatureHasher
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import set_config
import joblib


from time import perf_counter
from xgboost import XGBRegressor

In [4]:
%matplotlib inline
sns.set_theme()
set_config(display='diagram')
data_path = os.path.join("..", "data", "usedcars_dataset.csv")

In [5]:
def load_usedcars_data(data_path):
    df = pd.read_csv(data_path, sep=";", index_col="url")
    df = (df.astype({'Anno':'datetime64[ns]', 'date':'datetime64[ns]'})
        .assign(age_years= lambda x: (x['date'] - x['Anno']) / np.timedelta64(1, 'Y'))
        .dropna(subset=['price', 'potenza_kw', 'age_years', 'Chilometraggio'])
        .drop(columns=['Extra', 'Comfort', 'Sicurezza'])
        )
    return df

def filter_usedcars_data(df, max_price=1e6, min_price=100, max_age=40, max_cv=1000, max_km=1e6, max_engsize=1e4):
    df = df.query("~((age_years>@max_age or age_years < 0) \
                                or (potenza_cv > @max_cv or potenza_cv < 0) \
                                or (Chilometraggio>@max_km or Chilometraggio < 0) \
                                or (price > @max_price or price < @min_price ) \
                                or (Cilindrata_cm3 > @max_engsize or Cilindrata_cm3 < 0 ) \
                    )"
                )
    return df

In [6]:
df = load_usedcars_data(data_path)
df = filter_usedcars_data(df)
df_train, df_test = train_test_split(df, test_size=0.1)

In [7]:
numerical_columns = ['Chilometraggio',
                    'Cilindrata_cm3',
                    'Cilindri',
                    'Consumo_comb_L100km',
                    'Marce',
                    'Peso_a_vuoto_kg',
                    'Porte',
                    'Posti',
                    'age_years',
                    'potenza_cv'
                ]

categorical_ohe_columns = ['Carburante',
                            'Carrozzeria',
                            'Tipo_di_cambio',
                            'Tipo_di_veicolo',
                            'Trazione'
                        ]
                        
categorical_fh_columns = ['maker', 'model'] 

In [8]:
#Numerical features pipeline
num_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    memory = 'cache'
)

#Categorical OneHotEncoding features pipeline
cat_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
    memory = 'cache'
)

#Catgeorical Hashed features pipeline
hash_transformer_maker = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='Altro'),
    FeatureHasher(n_features=2**8, input_type='string'),
    memory = 'cache'
)

#Catgeorical Hashed features pipeline
hash_transformer_model = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='Altro'),
    FeatureHasher(n_features=2**11, input_type='string'),
    memory = 'cache'
)

# Preprocessing pipeline
preprocessor = make_column_transformer(
    (num_transformer, numerical_columns),
    (cat_transformer, categorical_ohe_columns),
    (hash_transformer_maker, ['maker']),
    (hash_transformer_model, ['model']),
    remainder = 'drop'
)

In [15]:
regressor_pipe = make_pipeline(
    preprocessor,
    XGBRegressor(
        objective='reg:absoluteerror'
        ),
    memory = 'cache'
    )

global_pipe = make_pipeline(
    TransformedTargetRegressor(
        regressor=regressor_pipe,
        func=np.log,
        inverse_func=np.exp
    ),
    memory = 'cache'
)

In [None]:
X_train = df_train.drop(columns=['price'])
y_train = df_train['price']

In [16]:
global_pipe.fit(X=X_train, y=y_train)

In [92]:
global_pipe.get_params()

{'memory': 'cache',
 'steps': [('transformedtargetregressor',
   TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                              regressor=Pipeline(memory='cache',
                                                 steps=[('columntransformer',
                                                         ColumnTransformer(transformers=[('pipeline-1',
                                                                                          Pipeline(memory='cache',
                                                                                                   steps=[('simpleimputer',
                                                                                                           SimpleImputer(strategy='median')),
                                                                                                          ('standardscaler',
                                                                                                           

In [17]:
y_pred = global_pipe.predict(X=X_train)

In [18]:
print(global_pipe.score(X=df_test.drop(columns=['price']), y=df_test['price']))
print(mean_absolute_error(y_train, y_pred))

0.8505011693932225
2990.094114208604


In [19]:
param_grid = dict(
    transformedtargetregressor__regressor__xgbregressor__n_estimators=[100, 200, 300],
    transformedtargetregressor__regressor__xgbregressor__eta=[0.3],
    transformedtargetregressor__regressor__xgbregressor__max_depth=[6],
    transformedtargetregressor__regressor__xgbregressor__gamma=[0.0],
    transformedtargetregressor__regressor__xgbregressor__reg_lambda=[1]
)
#transformedtargetregressor__regressor__xgbregressor__

grid_search = GridSearchCV(
    global_pipe, 
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5
    )

In [20]:
grid_search.fit(X_train, y_train)

In [21]:
pd.DataFrame.from_dict(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_transformedtargetregressor__regressor__xgbregressor__eta,param_transformedtargetregressor__regressor__xgbregressor__gamma,param_transformedtargetregressor__regressor__xgbregressor__max_depth,param_transformedtargetregressor__regressor__xgbregressor__n_estimators,param_transformedtargetregressor__regressor__xgbregressor__reg_lambda,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,8.868398,1.750409,0.199879,0.038229,0.3,0.0,6,100,1,{'transformedtargetregressor__regressor__xgbre...,-3369.195097,-3359.676341,-3461.676082,-3439.176857,-3479.062476,-3421.75737,48.575022,1
1,17.106412,0.161781,0.216367,0.004917,0.3,0.0,6,200,1,{'transformedtargetregressor__regressor__xgbre...,-3369.194805,-3359.802035,-3444.60781,-3437.36376,-3548.625834,-3431.918849,67.744103,2
2,24.566151,0.34659,0.254915,0.019018,0.3,0.0,6,300,1,{'transformedtargetregressor__regressor__xgbre...,-3369.194514,-3360.192467,-3572.51703,-3436.788187,-3578.350715,-3463.408583,95.24164,3


In [24]:
grid_search.best_params_

{'transformedtargetregressor__regressor__xgbregressor__eta': 0.3,
 'transformedtargetregressor__regressor__xgbregressor__gamma': 0.0,
 'transformedtargetregressor__regressor__xgbregressor__max_depth': 6,
 'transformedtargetregressor__regressor__xgbregressor__n_estimators': 100,
 'transformedtargetregressor__regressor__xgbregressor__reg_lambda': 1}

In [27]:
out_model_file = os.path.join("models", "XGB_CV.pkl")
joblib.dump(grid_search.best_estimator_, out_model_file)

['models\\XGB_CV.pkl']