In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
from datetime import datetime
from dotenv import load_dotenv
import regex as re

#from comet_ml import Experiment
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.feature_extraction import FeatureHasher
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import set_config
import joblib
from xgboost import XGBRegressor

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plotting theme.
sns.set_theme()
# Display scikit-learn estimators as diagrams instead of plain text.
set_config(display='diagram')
# Relative path of the used-cars CSV (resolved against the working directory).
data_path = os.path.join("..", "data", "usedcars_dataset.csv")

## Loading Data

In [34]:
def load_usedcars_data(data_path):
    """Load the used-cars CSV and derive each listing's age in years.

    The file is read with ``;`` as separator and ``url`` as index, keeping
    only the columns listed in ``usecolumns``. ``Anno`` and ``date`` are cast
    to ``datetime64[ns]`` and ``age_years`` is their difference expressed in
    years of 365.2425 days. Rows missing ``price``, ``potenza_cv``,
    ``age_years`` or ``Chilometraggio`` are dropped.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Listings indexed by ``url`` with the extra ``age_years`` column.
    """
    usecolumns = ['url',
                    'Anno',
                    'date',
                    'Chilometraggio',
                    'Cilindrata_cm3',
                    'Cilindri',
                    'Consumo_comb_L100km',
                    'Marce',
                    'Peso_a_vuoto_kg',
                    'Porte',
                    'Posti',
                    'potenza_cv',
                    'Carburante',
                    'Carrozzeria',
                    'Tipo_di_cambio',
                    'Tipo_di_veicolo',
                    'Trazione',
                    'maker',
                    'model',
                    'price'
                 ]
    # Exactly the span of np.timedelta64(1, 'Y') (31,556,952 s = 365.2425 days).
    # Spelled out as a Timedelta because pandas >= 2.0 refuses year-unit
    # timedeltas ("Units 'M', 'Y' and 'y' are no longer supported").
    one_year = pd.Timedelta(days=365, hours=5, minutes=49, seconds=12)

    df = pd.read_csv(data_path, sep=";", index_col="url", usecols=usecolumns)
    df = (df.astype({'Anno':'datetime64[ns]', 'date':'datetime64[ns]'})
        .assign(age_years= lambda x: (x['date'] - x['Anno']) / one_year)
        .dropna(subset=['price', 'potenza_cv', 'age_years', 'Chilometraggio'])
        )
    return df

def filter_usedcars_data(df, max_price=1e6, min_price=100, max_age=40, max_cv=1000, max_km=1e6, max_engsize=1e4):
    """Drop listings whose age, power, mileage, price or engine size is out of range.

    A row is removed when any checked column is strictly below its lower
    bound or strictly above its upper bound. Comparisons against NaN are
    False, so a row with a missing value in a checked column is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``age_years``, ``potenza_cv``, ``Chilometraggio``,
        ``price`` and ``Cilindrata_cm3``.
    max_price, min_price : float
        Inclusive bounds for ``price``.
    max_age, max_cv, max_km, max_engsize : float
        Inclusive upper bounds for ``age_years``, ``potenza_cv``,
        ``Chilometraggio`` and ``Cilindrata_cm3``; the lower bound is 0.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that violate none of the bounds.
    """
    # column -> (lower bound, upper bound)
    limits = {
        'age_years': (0, max_age),
        'potenza_cv': (0, max_cv),
        'Chilometraggio': (0, max_km),
        'price': (min_price, max_price),
        'Cilindrata_cm3': (0, max_engsize),
    }

    rejected = None
    for column, (lower, upper) in limits.items():
        violation = (df[column] > upper) | (df[column] < lower)
        rejected = violation if rejected is None else (rejected | violation)

    # Negate the "bad row" mask (rather than testing "in range") so that
    # NaN values, which fail every comparison, do not cause a drop.
    return df[~rejected]

In [35]:
# Read the listings from disk, then drop rows that fall outside the default
# bounds of filter_usedcars_data.
df = load_usedcars_data(data_path)
df = filter_usedcars_data(df)

In [36]:
# Hold out 10% of the listings for testing. The fixed random_state makes the
# split (and every metric computed from it) reproducible across kernel
# restarts; without it each run shuffles the rows differently.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
# Separate the target column (price) from the features for both splits.
X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
X_test = df_test.drop(columns=['price'])
y_test = df_test['price']

## Building Pipeline

In [37]:
# Columns handled by the numeric transformer.
numerical_columns = [
    'Chilometraggio',
    'Cilindrata_cm3',
    'Cilindri',
    'Consumo_comb_L100km',
    'Marce',
    'Peso_a_vuoto_kg',
    'Porte',
    'Posti',
    'age_years',
    'potenza_cv',
]

# Categorical columns that are one-hot encoded.
categorical_ohe_columns = [
    'Carburante',
    'Carrozzeria',
    'Tipo_di_cambio',
    'Tipo_di_veicolo',
    'Trazione',
]

# Categorical columns that are feature-hashed.
categorical_fh_columns = [
    'maker',
    'model',
]

In [38]:
def _hashed_text_pipeline(n_features):
    """Fill missing labels with 'Altro', then hash them into ``n_features`` columns."""
    return make_pipeline(
        SimpleImputer(strategy='constant', fill_value='Altro'),
        FeatureHasher(n_features=n_features, input_type='string'),
        memory='cache',
    )


# Numerical features: median imputation, then standardisation.
num_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    memory='cache',
)

# One-hot encoded categorical features: most-frequent imputation; categories
# not seen during fit are ignored at transform time.
cat_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
    memory='cache',
)

# Hashed categorical features: 2**8 columns for the maker, 2**11 for the model.
hash_transformer_maker = _hashed_text_pipeline(2**8)
hash_transformer_model = _hashed_text_pipeline(2**11)

# Preprocessing: route each column group to its transformer and drop every
# column that is not listed.
preprocessor = make_column_transformer(
    (num_transformer, numerical_columns),
    (cat_transformer, categorical_ohe_columns),
    (hash_transformer_maker, ['maker']),
    (hash_transformer_model, ['model']),
    remainder='drop',
)

In [39]:
# XGBoost regressor trained with the absolute-error objective.
xgb_model = XGBRegressor(objective='reg:absoluteerror')

# Preprocessing followed by the regressor.
regressor_pipe = make_pipeline(preprocessor, xgb_model, memory='cache')

# The regressor is fitted on log(target); predictions are mapped back with exp.
log_target_regressor = TransformedTargetRegressor(
    regressor=regressor_pipe,
    func=np.log,
    inverse_func=np.exp,
)

# Outer pipeline; its single step is named 'transformedtargetregressor'.
global_pipe = make_pipeline(log_target_regressor, memory='cache')

In [40]:
# List every parameter of the nested pipeline; the double-underscore keys shown
# here are the names used to address parameters when building a search grid.
global_pipe.get_params()

{'memory': 'cache',
 'steps': [('transformedtargetregressor',
   TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                              regressor=Pipeline(memory='cache',
                                                 steps=[('columntransformer',
                                                         ColumnTransformer(transformers=[('pipeline-1',
                                                                                          Pipeline(memory='cache',
                                                                                                   steps=[('simpleimputer',
                                                                                                           SimpleImputer(strategy='median')),
                                                                                                          ('standardscaler',
                                                                                                           

## Simple Pipeline Fit & Prediction

In [41]:
# Fit the whole pipeline (preprocessing + regressor) on the training split.
global_pipe.fit(X=X_train, y=y_train)

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


In [None]:
# Predict the target for the training split and for the held-out split.
y_pred_train = global_pipe.predict(X=X_train)
y_pred_test = global_pipe.predict(X=X_test)

In [None]:
# Report R2 and MAE for each split, training first and held-out second.
for split_name, X_split, y_split, y_pred_split in (
    ('Train', X_train, y_train, y_pred_train),
    ('Test', X_test, y_test, y_pred_test),
):
    print(f'{split_name} metrics')
    print(f'R2: {global_pipe.score(X=X_split, y=y_split)}')
    print(f'MAE: {mean_absolute_error(y_split, y_pred_split)}')

Train metrics
R2: 0.7700269268326635
MAE: 3167.1526436124627
Test metrics
R2: 0.7514510242491645
MAE: 3400.734824420519


## Cross Validation

In [42]:
# Path from the outer pipeline down to the XGBRegressor step.
xgb_prefix = 'transformedtargetregressor__regressor__xgbregressor__'

# Candidate values per XGBoost hyper-parameter.
xgb_search_space = {
    'n_estimators': [150],
    'eta': [0.1],
    'max_depth': [6],
    'gamma': [0.3],
    'reg_lambda': [1],
}
param_grid = {
    xgb_prefix + name: candidates
    for name, candidates in xgb_search_space.items()
}

# 5-fold search scored on MAE and R2; the best MAE configuration is refit.
scoring_metrics = ['neg_mean_absolute_error', 'r2']
grid_search = GridSearchCV(
    global_pipe,
    param_grid=param_grid,
    scoring=scoring_metrics,
    cv=5,
    refit='neg_mean_absolute_error',
)

In [43]:
# Run the cross-validated search on the training split.
grid_search.fit(X_train, y_train)

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS 

In [31]:
# Cross-validation results as a table, best MAE rank first.
cv_results_table = pd.DataFrame(grid_search.cv_results_).sort_values(
    by='rank_test_neg_mean_absolute_error'
)

# Lift the row/column display limits for this one table only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(cv_results_table)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_transformedtargetregressor__regressor__xgbregressor__eta,param_transformedtargetregressor__regressor__xgbregressor__gamma,param_transformedtargetregressor__regressor__xgbregressor__max_depth,param_transformedtargetregressor__regressor__xgbregressor__n_estimators,param_transformedtargetregressor__regressor__xgbregressor__reg_lambda,params,split0_test_neg_mean_absolute_error,split1_test_neg_mean_absolute_error,split2_test_neg_mean_absolute_error,split3_test_neg_mean_absolute_error,split4_test_neg_mean_absolute_error,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
6,12.768128,0.094771,0.181971,0.002466,0.1,0.3,6,150,1.0,{'transformedtargetregressor__regressor__xgbre...,-2440.592189,-2337.22275,-2229.276578,-2381.495652,-2507.280992,-2379.173632,94.252989,1,0.801677,0.888676,0.892251,0.833681,0.696159,0.822489,0.071785,1
4,12.694197,0.258038,0.180798,0.003322,0.1,0.1,6,150,1.2,{'transformedtargetregressor__regressor__xgbre...,-2474.302593,-2316.941568,-2235.972534,-2390.369824,-2481.54991,-2379.827286,93.843513,2,0.806489,0.893879,0.891843,0.83257,0.674329,0.819822,0.080225,3
3,12.831336,0.273034,0.183466,0.005655,0.1,0.1,6,150,1.0,{'transformedtargetregressor__regressor__xgbre...,-2411.451745,-2362.3308,-2265.251604,-2360.942761,-2523.715931,-2384.738568,84.111574,3,0.791547,0.849321,0.889686,0.836803,0.697751,0.813022,0.065589,8
8,12.870464,0.290376,0.183389,0.003496,0.1,0.3,6,150,1.5,{'transformedtargetregressor__regressor__xgbre...,-2454.346545,-2315.30867,-2237.310719,-2413.701692,-2525.549786,-2389.243482,101.946988,4,0.788859,0.885729,0.885002,0.81895,0.686084,0.812925,0.073756,9
7,12.728187,0.088751,0.183191,0.000847,0.1,0.3,6,150,1.2,{'transformedtargetregressor__regressor__xgbre...,-2486.369463,-2333.389007,-2228.275835,-2400.428132,-2518.417999,-2393.376087,105.061432,5,0.792958,0.893177,0.902016,0.831597,0.683449,0.820639,0.079488,2
2,12.666907,0.098966,0.184476,0.004265,0.1,0.0,6,150,1.5,{'transformedtargetregressor__regressor__xgbre...,-2478.489597,-2335.897709,-2247.673239,-2320.472418,-2586.462593,-2393.799111,121.96216,6,0.788617,0.896104,0.89036,0.834849,0.681753,0.818337,0.078799,5
1,12.607214,0.11332,0.181425,0.001034,0.1,0.0,6,150,1.2,{'transformedtargetregressor__regressor__xgbre...,-2506.387692,-2306.702899,-2222.365581,-2392.313544,-2551.354546,-2395.824852,122.030158,7,0.795679,0.894893,0.899714,0.832494,0.671341,0.818824,0.083431,4
0,21.235334,2.169251,0.17105,0.026705,0.1,0.0,6,150,1.0,{'transformedtargetregressor__regressor__xgbre...,-2488.613981,-2357.727004,-2254.29345,-2366.10092,-2516.427127,-2396.632496,95.412332,8,0.783922,0.884652,0.887253,0.828055,0.692429,0.815262,0.072431,6
5,13.021984,0.711174,0.18398,0.003914,0.1,0.1,6,150,1.5,{'transformedtargetregressor__regressor__xgbre...,-2506.26281,-2333.034173,-2277.323688,-2372.125429,-2503.164557,-2398.382131,91.904858,9,0.782731,0.874368,0.888822,0.833261,0.692622,0.814361,0.071167,7


## Comet Tracking

In [23]:
# Load variables from a .env file into the process environment, then read the
# Comet settings; each value is None when its variable is not set.
load_dotenv()
COMET_API_KEY = os.getenv('COMET_API_KEY')
COMET_PROJECT_NAME = os.getenv('COMET_PROJECT_NAME')
COMET_WORKSPACE = os.getenv('COMET_WORKSPACE')

In [24]:
# Run-level parameters logged alongside every tracked experiment.
pipeline_params = dict(
    model_type="xgb",
    transform_target=True,
)

In [None]:
# Log one Comet experiment per hyper-parameter combination of the grid search.
# NOTE(review): `Experiment` is provided by comet_ml, whose import at the top
# of the notebook is commented out; re-enable it before running this cell,
# otherwise the first iteration fails with a NameError.
cv_results = grid_search.cv_results_

for i in range(len(cv_results['params'])):
    exp = Experiment(api_key=COMET_API_KEY,
                        project_name=COMET_PROJECT_NAME,
                        workspace=COMET_WORKSPACE,
                        auto_param_logging=False
                    )
    try:
        exp.log_parameters(pipeline_params)

        for k, v in cv_results.items():
            if k == "params":
                # The i-th entry is the parameter dict of this combination.
                exp.log_parameters(v[i])
            else:
                # Every other entry is logged as a metric under its own key.
                exp.log_metric(k, v[i])
    finally:
        # Close the experiment even when a logging call raises, so a failed
        # iteration does not leave it open.
        exp.end()
    

## Best Model Save

In [16]:
# Parameter combination selected by the search (ranked on the refit metric,
# neg_mean_absolute_error).
grid_search.best_params_

{'transformedtargetregressor__regressor__xgbregressor__eta': 0.1,
 'transformedtargetregressor__regressor__xgbregressor__gamma': 0.3,
 'transformedtargetregressor__regressor__xgbregressor__max_depth': 6,
 'transformedtargetregressor__regressor__xgbregressor__n_estimators': 150,
 'transformedtargetregressor__regressor__xgbregressor__reg_lambda': 1}

In [44]:
# Timestamped file name, e.g. XGB_2023-03-18_18_33_16_602109.joblib.
# strftime spells the layout out explicitly: the previous character class
# "[.,:,-, ]" never matched "-" (",-," parses as a range), and
# str(datetime.now()) omits the microsecond field when it is exactly zero.
timestamp = datetime.now().strftime('%Y-%m-%d_%H_%M_%S_%f')
out_model_file_name = 'XGB_' + timestamp + '.joblib'

# joblib.dump does not create missing directories.
models_dir = "models"
os.makedirs(models_dir, exist_ok=True)

joblib.dump(grid_search.best_estimator_, os.path.join(models_dir, out_model_file_name))

['models/XGB_2023-03-18_18_33_16_602109.joblib']