In [308]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from time import time
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from xgboost import plot_importance
from lightgbm import LGBMRegressor

from sklearn.model_selection import KFold, cross_val_score
import plotly.express as px
import shap

In [636]:
# Load the dataset
df = pd.read_csv('USA_cars_datasets.csv')

# Drop VIN duplicates: keep only the records whose VIN occurs exactly once
# (no copy of a repeated VIN survives; rows with a missing VIN map to NaN and
# are dropped as well). `.copy()` gives an independent frame so the column
# assignments below never write into a slice of the raw data.
vin_counts = df['vin'].value_counts()
df = df[df['vin'].map(vin_counts).eq(1)].sort_values('vin').copy()

# Transform `condition` into a numeric feature.
# The text is split on whitespace: the first token is a count and the second a
# time unit. 'Listing Expired' has no count, so it is encoded as -1.
df['# condition'] = df['condition'].apply(
    lambda text: -1 if text == 'Listing Expired' else int(text.split()[0])
)
df['mesure unit condition'] = df['condition'].apply(lambda text: text.split()[1])

# Express the count in hours. Any unit other than minutes/hours/days
# (e.g. the 'Expired' token of 'Listing Expired') falls back to -1.
# A single vectorised assignment replaces the chained-indexing writes
# (`df[col][mask] = ...`), which raise SettingWithCopyWarning and are not
# guaranteed to modify `df`.
condition_count = df['# condition']
condition_unit = df['mesure unit condition']
df['condition hours'] = np.select(
    [
        condition_unit == 'minutes',
        condition_unit == 'hours',
        condition_unit == 'days',
    ],
    [
        condition_count / 60,
        condition_count,
        condition_count * 24,
    ],
    default=-1,
)

# Remove records with a price of 100$ or less
df = df[df.price > 100]

# Define X and y
X = df[['brand', 'model', 'year', 'title_status',
        'mileage', 'color', 'state', 'condition hours']]
y = df['price']

# Column groups consumed by the preprocessing ColumnTransformer
num_feat = X.select_dtypes(include=np.number).columns
cat_feat = X.select_dtypes(include=['object']).columns

# Hold out 20% of the records for the final evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [637]:

# Numeric columns: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode as a dense array; categories unseen at fit time are ignored.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# - confirm against the installed version before upgrading.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer. The names 'num' and 'cat' (and
# the step names above) are looked up again by later cells, so keep them stable.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_feat),
    ('cat', categorical_transformer, cat_feat),
])

In [638]:
# Compare candidate regressors with 10-fold cross-validation on the training
# split. Every model is built with its default hyper-parameters and wrapped in
# the shared preprocessing pipeline.
kfold = KFold(n_splits=10)

# One summary row per model. The rows are collected in a list and turned into a
# DataFrame once at the end: `DataFrame.append` was removed in pandas 2.0 and
# re-copies the whole frame on every call.
records = []

for model in [
    DummyRegressor,
    LinearRegression,
    KNeighborsRegressor,
    DecisionTreeRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
    XGBRegressor,
    LGBMRegressor,
]:
    pipe = make_pipeline(preprocessor, model())

    # Time the whole cross-validation run (all folds, fit + score)
    start_time = time()
    scores = cross_val_score(pipe, X_train, y_train, scoring='r2', cv=kfold)
    time_mod = time() - start_time

    records.append({
        'Name': model.__name__,
        'Scores': round(scores.mean(), 2),
        'StdDev': round(scores.std(), 2),
        'Time(s)': round(time_mod, 2),
    })
    print('Analyzed {}.'.format(model.__name__))
print('Done!')

# Best mean R^2 first
results = pd.DataFrame(
    records, columns=['Name', 'Scores', 'StdDev', 'Time(s)']
).sort_values('Scores', ascending=False)

Analyzed DummyRegressor.
Analyzed LinearRegression.
Analyzed KNeighborsRegressor.
Analyzed DecisionTreeRegressor.
Analyzed RandomForestRegressor.
Analyzed GradientBoostingRegressor.
Analyzed XGBRegressor.
Analyzed LGBMRegressor.
Done!


In [639]:
# Cross-validation summary per model, sorted by mean R^2 (best first)
results

Unnamed: 0,Name,Scores,StdDev,Time(s)
6,XGBRegressor,0.69,0.04,6.62
4,RandomForestRegressor,0.67,0.05,15.88
7,LGBMRegressor,0.63,0.06,0.88
5,GradientBoostingRegressor,0.61,0.06,5.03
2,KNeighborsRegressor,0.59,0.05,0.6
3,DecisionTreeRegressor,0.41,0.13,0.64
0,DummyRegressor,-0.0,0.0,0.25
1,LinearRegression,-3.856534e+20,5.497865e+20,1.24


XGBRegressor is the best model. Let's go deeper.

In [640]:
# Tune the XGBoost pipeline over tree depth and row subsampling.
pipe_xgboost = make_pipeline(preprocessor, XGBRegressor())

# Keys follow the `<step name>__<parameter>` convention; `make_pipeline` names
# the step after the lower-cased class name ('xgbregressor').
parameters = {
    'xgbregressor__max_depth': [4, 6, 8],
    'xgbregressor__subsample': [0.5, 0.75, 1],
}

grid = GridSearchCV(pipe_xgboost, parameters)
grid.fit(X_train, y_train)

# GridSearchCV refits the best parameter combination on the full training data
# by default (refit=True), so `best_estimator_` is already fitted and fitting
# it a second time would only repeat that work.
beast_model = grid.best_estimator_
beast_model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['year', 'mileage', 'condition hours'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('one_hot',
                          

In [641]:
# Per-combination timings and fold scores from the grid search, one row per
# parameter combination
pd.DataFrame.from_dict(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgbregressor__max_depth,param_xgbregressor__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.561721,0.045837,0.010144,0.000512,4,0.5,"{'xgbregressor__max_depth': 4, 'xgbregressor__...",0.687665,0.653408,0.625249,0.638368,0.713736,0.663685,0.03257,7
1,0.517721,0.009095,0.009549,0.000168,4,0.75,"{'xgbregressor__max_depth': 4, 'xgbregressor__...",0.671144,0.707113,0.627503,0.676359,0.742621,0.684948,0.038423,2
2,0.496851,0.012108,0.009537,0.000111,4,1.0,"{'xgbregressor__max_depth': 4, 'xgbregressor__...",0.69279,0.668275,0.648914,0.670272,0.729458,0.681942,0.027529,4
3,0.639763,0.005312,0.010015,0.000351,6,0.5,"{'xgbregressor__max_depth': 6, 'xgbregressor__...",0.642054,0.679078,0.59572,0.668899,0.715123,0.660175,0.039838,8
4,0.675222,0.021473,0.011149,0.001731,6,0.75,"{'xgbregressor__max_depth': 6, 'xgbregressor__...",0.654933,0.685533,0.652889,0.658947,0.754595,0.681379,0.03845,5
5,0.72809,0.044096,0.017774,0.011652,6,1.0,"{'xgbregressor__max_depth': 6, 'xgbregressor__...",0.666232,0.694586,0.643188,0.659061,0.754239,0.683461,0.039109,3
6,1.041994,0.115109,0.013121,0.000613,8,0.5,"{'xgbregressor__max_depth': 8, 'xgbregressor__...",0.658007,0.665936,0.564388,0.648609,0.699177,0.647223,0.044787,9
7,1.287665,0.117618,0.015848,0.003213,8,0.75,"{'xgbregressor__max_depth': 8, 'xgbregressor__...",0.673045,0.687511,0.635523,0.672823,0.75877,0.685534,0.040462,1
8,1.101004,0.212337,0.013994,0.00253,8,1.0,"{'xgbregressor__max_depth': 8, 'xgbregressor__...",0.66774,0.687605,0.626857,0.679933,0.739889,0.680405,0.036382,6


The best model is the one with max_depth = 8 and subsample = 0.75. Other parameters could be explored further, but for now let's stop here.

In [642]:
# Predict prices for the held-out test split with the tuned pipeline
# (preprocessing + XGBoost); the result is aligned positionally with y_test.
y_pred = beast_model.predict(X_test)

In [643]:
# Actual vs predicted price on the test split, coloured by the absolute error
# relative to the ACTUAL price. Actual prices are all above 100 after the
# earlier price filter, so the ratio is always finite and non-negative;
# dividing by the prediction instead can blow up or flip sign when a
# prediction is close to zero or negative.
px.scatter(x=y_test, y=y_pred, color=abs(y_test - y_pred) / y_test,
           title='Actual price vs Predicted Price',
           labels={'x': 'Actual', 'y': 'Predict', 'color': 'relative absolute error'})




In [644]:
# Distribution of the residuals on the test split.
# Sign convention here is actual - predicted; the summary statistics in the
# next cells use predicted - actual, i.e. the opposite sign.
px.histogram(y_test-y_pred, title = 'Error distribution (Actual- prediction)')

In [645]:
# Mean signed error (predicted - actual) on the test split, reported together
# with sqrt(variance / n) as its uncertainty.
signed_error = y_pred - y_test
mean_error = np.mean(signed_error)
mean_error_uncertainty = np.sqrt(np.var(signed_error) / len(y_test))
print(f' Average Error = y_pred -y_actual =  {mean_error} +/- {mean_error_uncertainty}')


 Average Error = y_pred -y_actual =  -83.87768760335788 +/- 290.1598914825135


In [646]:
# Summary statistics (count, mean, std, quartiles, extremes) of the signed
# error, predicted - actual, on the test split
(y_pred - y_test).to_frame().describe()

Unnamed: 0,price
count,486.0
mean,-83.877688
std,6403.284235
min,-45473.049805
25%,-1891.473145
50%,81.978516
75%,2389.421875
max,26426.605469


The average error is close to 0, and 50% of records have an error between roughly -1,900 and +2,400 dollars.

# Feature Importance

In [654]:
# Recover the name of every column that reaches the XGBoost step.
# `named_transformers_` holds the transformers that were actually fitted inside
# the tuned pipeline, so nothing has to be re-fitted here (the `.transformers`
# attribute only holds the unfitted templates).
fitted_categorical = beast_model['columntransformer'].named_transformers_['cat']
ohe = fitted_categorical['one_hot']

# Dense one-hot encoding of the categorical training columns
X_train_encoded = fitted_categorical.transform(X_train[cat_feat])

# The ColumnTransformer emits the numeric columns first, then the one-hot
# columns, so the names are concatenated in that same order.
feature_name = np.append(num_feat, list(ohe.get_feature_names_out(cat_feat)))

# One row per model input column, most important first
feature_importance = pd.DataFrame({
    'Feature': feature_name,
    'importance': beast_model['xgbregressor'].feature_importances_,
}).sort_values('importance', ascending=False)

px.bar(feature_importance, x='Feature', y='importance')


Because the categorical variables have so many classes, the feature-importance plot is not very legible...

# Explainer

In [655]:
# Explain the fitted XGBoost step of the tuned pipeline with SHAP.
explainer = shap.TreeExplainer(beast_model['xgbregressor'])

# The explainer works on the model's own inputs, i.e. the preprocessed matrix.
# Transform the test split once and label the columns so the plots show
# readable feature names.
X_test_encoded = pd.DataFrame(
    beast_model['columntransformer'].transform(X_test),
    columns=feature_name,
)

# One row of SHAP values per test record, one column per encoded feature
shap_values = explainer.shap_values(X_test_encoded)

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


In [656]:
# Positional index (within the test split) of the record to explain;
# the next cell prints the same record.
index = 4

# Force plot: how each encoded feature moves this prediction away from the
# explainer's expected value
shap.force_plot(
    explainer.expected_value,
    shap_values=shap_values[index],
    features=X_test_encoded.iloc[index],
)

In [657]:
# Show the raw (un-encoded) features of the explained record together with its
# actual and predicted price
print(
    X_test.iloc[index],
    f'Actual price = {y_test.iloc[index]}',
    f'Predicted price = {y_pred[index]}',
    sep='\n',
)

brand                       ford
model                       door
year                        2016
title_status       clean vehicle
mileage                  84035.0
color                      white
state                      texas
condition hours             48.0
Name: 259, dtype: object
Actual price = 13270
Predicted price = 13350.548828125
