In [1]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance



In [2]:
ls ../data/

[0m[01;32mcar.h5[0m*


In [3]:
df = pd.read_hdf('../data/car.h5')

## Dummy Model

In [4]:
df.select_dtypes(np.number).columns

Index(['price_value', 'car_id'], dtype='object')

In [5]:
feats = ['car_id']
X = df[ feats ].values
y = df['price_value'].values

model = DummyRegressor()
model.fit(X,y)
y_pred = model.predict(X)

mae(y, y_pred)

39465.934630440985

In [6]:
[x for x in df.columns if 'price' in x]

['price_currency', 'price_details', 'price_value']

In [7]:
df['price_currency'].value_counts()

PLN    106290
EUR       204
Name: price_currency, dtype: int64

In [8]:
df = df[df['price_currency'] != 'EUR']

In [9]:
df.shape

(106290, 155)

## Features factorization

In [10]:
SUFFIX_CAT = '__cat'
for feat in df.columns:
    if isinstance(df[feat][0],list): continue
        
    factorized_values = df[feat].factorize()[0]
    if SUFFIX_CAT in feat:
        df[feat] = factorized_values
    else:
        df[feat + SUFFIX_CAT] = factorized_values
            

In [17]:
cat_feats = [ x for x in df.columns if SUFFIX_CAT in x]
len(cat_feats)

154

In [19]:
cat_feats = [ x for x in cat_feats if 'price' not in x]
len(cat_feats)

151

In [15]:
def run_model(model,feats):
    X = df[feats].values
    y = df['price_value'].values

    #model = DecisionTreeRegressor(max_depth=5)
    scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
    return np.mean(scores), np.std(scores)

In [20]:
run_model(DecisionTreeRegressor(max_depth=5), cat_feats)

(-19566.588937368324, 90.6181486516617)

## Random Forest

In [21]:
model = RandomForestRegressor(max_depth=5, n_estimators=50, random_state=0)
run_model(model, cat_feats)

(-18734.2072708522, 109.87074106274046)

## XGBoost

In [23]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    'seed': 0,
}

model = xgb.XGXGBoostXGBoostXGBoostXGBoostBRegressor(**xgb_params)
run_model(model, cat_feats)

(-13039.290196724838, 109.36715375706265)

## PermutationImportance by XGBoost

In [24]:
m = xgb.XGBRegressor(**xgb_params)
m.fit(X, y)

imp = PermutationImportance(m, random_state=0).fit(X,y)
eli5.show_weights(imp, feature_names=cat_feats)

Weight,Feature
0.1209  ± 0.0019,param_napęd__cat
0.1175  ± 0.0030,param_rok-produkcji__cat
0.1113  ± 0.0013,param_stan__cat
0.0625  ± 0.0019,param_skrzynia-biegów__cat
0.0527  ± 0.0016,param_faktura-vat__cat
0.0461  ± 0.0015,param_moc__cat
0.0275  ± 0.0008,param_marka-pojazdu__cat
0.0230  ± 0.0004,param_typ__cat
0.0227  ± 0.0007,feature_kamera-cofania__cat
0.0191  ± 0.0007,param_pojemność-skokowa__cat


In [53]:
s = """
0.1209 ± 0.0019 	param_napęd__cat
0.1175 ± 0.0030 	param_rok-produkcji__cat
0.1113 ± 0.0013 	param_stan__cat
0.0625 ± 0.0019 	param_skrzynia-biegów__cat
0.0527 ± 0.0016 	param_faktura-vat__cat
0.0461 ± 0.0015 	param_moc__cat
0.0275 ± 0.0008 	param_marka-pojazdu__cat
0.0230 ± 0.0004 	param_typ__cat
0.0227 ± 0.0007 	feature_kamera-cofania__cat
0.0191 ± 0.0007 	param_pojemność-skokowa__cat
0.0150 ± 0.0009 	seller_name__cat
0.0141 ± 0.0008 	param_kod-silnika__cat
0.0130 ± 0.0002 	param_model-pojazdu__cat
0.0119 ± 0.0004 	feature_wspomaganie-kierownicy__cat
0.0109 ± 0.0003 	param_wersja__cat
0.0095 ± 0.0007 	feature_czujniki-parkowania-przednie__cat
0.0089 ± 0.0003 	feature_asystent-pasa-ruchu__cat
0.0082 ± 0.0002 	feature_regulowane-zawieszenie__cat
0.0081 ± 0.0004 	feature_system-start-stop__cat
0.0072 ± 0.0004 	feature_światła-led__cat 
"""

In [59]:

s = s.replace('\n',' ')
s = s.replace('\t',' ')
dirty_feats = s.split(" ")
feats = dirty_feats[5::5]

In [60]:
feats



['param_napęd__cat',
 'param_rok-produkcji__cat',
 'param_stan__cat',
 'param_skrzynia-biegów__cat',
 'param_faktura-vat__cat',
 'param_moc__cat',
 'param_marka-pojazdu__cat',
 'param_typ__cat',
 'feature_kamera-cofania__cat',
 'param_pojemność-skokowa__cat',
 'seller_name__cat',
 'param_kod-silnika__cat',
 'param_model-pojazdu__cat',
 'feature_wspomaganie-kierownicy__cat',
 'param_wersja__cat',
 'feature_czujniki-parkowania-przednie__cat',
 'feature_asystent-pasa-ruchu__cat',
 'feature_regulowane-zawieszenie__cat',
 'feature_system-start-stop__cat',
 'feature_światła-led__cat']

In [61]:
len(feats)

20

In [62]:
run_model(xgb.XGBRegressor(**xgb_params),feats)

(-13240.835942843716, 95.7039217631258)

In [63]:
# wynik pogorszyl sie nieznacznie

In [64]:
df['param_napęd'].unique()

array([None, 'Na przednie koła', '4x4 (dołączany automatycznie)',
       'Na tylne koła', '4x4 (dołączany ręcznie)', '4x4 (stały)'],
      dtype=object)

In [65]:
df['param_rok-produkcji'].unique()

array(['2018', '2011', '2015', '2009', '2017', '2012', '2013', '2007',
       '2001', '2016', '2006', '2008', '2004', '1999', '2000', '2010',
       '2005', '2002', '1998', '2014', '2003', '1982', '1995', '1997',
       '1992', '1993', '1994', '1996', '1989', '1988', '1967', '1987',
       '1959', '1990', '1991', '1974', None, '1975', '1973', '1985',
       '1984', '1986', '1981', '1979', '1960', '1983', '1978', '1964',
       '1980', '1972', '1969', '1956', '1966', '1977', '1971', '1963',
       '1953', '1961', '1952', '1949', '1976', '1965', '1937', '1968',
       '1958', '1962', '1955', '1970', '1933', '1929', '1957', '1944',
       '1954', '1932', '1936', '1947', '1948'], dtype=object)

In [67]:
df['param_rok-produkcji__cat'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, -1, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
       67, 68, 69, 70, 71, 72, 73, 74, 75])

In [68]:
# rok tracony w wyniku faktoryzacji

In [71]:
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(lambda x: -1 if str(x) == 'None' else int(x))
df['param_rok-produkcji'].unique()

array([2018, 2011, 2015, 2009, 2017, 2012, 2013, 2007, 2001, 2016, 2006,
       2008, 2004, 1999, 2000, 2010, 2005, 2002, 1998, 2014, 2003, 1982,
       1995, 1997, 1992, 1993, 1994, 1996, 1989, 1988, 1967, 1987, 1959,
       1990, 1991, 1974,   -1, 1975, 1973, 1985, 1984, 1986, 1981, 1979,
       1960, 1983, 1978, 1964, 1980, 1972, 1969, 1956, 1966, 1977, 1971,
       1963, 1953, 1961, 1952, 1949, 1976, 1965, 1937, 1968, 1958, 1962,
       1955, 1970, 1933, 1929, 1957, 1944, 1954, 1932, 1936, 1947, 1948])

In [72]:
feats

['param_napęd__cat',
 'param_rok-produkcji__cat',
 'param_stan__cat',
 'param_skrzynia-biegów__cat',
 'param_faktura-vat__cat',
 'param_moc__cat',
 'param_marka-pojazdu__cat',
 'param_typ__cat',
 'feature_kamera-cofania__cat',
 'param_pojemność-skokowa__cat',
 'seller_name__cat',
 'param_kod-silnika__cat',
 'param_model-pojazdu__cat',
 'feature_wspomaganie-kierownicy__cat',
 'param_wersja__cat',
 'feature_czujniki-parkowania-przednie__cat',
 'feature_asystent-pasa-ruchu__cat',
 'feature_regulowane-zawieszenie__cat',
 'feature_system-start-stop__cat',
 'feature_światła-led__cat']

In [75]:
index=feats.index('param_rok-produkcji__cat')

In [76]:
feats[index]='param_rok-produkcji'

In [77]:
feats

['param_napęd__cat',
 'param_rok-produkcji',
 'param_stan__cat',
 'param_skrzynia-biegów__cat',
 'param_faktura-vat__cat',
 'param_moc__cat',
 'param_marka-pojazdu__cat',
 'param_typ__cat',
 'feature_kamera-cofania__cat',
 'param_pojemność-skokowa__cat',
 'seller_name__cat',
 'param_kod-silnika__cat',
 'param_model-pojazdu__cat',
 'feature_wspomaganie-kierownicy__cat',
 'param_wersja__cat',
 'feature_czujniki-parkowania-przednie__cat',
 'feature_asystent-pasa-ruchu__cat',
 'feature_regulowane-zawieszenie__cat',
 'feature_system-start-stop__cat',
 'feature_światła-led__cat']

In [80]:
run_model(xgb.XGBRegressor(**xgb_params),feats)

(-11197.83713694348, 98.22041147876314)

In [81]:
df['param_moc'].unique()


array(['90 KM', '115 KM', '262 KM', '110 KM', '310 KM', '105 KM',
       '140 KM', '175 KM', '125 KM', '185 KM', '190 KM', '440 KM',
       '141 KM', '200 KM', '224 KM', '75 KM', '99 KM', '184 KM', '109 KM',
       '233 KM', '116 KM', '68 KM', '286 KM', '126 KM', '160 KM',
       '135 KM', '120 KM', '272 KM', None, '150 KM', '180 KM', '136 KM',
       '102 KM', '131 KM', '218 KM', '245 KM', '170 KM', '112 KM',
       '250 KM', '252 KM', '73 KM', '100 KM', '313 KM', '101 KM',
       '285 KM', '70 KM', '383 KM', '174 KM', '277 KM', '132 KM',
       '130 KM', '215 KM', '60 KM', '330 KM', '163 KM', '177 KM', '98 KM',
       '78 KM', '189 KM', '156 KM', '143 KM', '69 KM', '113 KM', '65 KM',
       '122 KM', '82 KM', '251 KM', '95 KM', '197 KM', '235 KM', '238 KM',
       '171 KM', '381 KM', '400 KM', '178 KM', '80 KM', '165 KM', '85 KM',
       '258 KM', '142 KM', '204 KM', '124 KM', '55 KM', '144 KM',
       '231 KM', '248 KM', '152 KM', '181 KM', '210 KM', '340 KM',
       '129 KM', '147 

In [91]:
df['param_moc']= df['param_moc'].map(lambda x: -1 if str(x) == 'None' else int(x.split(' ')[0])  )


AttributeError: 'int' object has no attribute 'split'

0         2018
2         2011
3         2015
4         2009
5         2017
          ... 
160609    2013
160610    2013
160611    2006
160614    2003
160615    2006
Name: param_rok-produkcji, Length: 106290, dtype: int64

In [92]:
index = feats.index('param_moc__cat')
index

5

In [93]:
feats[5] = 'param_moc'

In [94]:
df['param_moc'].unique()

array([ 90, 115, 262, 110, 310, 105, 140, 175, 125, 185, 190, 440, 141,
       200, 224,  75,  99, 184, 109, 233, 116,  68, 286, 126, 160, 135,
       120, 272,  -1, 150, 180, 136, 102, 131, 218, 245, 170, 112, 250,
       252,  73, 100, 313, 101, 285,  70, 383, 174, 277, 132, 130, 215,
        60, 330, 163, 177,  98,  78, 189, 156, 143,  69, 113,  65, 122,
        82, 251,  95, 197, 235, 238, 171, 381, 400, 178,  80, 165,  85,
       258, 142, 204, 124,  55, 144, 231, 248, 152, 181, 210, 340, 129,
       147,  50,  54, 290, 306, 193,  77, 164,  96, 194, 111, 166, 206,
       118, 360, 211, 271, 455, 280, 106, 114, 421,  74, 213, 121, 275,
       435, 384, 326,  88, 220, 260,  64,  86, 128, 256, 240, 244, 162,
       237, 350,  35, 265, 202, 133,  83, 117, 146,  92, 192, 145, 525,
       254, 182, 328, 367, 148, 456,  97, 270, 107, 108, 203, 155,  94,
        93, 241,  20,  71, 173,  58, 205, 236,   1, 557,  84, 457,  72,
       295, 134, 425, 228,  81, 230, 201,  87, 234, 299, 585, 20

In [95]:
feats

['param_napęd__cat',
 'param_rok-produkcji',
 'param_stan__cat',
 'param_skrzynia-biegów__cat',
 'param_faktura-vat__cat',
 'param_moc',
 'param_marka-pojazdu__cat',
 'param_typ__cat',
 'feature_kamera-cofania__cat',
 'param_pojemność-skokowa__cat',
 'seller_name__cat',
 'param_kod-silnika__cat',
 'param_model-pojazdu__cat',
 'feature_wspomaganie-kierownicy__cat',
 'param_wersja__cat',
 'feature_czujniki-parkowania-przednie__cat',
 'feature_asystent-pasa-ruchu__cat',
 'feature_regulowane-zawieszenie__cat',
 'feature_system-start-stop__cat',
 'feature_światła-led__cat']

In [96]:
run_model(xgb.XGBRegressor(**xgb_params),feats)

(-9602.94111071797, 57.96672683246094)

In [103]:
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x) == 'None' else x.split('cm')[0].replace(' ',''))

In [106]:
index = feats.index('param_pojemność-skokowa__cat')
feats[index] = 'param_pojemność-skokowa'

In [107]:
run_model(xgb.XGBRegressor(**xgb_params),feats)

(-9449.513980284812, 81.47168211987172)