In [60]:
import pandas as pd

In [61]:
# Load the BMW car dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('cars/bmw.csv')

In [62]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0


# Обучающая и тестовая выборка (train and test)

In [63]:
from sklearn.model_selection import train_test_split

In [64]:
train, test = train_test_split(df, train_size=0.6, random_state=42)

# Валидационная выборка

In [65]:
len(train) / len(df)

0.5999443465355718

In [66]:
len(test) / len(df)

0.4000556534644282

In [67]:
val, test = train_test_split(test, train_size=0.5, random_state=42)

In [68]:
# Share of rows used for training; the second split does not touch `train`.
len(train) / len(df)

0.5999443465355718

In [69]:
# Share of rows in the validation split (about 0.2 is expected).
len(val) / len(df)

0.19998144884519062

In [70]:
# Share of rows left in the final test split (about 0.2 is expected).
len(test) / len(df)

0.20007420461923756

# Список фичей

In [71]:
# Baseline feature set: two numeric columns plus one categorical column.
y = ['price']  # target
cat_features = ['transmission']  # columns CatBoost should treat as categorical
X = ['year', 'engineSize'] + cat_features  # features

# Принцип преобразования категориальных фичей

In [72]:
# Illustration only: one-hot encoding of the categorical column. The result is
# displayed, not stored; the model below receives the raw column and is told
# about it through `cat_features`.
pd.get_dummies(train['transmission'])

Unnamed: 0,Automatic,Manual,Semi-Auto
5133,False,False,True
2121,True,False,False
6576,False,False,True
8330,False,True,False
3161,False,False,True
...,...,...,...
5734,False,False,True
5191,False,False,True
5390,False,False,True
860,False,False,True


# Запуск CatBoost

In [73]:
from catboost import CatBoostRegressor # https://github.com/catboost/catboost/issues/2671

In [74]:
# Baseline regressor: MAPE is the metric reported during training, the seed is
# fixed for reproducibility, and progress is logged every 100 iterations.
model = CatBoostRegressor(
    cat_features=cat_features,
    eval_metric='MAPE',
    random_seed=42,
    verbose=100,
)

In [75]:
# Fit on the training split; the validation split is passed as eval_set so the
# metric is also reported on it (the "test" column of the training log).
model.fit(
    train[X],
    train[y],
    eval_set=(val[X], val[y]),
)

Learning rate set to 0.068263
0:	learn: 0.4603375	test: 0.4555509	best: 0.4555509 (0)	total: 5.38ms	remaining: 5.38s
100:	learn: 0.1546393	test: 0.1529687	best: 0.1529687 (100)	total: 230ms	remaining: 2.05s
200:	learn: 0.1528213	test: 0.1517332	best: 0.1517332 (200)	total: 400ms	remaining: 1.59s
300:	learn: 0.1517532	test: 0.1514386	best: 0.1514000 (288)	total: 642ms	remaining: 1.49s
400:	learn: 0.1509722	test: 0.1515508	best: 0.1514000 (288)	total: 947ms	remaining: 1.41s
500:	learn: 0.1502967	test: 0.1516166	best: 0.1514000 (288)	total: 1.21s	remaining: 1.2s
600:	learn: 0.1499412	test: 0.1516849	best: 0.1514000 (288)	total: 1.42s	remaining: 941ms
700:	learn: 0.1494768	test: 0.1516928	best: 0.1514000 (288)	total: 1.63s	remaining: 694ms
800:	learn: 0.1490851	test: 0.1518297	best: 0.1514000 (288)	total: 1.84s	remaining: 456ms
900:	learn: 0.1487055	test: 0.1518619	best: 0.1514000 (288)	total: 2.04s	remaining: 224ms
999:	learn: 0.1484264	test: 0.1520576	best: 0.1514000 (288)	total: 2.25s	r

<catboost.core.CatBoostRegressor at 0x151095ca0>

In [76]:
# Predict prices for the test split; shown here only for a quick look.
model.predict(test[X])

array([45436.28801924, 21738.83525716, 16205.32627896, ...,
       28444.11570854, 16205.32627896, 27824.78614511])

In [77]:
# Store the predictions next to the true prices. This adds a column to `test`
# in place, so `test` carries `price_pred` in every later cell.
test['price_pred'] = model.predict(test[X])

In [78]:
# Inspect the test rows together with the new `price_pred` column.
test

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,price_pred
4984,X6,2019,52990,Semi-Auto,3086,Diesel,145,34.9,3.0,45436.288019
4880,X1,2018,24081,Semi-Auto,13245,Diesel,150,60.1,2.0,21738.835257
9132,1 Series,2016,12999,Automatic,68949,Petrol,200,43.5,2.0,16205.326279
6451,1 Series,2020,11995,Semi-Auto,10,Petrol,150,34.5,2.0,32088.707415
7199,3 Series,2020,29875,Semi-Auto,150,Petrol,145,42.2,2.0,32088.707415
...,...,...,...,...,...,...,...,...,...,...
9627,3 Series,2015,14999,Automatic,78680,Diesel,160,52.3,3.0,19357.530409
6096,4 Series,2019,29676,Automatic,7365,Petrol,145,41.5,3.0,44808.449332
1747,5 Series,2019,30570,Automatic,3067,Hybrid,145,49.6,2.0,28444.115709
4023,3 Series,2016,14999,Automatic,70054,Hybrid,0,148.7,2.0,16205.326279


In [79]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [80]:
def error(y_true, y_pred):
    """Print the regression errors of ``y_pred`` measured against ``y_true``.

    Two lines are written to stdout: the mean absolute error first, then the
    mean absolute percentage error (a fraction, not multiplied by 100).
    Nothing is returned.
    """
    for metric in (mean_absolute_error, mean_absolute_percentage_error):
        print(metric(y_true, y_pred))

In [81]:
# MAE and MAPE of the three-feature baseline on the test split.
error(test['price'], test['price_pred'])

3366.679482256595
0.15363582890850797


# Обучение на всех фичах

In [82]:
# Full feature set: every loaded column except the target. These assignments
# rebind the X / cat_features / y names used for the baseline model.
y = ['price']  # target
cat_features = ['model', 'transmission', 'fuelType']  # categorical columns
X = [
    'model', 'year', 'transmission', 'mileage',
    'fuelType', 'tax', 'mpg', 'engineSize',
]  # features

In [83]:
# Constructor arguments collected in one mapping so the model can be built
# with CatBoostRegressor(**parameters).
parameters = dict(
    cat_features=cat_features,
    eval_metric='MAPE',
    random_seed=42,
    verbose=100,
)

In [84]:
# Build a fresh regressor from `parameters`. This rebinds `model`, so the
# previously fitted baseline is no longer reachable under that name.
model = CatBoostRegressor(**parameters)

In [85]:
# Fit on the training split with the full feature list; the validation split
# is passed as eval_set so the metric is also reported on it during training.
model.fit(
    train[X],
    train[y],
    eval_set=(val[X], val[y]),
)

Learning rate set to 0.068263
0:	learn: 0.4607438	test: 0.4563750	best: 0.4563750 (0)	total: 3.11ms	remaining: 3.1s
100:	learn: 0.0904302	test: 0.0896388	best: 0.0896388 (100)	total: 309ms	remaining: 2.75s
200:	learn: 0.0781784	test: 0.0803125	best: 0.0803125 (200)	total: 646ms	remaining: 2.57s
300:	learn: 0.0729925	test: 0.0764506	best: 0.0764506 (300)	total: 964ms	remaining: 2.24s
400:	learn: 0.0695552	test: 0.0744676	best: 0.0744553 (399)	total: 1.25s	remaining: 1.87s
500:	learn: 0.0670683	test: 0.0732827	best: 0.0732827 (500)	total: 1.54s	remaining: 1.53s
600:	learn: 0.0652835	test: 0.0726170	best: 0.0726128 (597)	total: 1.82s	remaining: 1.21s
700:	learn: 0.0636327	test: 0.0720622	best: 0.0720500 (695)	total: 2.09s	remaining: 891ms
800:	learn: 0.0622957	test: 0.0715740	best: 0.0715703 (799)	total: 2.37s	remaining: 590ms
900:	learn: 0.0610900	test: 0.0711739	best: 0.0711658 (897)	total: 2.64s	remaining: 290ms
999:	learn: 0.0601515	test: 0.0707892	best: 0.0707892 (999)	total: 2.92s	r

<catboost.core.CatBoostRegressor at 0x1510b9fd0>

In [86]:
# Store the full-feature model's predictions in their own column so they sit
# beside the baseline's `price_pred` in `test`.
test['price_pred_all'] = model.predict(test[X])

In [87]:
# MAE and MAPE of the full-feature model on the same test split.
error(test['price'], test['price_pred_all'])

1557.1393226999464
0.07360851115958396
