# Прогнозирование продаж. Часть 3. Построение модели

In [1]:
%%capture
!pip install catboost

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype

In [4]:
df = pd.read_csv('/content/drive/MyDrive/dataset/dataset_result.csv')

In [5]:
df.head(15)

Unnamed: 0,customerid,country,total_amount,avg_check,top_product_1,top_product_2,top_product_3,top_product_4,top_product_5,top_product_6,top_product_7,2010_12,2011_1,2011_10,2011_11,2011_12,2011_2,2011_3,2011_4,2011_5,2011_6,2011_7,2011_8,2011_9
0,17850,United Kingdom,5391.21,158.57,22803,21730,71053,84029G,84029E,82486,22752,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13047,United Kingdom,3237.54,323.75,84969,22960,22720,48187,47566,21755,22722,3.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2,12583,France,7281.38,485.43,POST,22726,22727,22728,22326,22492,22556,1.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,2.0
3,13748,United Kingdom,948.25,189.65,22086,22084,22423,22950,22734,22839,23354,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0
4,15100,United Kingdom,876.0,292.0,21258,-,-,-,-,-,-,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,15291,United Kingdom,4668.3,311.22,82486,22212,21733,22112,48188,21754,22241,2.0,2.0,0.0,3.0,0.0,0.0,3.0,2.0,0.0,2.0,0.0,0.0,1.0
6,14688,United Kingdom,5630.87,268.14,22379,21770,22381,21934,22138,21935,79321,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0
7,17809,United Kingdom,5411.91,450.99,23284,22624,22943,23312,84347,21523,48138,3.0,0.0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
8,15311,United Kingdom,60767.9,667.78,85099B,22411,22386,21931,20712,85099C,21137,9.0,10.0,8.0,10.0,3.0,7.0,5.0,6.0,6.0,6.0,8.0,6.0,7.0
9,16098,United Kingdom,2005.63,286.52,22926,23094,22411,22782,22726,22381,22783,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0


In [6]:
y = df['total_amount']

In [7]:
X = df.drop(columns=['customerid','total_amount'],axis=1)

In [8]:
X_col_is_string= [_ for _ in X.columns if is_string_dtype(X[_])]

In [9]:
X_col_is_string

['country',
 'top_product_1',
 'top_product_2',
 'top_product_3',
 'top_product_4',
 'top_product_5',
 'top_product_6',
 'top_product_7']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,  random_state=42)

In [11]:
train_pool = Pool(x_train, 
                  y_train, 
                  cat_features=X_col_is_string)
test_pool = Pool(x_test, 
                 cat_features=X_col_is_string) 

In [12]:
model = CatBoostRegressor(verbose=100)

In [13]:
model.fit(train_pool)

Learning rate set to 0.047564
0:	learn: 10061.9343120	total: 56.7ms	remaining: 56.7s
100:	learn: 2892.5485521	total: 938ms	remaining: 8.35s
200:	learn: 1305.9739161	total: 1.78s	remaining: 7.09s
300:	learn: 760.1677911	total: 2.58s	remaining: 5.99s
400:	learn: 479.2561425	total: 3.43s	remaining: 5.13s
500:	learn: 333.0066175	total: 4.34s	remaining: 4.32s
600:	learn: 247.7674820	total: 5.26s	remaining: 3.49s
700:	learn: 198.3606728	total: 6.2s	remaining: 2.64s
800:	learn: 170.8018623	total: 7.08s	remaining: 1.76s
900:	learn: 145.5483265	total: 7.98s	remaining: 877ms
999:	learn: 128.0116497	total: 8.86s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fbcbcbc05d0>

In [14]:
preds = model.predict(test_pool)

In [15]:
print("MSE %.3f" % mean_squared_error(preds, y_test.values))
print("MAE %.3f" % mean_absolute_error(preds, y_test.values))

MSE 3051177.044
MAE 277.065
