# Grocery Sales Prediction

### Variable Description
* Item_Identifier - Unique product ID
* Item_Weight - Weight of product
* Item_Fat_Content - Whether the product is low fat or not
* Item_Visibility - The % of total display area of all products in a store allocated to the particular product
* Item_Type - The category to which the product belongs
* Item_MRP - Maximum Retail Price (list price) of the product
* Outlet_Identifier - Unique store ID
* Outlet_Establishment_Year - The year in which store was established
* Outlet_Size - The size of the store in terms of ground area covered
* Outlet_Location_Type - The type of city in which the store is located
* Outlet_Type - Whether the outlet is just a grocery store or some sort of supermarket
* Item_Outlet_Sales - Sales of the product in the particulat store. This is the outcome variable to be predicted.

### Evaluation Metric: RMSE

In [1]:
import featuretools as ft 
import numpy as np 
import pandas as pd

In [2]:
train = pd.read_csv("Train_Grocery.txt")
test = pd.read_csv("Test_Grocery.txt")

In [3]:
train.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
6,FDO10,13.65,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535


In [4]:
print(train["Item_Identifier"].unique().shape, test["Item_Identifier"].unique().shape)

(1559,) (1543,)


In [5]:
item_id = test['Item_Identifier']
store_id = test['Outlet_Identifier']
sales = train['Item_Outlet_Sales']

In [6]:
train.drop(["Item_Outlet_Sales"], axis=1, inplace=True)

In [7]:
data = pd.concat([train, test], axis=0, ignore_index=True)

In [8]:
data.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [9]:
data['Item_Weight'].fillna(data['Item_Weight'].median(), inplace = True)
data['Outlet_Size'].fillna("missing", inplace = True)

In [10]:
data["Item_Identifier"].value_counts()

NCM18    10
FDD05    10
FDR27    10
DRF01    10
FDV60    10
NCR53    10
FDJ20    10
FDL46    10
FDW47    10
NCH55    10
NCQ05    10
NCI30    10
DRD24    10
FDK15    10
FDZ44    10
FDT15    10
FDB32    10
NCZ42    10
FDI56    10
DRH03    10
FDZ23    10
NCH30    10
FDF59    10
NCK05    10
FDU52    10
FDS45    10
NCV29    10
DRE13    10
DRI47    10
FDS55    10
         ..
FDE32     8
FDS16     8
FDD23     8
FDS50     8
FDW22     8
FDW04     8
FDT26     8
FDB12     8
FDU25     8
NCI43     8
FDB02     8
NCV30     8
FDY43     8
FDA51     8
FDI08     8
NCY05     8
FDN44     8
DRN11     7
FDM50     7
FDH58     7
FDL50     7
FDO33     7
NCL42     7
NCW54     7
FDR51     7
FDM10     7
FDX49     7
FDS22     7
FDM52     7
FDI46     7
Name: Item_Identifier, Length: 1559, dtype: int64

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 11 columns):
Item_Identifier              14204 non-null object
Item_Weight                  14204 non-null float64
Item_Fat_Content             14204 non-null object
Item_Visibility              14204 non-null float64
Item_Type                    14204 non-null object
Item_MRP                     14204 non-null float64
Outlet_Identifier            14204 non-null object
Outlet_Establishment_Year    14204 non-null int64
Outlet_Size                  14204 non-null object
Outlet_Location_Type         14204 non-null object
Outlet_Type                  14204 non-null object
dtypes: float64(3), int64(1), object(7)
memory usage: 1.2+ MB


In [12]:
Item_fat_content_dict = {'Low Fat': 0, 'Regular': 1, 'LF': 0, 'reg': 1, 'low fat': 0}

data['Item_Fat_Content'] = data['Item_Fat_Content'].map(Item_fat_content_dict)

In [13]:
data["id"] = data["Item_Identifier"] + data["Outlet_Identifier"]
data.drop(["Item_Identifier"], axis=1, inplace=True)

# 2. Feature Tools
## 2.1. Creating Entity Set (создаем набор сущностей)
* EntitySet - это структура, которая содержит набор датафреймов и зависимостей между ними
* Наши данные содержат информацию на двух уровнях - уровень товара и уровень магазина (уровень товара - основной)
* Featuretools имеет функционал для разделения набора данных на несколько таблиц связанных таблиц
* Мы создали новую таблицу outlet из таблицы BigMart на основе id таблицы Outlet_Identifier

In [14]:
es = ft.EntitySet(id = 'sales') # creating and entity set 'es'
 
es.entity_from_dataframe(entity_id = 'bigmart', dataframe = data, index = 'id') # adding a dataframe

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships

In [15]:
es.normalize_entity(base_entity_id='bigmart', new_entity_id='outlet', index = 'Outlet_Identifier', 
additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier

In [16]:
print(es)

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier


## 2.2 Deep Feature Synthesis

In [17]:
feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'bigmart', 
max_depth = 2, 
verbose = 1, 
n_jobs = 3)

Built 37 features
EntitySet scattered to 3 workers in 1 seconds
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [18]:
feature_matrix.head()

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,...,outlet.MIN(bigmart.Item_Fat_Content),outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Fat_Content),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Type)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DRA12OUT010,11.6,0,0.068535,Soft Drinks,143.0154,OUT010,1998,missing,Tier 3,Grocery Store,...,0,0.0,32.6558,12.72287,0.356757,0.101939,141.159742,925,16,Fruits and Vegetables
DRA12OUT013,11.6,0,0.040912,Soft Drinks,142.3154,OUT013,1987,High,Tier 3,Supermarket Type1,...,0,0.0,31.49,12.788139,0.353509,0.060242,141.128428,1553,16,Fruits and Vegetables
DRA12OUT017,11.6,0,0.041178,Soft Drinks,140.3154,OUT017,2007,missing,Tier 2,Supermarket Type1,...,0,0.0,32.09,12.78208,0.35256,0.061142,140.998931,1543,16,Snack Foods
DRA12OUT018,11.6,0,0.041113,Soft Drinks,142.0154,OUT018,2009,Medium,Tier 3,Supermarket Type2,...,0,0.0,31.89,12.803638,0.353816,0.059976,141.000899,1546,16,Fruits and Vegetables
DRA12OUT027,12.6,0,0.040748,Soft Drinks,140.0154,OUT027,1985,Medium,Tier 3,Supermarket Type3,...,0,0.0,31.29,12.6,0.353432,0.060344,141.012347,1559,16,Fruits and Vegetables


In [19]:
feature_matrix = feature_matrix.reindex(index=data['id'])
feature_matrix = feature_matrix.reset_index()

In [20]:
data["id"].head()

0    FDA15OUT049
1    DRC01OUT018
2    FDN15OUT049
3    FDX07OUT010
4    NCD19OUT013
Name: id, dtype: object

In [21]:
feature_matrix.head()

Unnamed: 0,id,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,...,outlet.MIN(bigmart.Item_Fat_Content),outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Fat_Content),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Type)
0,FDA15OUT049,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,...,0,0.0,32.4558,12.803003,0.352903,0.059,141.163199,1550,16,Fruits and Vegetables
1,DRC01OUT018,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,...,0,0.0,31.89,12.803638,0.353816,0.059976,141.000899,1546,16,Fruits and Vegetables
2,FDN15OUT049,17.5,0,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,...,0,0.0,32.4558,12.803003,0.352903,0.059,141.163199,1550,16,Fruits and Vegetables
3,FDX07OUT010,19.2,1,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,...,0,0.0,32.6558,12.72287,0.356757,0.101939,141.159742,925,16,Fruits and Vegetables
4,NCD19OUT013,8.93,0,0.0,Household,53.8614,OUT013,1987,High,Tier 3,...,0,0.0,31.49,12.788139,0.353509,0.060242,141.128428,1553,16,Fruits and Vegetables


## 3. CatBoost Model

In [22]:
from catboost import CatBoostRegressor

In [23]:
categorical_features = np.where(feature_matrix.dtypes == 'object')[0]

for i in categorical_features:
    feature_matrix.iloc[:,i] = feature_matrix.iloc[:,i].astype('str')

In [24]:
feature_matrix.drop(['id'], axis=1, inplace=True)
train = feature_matrix[:8523]
test = feature_matrix[8523:]

In [25]:
# removing uneccesary variables
train.drop(['Outlet_Identifier'], axis=1, inplace=True)
test.drop(['Outlet_Identifier'], axis=1, inplace=True)

In [26]:
# identifying categorical features
categorical_features = np.where(train.dtypes == 'object')[0]

In [27]:
categorical_features

array([ 3,  6,  7,  8, 35])

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train, sales, test_size=0.25, random_state=42)

In [29]:
model_cat = CatBoostRegressor(iterations=100, learning_rate=0.3, depth=6, eval_metric='RMSE', random_seed=42)

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/distributed/utils.py", line 713, in log_errors
    yield
  File "/opt/conda/lib/python3.7/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 588, in with_timeout
    chain_future(future_converted, result)
  File "/opt/conda/lib/python3.7/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy)
  File "/opt/conda/lib/python3.7/site-packages/tornado/concurrent.py", line 262, in future_add_done_callback
    callback(future)
  File "/opt/conda/lib/python3.7/site-packages/tornado/concurrent.py", line 160, in copy
    elif a.exception() is not None:
concurrent.futures._base.CancelledError
distributed.utils - ERROR - 
Traceback 

In [30]:
model_cat.fit(X_train, y_train, cat_features=categorical_features, use_best_model=True)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 2149.5890687	total: 99.8ms	remaining: 9.88s
1:	learn: 1729.8585871	total: 148ms	remaining: 7.24s
2:	learn: 1461.5789786	total: 188ms	remaining: 6.08s
3:	learn: 1294.8689567	total: 229ms	remaining: 5.49s
4:	learn: 1198.8568160	total: 280ms	remaining: 5.31s
5:	learn: 1146.5921927	total: 339ms	remaining: 5.31s
6:	learn: 1118.8689859	total: 400ms	remaining: 5.31s
7:	learn: 1102.4288564	total: 450ms	remaining: 5.18s
8:	learn: 1092.4493587	total: 505ms	remaining: 5.11s
9:	learn: 1085.5522509	total: 562ms	remaining: 5.06s
10:	learn: 1081.6364207	total: 620ms	remaining: 5.01s
11:	learn: 1079.3234010	total: 686ms	remaining: 5.03s
12:	learn: 1077.0471587	total: 732ms	remaining: 4.9s
13:	learn: 1076.3564973	total: 757ms	remaining: 4.65s
14:	learn: 1075.6832880	total: 798ms	remaining: 4.52s
15:	learn: 1074.7067306	total: 832ms	remaining: 4.37s
16:	learn: 1073.9721471	total: 868ms	remaining: 4.24s
17:	learn: 1072.7053397	total: 915ms	remaining: 4.17s
18:	learn: 1071.5775344	total: 934ms	r

<catboost.core.CatBoostRegressor at 0x7ff84959bcf8>

In [31]:
y_pred = model_cat.predict(X_valid)

In [32]:
rmse = np.sqrt(np.mean((y_pred - y_valid)**2))

In [33]:
print(rmse)

1069.199923162786


In [34]:
y_test = pd.DataFrame(model_cat.predict(test), columns=["submit"])

In [35]:
y_test.to_csv("submit.csv")

In [36]:
y_test

Unnamed: 0,submit
0,1522.307722
1,1408.340544
2,613.173722
3,2313.985723
4,6534.341154
5,1841.839254
6,660.348037
7,2333.031769
8,1507.630690
9,3085.014537


In [None]:
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [None]:
clf = CatBoostRegressor(iterations=2000, learning_rate=0.01, eval_metric = 'RMSE')

eval_set = [(X_train, y_train), (X_valid, y_valid)]

clf.fit(X_train, y_train, eval_set=eval_set, cat_features=categorical_features, verbose=0)

y_pred = clf.predict(X_valid)
predictions = [round(value) for value in y_pred]

results = clf.evals_result_
epochs = len(results['validation_0']['RMSE'])
x_axis = range(0, epochs)

# plot rmse
fig = plt.figure(figsize=(7,5))
plt.plot(x_axis, results['validation_0']['RMSE'], label='train')
plt.plot(x_axis, results['validation_1']['RMSE'], label='validation')
plt.legend()

plt.ylabel('RMSE')
plt.xlabel('Iterations')
plt.title('CatBoost Metrics')
plt.show()

In [54]:
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from itertools import product, chain
from tqdm import tqdm

In [None]:
RANDOM_STATE = 2019

In [58]:
def metric_score(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

In [None]:
def cross_val_score(X, y, param, cat_features, n_splits=3):
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    
    score = []
    predict = None
    
    
    for tr_ind, val_ind in kf.split(X, y):
        X_train = X.iloc[tr_ind]
        y_train = y.iloc[tr_ind]
        
        X_valid = X.iloc[val_ind]
        y_valid = y.iloc[val_ind]
        
        clf = CatBoostRegressor(iterations=200,
                                learning_rate=0.3,
                                loss_function = param['loss_function'],
                                depth=param['depth'],
                                l2_leaf_reg = param['l2_leaf_reg'],
                                eval_metric = 'RMSE',
                                leaf_estimation_iterations = 10,
                                use_best_model=True,
                                logging_level='Silent'
        )
        
        clf.fit(X_train, 
                y_train,
                cat_features=cat_features,
                eval_set=(X_valid, y_valid),
                early_stopping = 10
        )
        
        y_pred = clf.predict(X_valid)
        
        metrics = metric_score(y_valid, y_pred)
        score.append(metrics)
        
    return sum(score)/n_splits

In [None]:
def CBGridSearchCV(X, y, params, cat_features, n_splits=5):
     
    ps = {'score': 0,
          'param': []
    }
    
    predict=None
    
    for prms in tqdm(list(ParameterGrid(params)), ascii=True, desc='Params Tuning:'):
                          
        score = cross_val_score(X, y, prms, cat_features, n_splits=5)

        if score>ps['score']:
            ps['score'] = score
            ps['param'] = prms
            
    print('Score: '+str(ps['score']))
    print('Params: '+str(ps['param']))
    
    return ps['param']

In [None]:
params = {
    'depth':[2, 3, 4, 6],
    'loss_function': ['RMSE'],
     'l2_leaf_reg':np.logspace(-20, -19, 3),
    
    }
    
param = CBGridSearchCV(X_train, y_train, params, categorical_features)

In [None]:
!pip install hyperopt

In [49]:
from hyperopt import hp, fmin, tpe, rand, STATUS_OK, Trials
import catboost as ct

In [None]:
RANDOM_STATE = 2019

In [50]:
# Hyperopt 
def get_params_ct(space):
    params = dict()
    params['use_best_model'] = True
    params['eval_metric']='RMSE'
    params['od_type']='Iter' #IncToDec
    params['od_pval']=10#1e-4
    params['fold_len_multiplier']=1.01
    params['border_count']=int(space['border_count'])
    params['leaf_estimation_method']='Newton'
    params['loss_function'] = 'RMSE'
    params['iterations'] = 3000
    params['learning_rate'] = np.round(space['learning_rate'],4)
    params['depth'] = int(space['depth'])
    params['l2_leaf_reg'] = space['l2_leaf_reg']
    params['bagging_temperature']=space['bagging_temperature']
    params['rsm'] = space['rsm']
    params['random_seed']=0
    params['verbose']=True
    return params

In [94]:
def objective(space, X=X_train, y=y_train, n_splits=5, cat_features=categorical_features, random_state=RANDON_STATE):

    params = get_params_ct(space)
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    score = []
    num_trees = []
    predict = None
    
    for tr_ind, val_ind in kf.split(X, y):
        X_train = X.iloc[tr_ind]
        y_train = y.iloc[tr_ind]
        
        X_valid = X.iloc[val_ind]
        y_valid = y.iloc[val_ind]
        
        cbr = ct.CatBoost(params)
        
        cbr.fit(X_train, 
                y_train,
                cat_features=cat_features,
                eval_set=(X_valid, y_valid),
                early_stopping_rounds=10,
                verbose=0
        )
        
        y_pred = cbr.predict(X_valid)
        nb_trees = cbr.tree_count_
        
        metrics = metric_score(y_valid, y_pred)
        score.append(metrics)
        num_trees.append(nb_trees)
        
    metrics_mean = np.mean(score)
    metrics_std = np.std(score)
    
    params["iterations"] = num_trees
    
    print(score)
    print(metrics_mean, metrics_std)
    print(params)
    print("###")
        
    return{'loss': metrics_mean, 'loss_variance': metrics_std, 'status': STATUS_OK,'attachments': params}

In [95]:
space = {
        'iterations': hp.quniform("iterations", 20, 200, 10), # not optimized
        'border_count':hp.quniform("border_count", 120, 200, 10),
        'depth': hp.quniform("depth", 3, 6, 1), #round(uniform(low, high) / q) * q
        'rsm': hp.uniform ('rsm', 0.75, 1.0), # returns a value uniformly between low and high
        'learning_rate': hp.loguniform('learning_rate', -4, -1), # [exp(low), exp(high)]
        'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0.3, 3),
        'bagging_temperature': hp.uniform('bagging_temperature', 0, 1)
       }

In [96]:
N_HYPEROPT_PROBES = 100
HYPEROPT_ALGO = rand.suggest # tpe.suggest

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=HYPEROPT_ALGO,
            max_evals=N_HYPEROPT_PROBES,
            trials=trials,
            verbose=2)

[1089.6644826740637, 1068.042497360832, 1075.3191985757082, 1109.3603922230034, 1095.709352664216]
1087.6191846995648                                 
14.6754932583784                                   
{'use_best_model': True, 'eval_metric': 'RMSE', 'od_type': 'Iter', 'od_pval': 10, 'fold_len_multiplier': 1.01, 'border_count': 160, 'leaf_estimation_method': 'Newton', 'loss_function': 'RMSE', 'iterations': [121, 78, 50, 65, 56], 'learning_rate': 0.1084, 'depth': 6, 'l2_leaf_reg': 0.6294168454807226, 'bagging_temperature': 0.32920913763824444, 'rsm': 0.8940947875975525, 'random_seed': 0, 'verbose': True}
[1089.6133040087723, 1063.085536019896, 1076.6505331451467, 1106.940185667924, 1094.2246578425645]
1086.1028433368606                                                          
15.04409947122127                                                           
{'use_best_model': True, 'eval_metric': 'RMSE', 'od_type': 'Iter', 'od_pval': 10, 'fold_len_multiplier': 1.01, 'border_count': 140, 'lea

In [97]:
best

{'bagging_temperature': 0.7774313074175617,
 'border_count': 180.0,
 'depth': 4.0,
 'iterations': 100.0,
 'l2_leaf_reg': 1.9811843513921206,
 'learning_rate': 0.06661726603687619,
 'rsm': 0.7654599449186381}