In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import math
import os
import sys
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from ml_metrics import rmsle
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor

**Считывание и предварительный анализ данных**

Semana — День недели (с четверга по среду)

Agencia_ID — идентификатор торгового центра

Canal_ID — идентификатор канала продаж

Ruta_SAK — идентификатор маршрута (несколько маршрутов = торговый склад)

Cliente_ID — идентификатор клиента

NombreCliente — имя клиента

Producto_ID — идентификатор продукта

NombreProducto — имя продукта

Venta_uni_hoy — единица продаж на этой неделе (integer)

Venta_hoy — продажи на этой неделе (единица: песо)

Dev_uni_proxima — возвращает единицу на следующей неделе (integer)

Dev_proxima — возврат на следующей неделе (unit: pesos)

Demanda_uni_equil — скорректированный спрос (integer) (target)

In [None]:
dtype_test = {'Semana': np.uint8, 
         'Agencia_ID': np.uint16, 
         'Canal_ID': np.uint8,
         'Ruta_SAK': np.uint16, 
         'Cliente_ID': np.uint32, 
         'Producto_ID': np.uint16}
df_test=pd.read_csv('/kaggle/input/grupo-bimbo-inventory-demand/test.csv.zip', usecols=dtype_test.keys(), dtype=dtype_test)
df_test.head()

In [3]:
dtype_train = {'Semana': np.uint8, 
         'Agencia_ID': np.uint16, 
         'Canal_ID': np.uint8,
         'Ruta_SAK': np.uint16, 
         'Cliente_ID': np.uint32, 
         'Producto_ID': np.uint16,
        'Demanda_uni_equil': np.uint16}
%time df_train = pd.read_csv('/kaggle/input/grupo-bimbo-inventory-demand/train.csv.zip', usecols=dtype_train.keys(), dtype=dtype_train)
df_train = df_train.rename(columns = {'Semana':'Week_nbr', 'Agencia_ID':'Mall_ID','Canal_ID':'Sales_chanal_ID','Ruta_SAK':'Rout_ID','Cliente_ID':'Client_ID','Producto_ID':'Product_ID','Demanda_uni_equil':'Adjusted_Demand'},inplace = False)
df_train.head()

Далее будем рассматривать подвыборку

In [4]:
df=df_train.sample(frac=0.0013).reset_index(drop=True)
df.shape

In [5]:
del[df_train]

In [None]:
df.nunique()

**Построение моделей**

Посмотрим как будет изменяться метрика при различных способах построения моделей

*Random forest*

In [None]:
X=df.drop('Adjusted_Demand', axis=1)
y=df.Adjusted_Demand

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
parameters = {
    'n_estimators':[400,500],
    'max_depth':[10,20,30],
    'min_samples_split':[2,3,4]
}

reg=RandomForestRegressor(n_jobs=-1)
clf = GridSearchCV(reg, parameters,scoring=make_scorer(rmsle), cv=3, verbose=1)

clf.fit(X_train, y_train)

In [None]:
print("Best parameters: ",clf.best_params_)
reg_best=RandomForestRegressor(n_estimators=clf.best_params_['n_estimators'], max_depth=clf.best_params_['max_depth'],min_samples_split=clf.best_params_['min_samples_split'])
reg_best.fit(X_train, y_train)
print("Best score: ", rmsle(reg_best.predict(X_test),y_test))

In [None]:
importances = reg_best.feature_importances_
includices = np.argsort(importances)[::-1]
ar_f=[]
for f, idx in enumerate(includices):
    ar_f.append([round(importances[idx],4), np.array(df.columns)[idx]])
ar_f

Попробуем улучшить этот показатель. Для начала разобьем колонки с Week_nbr и Sales_chanal_ID

In [6]:
df = pd.get_dummies(df, columns = ['Week_nbr', 'Sales_chanal_ID'])
df.head()

In [None]:
X=df.drop('Adjusted_Demand', axis=1)
y=df.Adjusted_Demand

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

In [None]:
reg_best2=RandomForestRegressor(n_estimators=400, max_depth=30,min_samples_split=2)
reg_best2.fit(X_train, y_train)

In [None]:
importances = reg_best2.feature_importances_
includices = np.argsort(importances)[::-1]
ar_f=[]
for f, idx in enumerate(includices):
    ar_f.append([round(importances[idx],4), np.array(df.columns)[idx]])
ar_f

In [None]:
rmsle(reg_best2.predict(X_test),y_test)#previous=0,71805

Добавим данные из смежных таблиц (рассмотрим таблицу с городами и штатами)

In [7]:
df_town = pd.read_csv('../input/grupo-bimbo-inventory-demand/town_state.csv.zip')
df_town = df_town.rename(columns={'Agencia_ID':'Mall_ID'})
df_town.head()

In [8]:
df = df.merge(df_town, how='inner', on='Mall_ID')

Оставим наиболее частые штаты

In [9]:
cnt = df.State.value_counts()
big_category = set(cnt[cnt >= 2100].index)

df['State'] = df.State.apply(lambda w: w if w in big_category else 'another')
fig = plt.figure(figsize=(8,5))
df.State.value_counts().plot(kind='barh')
plt.title("New Hist of State")
plt.show()

Далее будем рассматривать модель, которая не учитывает города (в целом, количество городов соответствует количеству тц, тогда колонкка Mall_ID отвечает за них)

In [10]:
df_without_town = df.copy()

In [11]:
del[df]

In [12]:
df_without_town = df_without_town.drop('Town', axis=1)

In [13]:
df_without_town = pd.get_dummies(df_without_town, columns = ['State'])
df_without_town.head()

In [None]:
X=df_without_town.drop('Adjusted_Demand', axis=1)
y=df_without_town.Adjusted_Demand

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

In [None]:
reg_best3=RandomForestRegressor(n_estimators=400, max_depth=30,min_samples_split=2)
reg_best3.fit(X_train, y_train)

In [None]:
importances = reg_best3.feature_importances_
includices = np.argsort(importances)[::-1]
ar_f=[]
for f, idx in enumerate(includices):
    ar_f.append([round(importances[idx],4), np.array(df_without_town.columns)[idx]])
ar_f

In [None]:
rmsle(reg_best3.predict(X_test),y_test)#previous=0.71580

Рассмотрим график предсказанных и реальных значений

In [None]:
fig = plt.figure(figsize=(10,7))
x_ax = range(len(y_test))
plt.plot(x_ax, y_test, label="original")
plt.plot(x_ax, reg_best3.predict(X_test), label="predicted")
plt.title('RandomForestRegressor(test)')
plt.legend()
plt.show()

Посмотрим распределение среднего значения целевой переменной по колонкам

In [None]:
fig, ax = plt.subplots(nrows=1, sharey=False,ncols=4, figsize=(20, 3))
arr_param = ['Mall_ID', 'Rout_ID', 'Product_ID', 'Client_ID']
for i in range(len(arr_param)):
    ax[i].hist(df_without_town.groupby(arr_param[i]).Adjusted_Demand.mean())
    ax[i].set_yscale('log')
    ax[i].set_ylabel(arr_param[i])
    
plt.show()

In [14]:
df_mall = df_without_town.groupby('Mall_ID').Adjusted_Demand.mean()
max_mall = set(df_mall[df_mall>250].index)
df_without_town['Mall_ID_max'] =df_without_town.Mall_ID.apply(lambda w: 1 if w in max_mall else 0)

df_rout = df_without_town.groupby('Rout_ID').Adjusted_Demand.mean()
max_rout = set(df_rout[df_rout>340].index)
df_without_town['Rout_ID_max'] =df_without_town.Rout_ID.apply(lambda w: 1 if w in max_rout else 0)

df_product = df_without_town.groupby('Product_ID').Adjusted_Demand.mean()
max_product = set(df_product[df_product>340].index)
df_without_town['Product_ID_max'] =df_without_town.Product_ID.apply(lambda w: 1 if w in max_product else 0)

df_client = df_without_town.groupby('Client_ID').Adjusted_Demand.mean()
max_client = set(df_client[df_client>420].index)
df_without_town['Client_ID_max'] =df_without_town.Client_ID.apply(lambda w: 1 if w in max_client else 0)

In [15]:
X=df_without_town.drop('Adjusted_Demand', axis=1)
y=df_without_town.Adjusted_Demand

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

In [16]:
reg_best4=RandomForestRegressor(n_estimators=400, max_depth=30,min_samples_split=2, random_state=45)
reg_best4.fit(X_train, y_train)

In [17]:
rmsle(reg_best4.predict(X_test),y_test)#previous=0.71082

In [None]:
fig = plt.figure(figsize=(10,7))
x_ax = range(len(y_test))
plt.plot(x_ax, y_test, label="original")
plt.plot(x_ax, reg_best4.predict(X_test), label="predicted")
plt.title('RandomForestRegressor(test)')
plt.legend()
plt.show()

Рассмотрим XGB

In [None]:
parameters = {
    'n_estimators':[400, 500],
    'max_depth':[5,10,20],
    'learning_rate': [0.01,0.05,0.1],
    'colsample_bytree': [0.3,0.5,0.6], 
    'subsample':[0.5,0.7,1.] 
    }

xgbr = XGBRegressor(objective='reg:squarederror')#, tree_method='gpu_hist', gpu_id=0) 
xgbr_grid = GridSearchCV(xgbr, parameters, scoring=make_scorer(rmsle), n_jobs=-1, cv=3, verbose=2)

#param_grid_XGB={'n_estimators':[50,100], 'max_depth':[4,8], 'learning_rate':[0.1,0.2],'colsample_bytree':[0.6,0.8]}
#gcv_XGB = GridSearchCV(XGBRegressor(), param_grid_XGB, scoring=make_scorer(rmsle), n_jobs=-1, cv=3)

xgbr_grid.fit(X_train, y_train)

In [None]:
print("Best params: ",xgbr_grid.best_params_)

In [None]:
rmsle(xgbr_grid.predict(X_test),y_test)

In [None]:
xgbr_best=XGBRegressor(n_estimators=xgbr_grid.best_params_['n_estimators'], 
                  max_depth=xgbr_grid.best_params_['max_depth'],
                  colsample_bytree=xgbr_grid.best_params_['colsample_bytree'],
                  learning_rate=xgbr_grid.best_params_['learning_rate'],
                  subsample=xgbr_grid.best_params_['subsample'])
xgbr_best.fit(X_train, y_train)

In [None]:
rmsle(xgbr_best.predict(X_test),y_test)#previous for RF=0.70148

In [None]:
df_train = df_train.query("Demanda_uni_equil != 0")

In [None]:
len(df_train)

In [None]:
y = df_train['Demanda_uni_equil']
X = df_train.drop(['Demanda_uni_equil'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
del[X,y,df_train]

**RandomForest**

In [None]:
param_grid_RF={'n_estimators':np.arange(5,155,50), 
               'max_depth':np.arange(5,105,25),
              'min_samples_leaf':[2,4]}
rfr = RandomForestRegressor()
gcv= GridSearchCV(rfr, param_grid_RF, scoring=make_scorer(rmsle), n_jobs=-1, cv=3)

In [None]:
%%time
gcv.fit(X_train, y_train)

Для 1000000

CPU times: user 5.72 s, sys: 401 ms, total: 6.12 s

Wall time: 49min 22s

Для 300000

CPU times: user 1.89 s, sys: 265 ms, total: 2.15 s

Wall time: 11min 34s

In [None]:
gcv.best_estimator_

Для 1000000

RandomForestRegressor(max_depth=5, min_samples_leaf=4, n_estimators=5)

Для 300000

RandomForestRegressor(max_depth=5, min_samples_leaf=4, n_estimators=5)


In [None]:
gcv.best_score_

Для 1000000

0.6898197009136524

Для 300000

0.7538247217151497

In [None]:
rmsle(gcv.predict(X_test),y_test)

Для 1000000

0.6873329228533199

Для 300000

0.7492675299680622

**XGB**

In [None]:
param_grid_XGB={'n_estimators':[50,100], 'max_depth':[4,8], 'learning_rate':[0.1,0.2],'colsample_bytree':[0.6,0.8]}
gcv_XGB = GridSearchCV(XGBRegressor(), param_grid_XGB, scoring=make_scorer(rmsle), n_jobs=-1, cv=3)

In [None]:
%%time
gcv_XGB.fit(X_train, y_train)

In [None]:
gcv_XGB.best_estimator_

Для 300000

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=50, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
gcv_XGB.best_score_

In [None]:
rmsle(gcv_XGB.predict(X_test),y_test)

Для 300000

0.7237856