In [30]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [31]:
train_data = pd.read_csv('sales_train_complete_1.csv')
test_data = pd.read_csv('../competitive-data-science-predict-future-sales/test.csv')

In [32]:
obj_dtypes = ['date', 'item_name', 'item_category_name', 'shop_name', 'city']
for col in obj_dtypes:
    train_data[col] = train_data[col].astype('category')
train_data.dtypes


date                        category
date__month                    int64
shop_id                        int64
item_id                        int64
item_price                   float64
item_cnt_day                 float64
item_category_id               int64
item_name                   category
item_category_name          category
shop_name                   category
date__day                      int64
date__week                     int64
date__day_of_month             int64
date__day_of_week              int64
date__week_of_year             int64
date__month_of_year            int64
date__year                     int64
is_weekend                      bool
is_holiday                      bool
item_cnt_month               float64
item_cnt_week                float64
item_cnt_year                float64
dominant_category_vector      object
city                        category
lag_0                        float64
lag_1                        float64
lag_2                        float64
l

In [33]:
test_data['item_cnt_month'] = [0 for i in range(len(test_data.values))]

In [34]:
train_data = train_data.drop(['dominant_category_vector', 'shop_opening_date'], axis = 1)

In [35]:
X_train = train_data.drop(columns=['item_cnt_month'])
y_train = train_data['item_cnt_month']

In [36]:
missing_columns = set(X_train.columns) - set(test_data.columns)
print("Признаки, которых не хватает в тесте:", missing_columns)

Признаки, которых не хватает в тесте: {'lag_8', 'date__day', 'lag_7', 'lag_5', 'lag_3', 'lag_10', 'date__month', 'date__day_of_month', 'item_category_name', 'date__month_of_year', 'item_name', 'date__week', 'lag_4', 'avg_monthly_sales', 'item_cnt_year', 'is_weekend', 'lag_13', 'lag_15', 'lag_11', 'shop_name', 'item_cnt_day', 'store_age', 'city', 'lag_2', 'lag_6', 'item_category_id', 'lag_9', 'is_holiday', 'lag_0', 'item_cnt_week', 'date__week_of_year', 'date', 'date__year', 'date__day_of_week', 'item_price', 'lag_14', 'lag_12', 'lag_1'}


In [37]:
X_test = test_data.copy()

In [38]:
train_data.dtypes

date                   category
date__month               int64
shop_id                   int64
item_id                   int64
item_price              float64
item_cnt_day            float64
item_category_id          int64
item_name              category
item_category_name     category
shop_name              category
date__day                 int64
date__week                int64
date__day_of_month        int64
date__day_of_week         int64
date__week_of_year        int64
date__month_of_year       int64
date__year                int64
is_weekend                 bool
is_holiday                 bool
item_cnt_month          float64
item_cnt_week           float64
item_cnt_year           float64
city                   category
lag_0                   float64
lag_1                   float64
lag_2                   float64
lag_3                   float64
lag_4                   float64
lag_5                   float64
lag_6                   float64
lag_7                   float64
lag_8   

In [39]:
train_data['avg_monthly_sales'] = train_data['avg_monthly_sales'].replace([np.inf, -np.inf], np.nan)
train_data.describe()


Unnamed: 0,date__month,shop_id,item_id,item_price,item_cnt_day,item_category_id,date__day,date__week,date__day_of_month,date__day_of_week,...,lag_8,lag_9,lag_10,lag_11,lag_12,lag_13,lag_14,lag_15,store_age,avg_monthly_sales
count,1066138.0,1066138.0,1066138.0,1066138.0,1066138.0,1066138.0,1066138.0,1066138.0,1066138.0,1066138.0,...,592846.0,561594.0,533295.0,507618.0,484194.0,462767.0,442996.0,424696.0,1066138.0,1065228.0
mean,19.37746,30.37581,9373.19,919.3812,1.138909,24.21833,603.7005,85.79101,16.09792,3.330432,...,2.634539,2.693033,2.749514,2.803707,2.855895,2.905663,2.95374,2.999635,1.594947,5.965159
std,9.16033,15.55579,6153.019,894.3454,0.4788792,13.26788,278.788,39.82778,8.913809,2.004361,...,2.633659,2.67691,2.718178,2.757564,2.794874,2.829872,2.863413,2.895435,0.7736602,64.78286
min,0.0,2.0,30.0,25.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.02944498
25%,12.0,19.0,4052.0,299.0,1.0,15.0,383.0,54.0,8.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9863014,0.2892889
50%,21.0,31.0,7790.0,599.0,1.0,24.0,653.0,93.0,16.0,4.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.69589,0.8155889
75%,27.0,44.0,15031.0,1199.0,1.0,34.0,830.0,118.0,24.0,5.0,...,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,2.224658,2.534722
max,33.0,59.0,22167.0,5990.0,5.0,55.0,1033.0,147.0,31.0,6.0,...,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,2.830137,14143.75


In [40]:
train_data['avg_monthly_sales'].min()

0.0294449822523394

In [41]:
# train_data = train_data.drop(['inf'], axis = 1)

In [42]:
for col in list(train_data.columns):
    print(col)
    train_data[f'{col}'] = train_data[f'{col}'].replace([np.inf, -np.inf], np.nan)


date
date__month
shop_id
item_id
item_price
item_cnt_day
item_category_id
item_name
item_category_name
shop_name
date__day
date__week
date__day_of_month
date__day_of_week
date__week_of_year
date__month_of_year
date__year
is_weekend
is_holiday
item_cnt_month
item_cnt_week
item_cnt_year
city
lag_0
lag_1
lag_2
lag_3
lag_4
lag_5
lag_6
lag_7
lag_8
lag_9
lag_10
lag_11
lag_12
lag_13
lag_14
lag_15
store_age
avg_monthly_sales


In [43]:
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=10,
    enable_categorical = True,
)


In [44]:

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


In [45]:
model.fit(
    X_train_split, y_train_split,
    eval_set=[(X_val_split, y_val_split)],
    verbose=True
)


XGBoostError: [15:01:01] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\common\../data/gradient_index.h:94: Check failed: valid: Input data contains `inf` or a value too large, while `missing` is not set to `inf`

In [None]:

y_pred = model.predict(X_val_split)
rmse = mean_squared_error(y_val_split, y_pred, squared=False)
print("RMSE:", rmse)


RMSE: 2.7083184959336823




In [44]:
# test_data['item_cnt_month'] = model.predict(X_test)
