In [1]:
# General imports
import itertools
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
import gc

from math import ceil

import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder

# Silence every warning issued through the `warnings` module for the rest of
# the session (this includes any deprecation notices libraries emit that way).
warnings.filterwarnings('ignore')

### Loading Data

In [2]:
# The feature grid is stored as three pickles that are joined side by side.
# Parts 2 and 3 skip their first two columns via .iloc[:, 2:]
# (presumably key columns already present in part 1 -- TODO confirm).
grid_part_paths = ['data/grid_part_1.pkl',
                   'data/grid_part_2.pkl',
                   'data/grid_part_3.pkl']

grid_df = pd.concat(
    [pd.read_pickle(pkl_path) if part_no == 0
     else pd.read_pickle(pkl_path).iloc[:, 2:]
     for part_no, pkl_path in enumerate(grid_part_paths)],
    axis=1,
)

In [3]:
# Print a summary of the combined frame (row count and per-column dtypes)
# as a quick sanity check on the load.
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46881677 entries, 0 to 46881676
Data columns (total 34 columns):
 #   Column            Dtype   
---  ------            -----   
 0   id                category
 1   item_id           category
 2   dept_id           category
 3   cat_id            category
 4   store_id          category
 5   state_id          category
 6   d                 int16   
 7   sales             float64 
 8   release           int16   
 9   sell_price        float16 
 10  price_max         float16 
 11  price_min         float16 
 12  price_std         float16 
 13  price_mean        float16 
 14  price_norm        float16 
 15  price_nunique     float16 
 16  item_nunique      int16   
 17  price_momentum    float16 
 18  price_momentum_m  float16 
 19  price_momentum_y  float16 
 20  event_name_1      category
 21  event_type_1      category
 22  event_name_2      category
 23  event_type_2      category
 24  snap_CA           category
 25  snap_TX         

### Preparing data

In [4]:
# Columns declared to LightGBM as categorical features.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]

# Columns kept out of the model inputs ("sales" becomes the label below).
useless_cols = ["id", "sales", "d"]
is_train_col = ~grid_df.columns.isin(useless_cols)
train_cols = grid_df.columns[is_train_col]

# Label vector and feature matrix.
y_train = grid_df["sales"]
X_train = grid_df[train_cols]

In [5]:
%%time

np.random.seed(777)

fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)

train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)

# This is a random sample, we're not gonna apply any time series train-test-split tricks here!
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], 
                              label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                              free_raw_data=False)

CPU times: user 25.6 s, sys: 4.77 s, total: 30.3 s
Wall time: 14 s


In [None]:
# Drop the notebook-level references to the full frame, the feature/label
# objects and the two index arrays, then force a garbage-collection pass
# (presumably to reduce memory use before training -- the Datasets built
# earlier were given their own row selections).
del grid_df, X_train, y_train, fake_valid_inds, train_inds
gc.collect()

In [7]:
# LightGBM training parameters handed to lgb.train().
params = {
    # Poisson regression objective.
    "objective": "poisson",
    # Evaluation metric. Listed exactly once: a key repeated inside a dict
    # literal is silently overridden by its last occurrence.
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
    # "sub_feature": 0.8,   # disabled; alias of feature_fraction
    # "sub_row" is LightGBM's alias of bagging_fraction: use 75% of the rows...
    "sub_row": 0.75,
    # ...and redraw that row sample at every iteration.
    "bagging_freq": 1,
    # L2 regularisation strength.
    "lambda_l2": 0.1,
    # "nthread": 4,         # disabled; alias of num_threads
    "verbosity": 1,
    "num_iterations": 1200,
    # 2**11 - 1 = 2047 leaves per tree.
    "num_leaves": 2**11 - 1,
    # 2**12 - 1 = 4095 rows required in every leaf.
    "min_data_in_leaf": 2**12 - 1,
}

In [None]:
%%time

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20)