In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Identifier of the store handled by this run; later cells interpolate it into
# the input feather path and into the filename of the saved model.
store_id = "TX_3"

In [None]:
# Load the selected store's feather file from the Kaggle input dataset.
feather_path = "/kaggle/input/m5-dataextracttransform-partb-tx/{}_v2.feather".format(store_id)
df = pd.read_feather(feather_path)

In [None]:
# Reduce memory overhead: shrink every integer column to the narrowest signed
# integer dtype that can represent all of its values.
#
# The previous rule ("max > 127 -> int16, else int8") wrapped values silently
# whenever a column held numbers above 32767 or below -128, and the
# `dtype == int` test only matched the platform's default integer width.
# `pd.to_numeric(..., downcast="integer")` checks the full value range and
# leaves a column untouched when no smaller dtype fits.
for i in df.columns:
    column = df[i]
    if pd.api.types.is_integer_dtype(column.dtype):
        df[i] = pd.to_numeric(column, downcast="integer")

In [None]:
# Print the frame summary (columns, dtypes, memory usage) after the dtype
# reduction performed in the previous cell.
df.info()

In [None]:
# Separate the feature columns from the 'label' target column.
X = df.drop(columns = 'label')
y = df['label']

In [None]:
# Rows whose 'd' value exceeds the cutoff form the test split; every other row
# is used for training.
# NOTE(review): 1913 is presumably the last available day and 100 the size of
# the hold-out window -- confirm against the data preparation notebook.
holdout_cutoff = 1913 - 100
test_index = df['d'] > holdout_cutoff
train_index = ~test_index

xTrain, yTrain = X[train_index], y[train_index]
xTest, yTest = X[test_index], y[test_index]


In [None]:
import gc
# The full frame and the unsplit features/target are no longer needed once the
# train/test pieces exist; drop the names and run the garbage collector.
del df
del X
del y
gc.collect()

In [None]:
import lightgbm as lgb

In [None]:
def train_weights(x):
    """Return normalised sample weights that grow with the fifth power of ``x``.

    Each value is divided by 5, raised to the fifth power, and the result is
    rescaled so that all weights add up to 1.

    Parameters
    ----------
    x : array-like supporting arithmetic (e.g. numpy array or pandas Series)
        Values to derive the weights from.

    Returns
    -------
    Same container type as ``x / 5`` produces, holding the normalised weights.
    """
    scaled = x / 5
    raw_weights = scaled ** 5
    total = np.sum(raw_weights)
    return raw_weights / total

# Column positions in xTrain that are passed to LightGBM as categorical features.
categorical_columns = [0, 12, 13, 14, 15, 16, 17, 18, 19]

# Sample weighting is currently switched off; it was previously supplied as
#   weight = train_weights(xTrain['d'])
lgb_train = lgb.Dataset(
    data = xTrain,
    label = yTrain,
    categorical_feature = categorical_columns,
)


In [None]:
# 1 for every column whose name contains "sell_price", 0 for all others.
# The params cell currently has its 'monotone_constraints' entry commented out.
is_price_column = xTrain.columns.str.contains("sell_price")
monotone_constraints = is_price_column.astype(int)

In [None]:
# LightGBM training parameters handed to lgb.train.
params = {
    # Booster type, objective and reported metric
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',

    # Row / column sampling
    'subsample': 0.5,
    'subsample_freq': 1,

    # Learning rate and tree size limits
    'learning_rate': 0.02,
    'num_leaves': 2 ** 11 - 1,          # 2047
    'min_data_in_leaf': 2 ** 12 - 1,    # 4095
    'feature_fraction': 0.7,
    'max_bin': 100,
    'n_estimators': 1500,

    # Disabled entries, kept for reference:
    #   'tweedie_variance_power': 1.1,
    #   'categorical_feature': [0, 12, 13, 14, 15, 16, 17, 18, 19],
    #   'monotone_constraints': monotone_constraints,
    #   'monotone_constraints_method': 'intermediate'
}

In [None]:
print('Starting training...')

# Train the booster and log the metric every 50 iterations.
# The only validation set is the training data itself, so the logged rmse is
# the training error, not a hold-out score.
#
# The `verbose_eval` argument was removed from lgb.train in LightGBM 4.0 and
# replaced by the `log_evaluation` callback. Pick whichever the installed
# version supports so the cell runs on both old and new releases.
if hasattr(lgb, 'log_evaluation'):
    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets = [lgb_train],
                    callbacks = [lgb.log_evaluation(period = 50)])
else:
    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets = [lgb_train],
                    verbose_eval = 50)

In [None]:
# Root-mean-squared error of the model: test split first, then training split.
for features, target in ((xTest, yTest), (xTrain, yTrain)):
    squared_error = (gbm.predict(features) - target) ** 2
    print(np.mean(squared_error) ** 0.5)

In [None]:
# Each feature's share of the summed importance values; show the 15 largest.
importance = gbm.feature_importance()
importance_share = importance / np.sum(importance)

FI = pd.DataFrame([xTrain.columns, importance_share]).T
FI.columns = ['Variable', 'Impt']
FI.sort_values(by = 'Impt', ascending = False).head(15)

In [None]:
import joblib
# Save the trained booster to a store-specific file in the working directory.
model_path = '{}_v9.pkl'.format(store_id)
joblib.dump(gbm, model_path)