In [2]:
import os
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List, Optional
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [10]:
# Show the current working directory; the relative '../ml/...' paths used below resolve against it.
os.getcwd()

'/Users/dliang19/Projects/repos/sales_forecast/notebooks'

#### Load data for model training

In [20]:
# Directory holding the CSV inputs, relative to the notebook's working directory
data_dir = '../ml/data'

# Read train.csv and test.csv (in that order) from data_dir into DataFrames
train_df, test_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

#### Split data into train and test sets

In [50]:
# I cleaned up the original function used for creating time features and splitting data 
def split_data(train_data):
    """Add calendar features to ``train_data`` and split it into two subsets.

    The ``date`` column is converted to datetime and the ``month``, ``day``
    (day of week) and ``year`` columns are written onto ``train_data`` itself,
    so the caller's frame is modified.

    Parameters
    ----------
    train_data : DataFrame with ``date``, ``store``, ``item`` and ``sales`` columns.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y, col)`` where ``col`` is the list of
        feature column names and the target is the ``sales`` column.
    """
    feature_cols = ['store', 'item', 'month', 'day', 'year']
    target_col = 'sales'

    # Parse the date strings, then derive the calendar features from the parsed column.
    # Only the training frame is handled here; no test frame is passed in.
    train_data['date'] = pd.to_datetime(train_data['date'])
    date_parts = train_data['date'].dt
    train_data['month'] = date_parts.month
    train_data['day'] = date_parts.dayofweek
    train_data['year'] = date_parts.year

    # NOTE(review): this is a shuffled 80/20 split with a fixed seed; for a time
    # series a date-based holdout may be more appropriate -- open question from
    # the original author.
    splits = train_test_split(
        train_data[feature_cols],
        train_data[target_col],
        test_size=0.2,
        random_state=2018,
    )
    train_x, test_x, train_y, test_y = splits
    return (train_x, test_x, train_y, test_y, feature_cols)

# Add the calendar features to train_df and split it; `col` is the list of feature column names
train_x, test_x, train_y, test_y, col = split_data(train_df)

#### Model Training

In [51]:
def _with_time_features(frame, col):
    """Return ``frame`` with any calendar features listed in ``col`` but absent from it derived from ``date``."""
    missing = [name for name in col if name not in frame.columns]
    if not missing or 'date' not in frame.columns:
        # Nothing to derive (or nothing to derive it from); selecting the
        # columns later raises KeyError if something is still absent.
        return frame
    date_parts = pd.to_datetime(frame['date']).dt
    # Same definitions split_data uses for the training frame.
    derived = {
        'month': date_parts.month,
        'day': date_parts.dayofweek,
        'year': date_parts.year,
    }
    enriched = frame.copy()  # leave the caller's frame untouched
    for name in missing:
        if name in derived:
            enriched[name] = derived[name]
    return enriched


def model(train_x, train_y, test_x, test_y, col, predict_data=None):
    """Train a LightGBM regressor and predict on a hold-out frame.

    Parameters
    ----------
    train_x, train_y : features and target used for fitting.
    test_x, test_y : features and target used as the validation set for
        early stopping.
    col : list of feature column names selected from the prediction frame.
    predict_data : optional DataFrame to predict on. Defaults to the
        notebook-level ``test_df``. If it lacks the ``month``/``day``/``year``
        features but has a ``date`` column, they are derived on a copy, so the
        function also works on a freshly loaded ``test_df``.

    Returns
    -------
    (y_test, booster) : the predictions for the prediction frame and the
        trained ``lightgbm.Booster``.

    Raises
    ------
    KeyError : if the prediction frame is still missing a column named in ``col``.
    """
    params = {
        'nthread': 10,
        'max_depth': 9,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'metric': 'mape', # this is abs(a-e)/max(1,a)
        'num_leaves': 64,
        'learning_rate': 0.2,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'lambda_l1': 3.097758978478437,
        'lambda_l2': 2.9482537987198496,
        'verbose': 1,
        'min_child_weight': 6.996211413900573,
        'min_split_gain': 0.037310344962162616,
        }

    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_valid = lgb.Dataset(test_x, test_y)
    # Up to 3000 boosting rounds; stop once the validation metric has not
    # improved for 50 rounds, logging the metrics every 50 rounds.
    booster = lgb.train(params,
                        lgb_train,
                        3000,
                        valid_sets=[lgb_train, lgb_valid],
                        callbacks=[lgb.early_stopping(stopping_rounds=50),
                                   lgb.log_evaluation(50)])

    if predict_data is None:
        # Backward-compatible default: the frame loaded at notebook level.
        predict_data = test_df
    # split_data only adds the calendar features to the training frame, so make
    # sure the prediction frame has them before selecting `col`.
    predict_data = _with_time_features(predict_data, col)
    y_test = booster.predict(predict_data[col])
    return y_test, booster

In [52]:
# Train the model and predict for test_df (which model() reads by default).
# NOTE: this rebinds the name `model` from the training function to the returned
# booster, so re-running this cell without first re-running the cell that defines
# model() is expected to fail, because the name no longer refers to the function.
y_test, model = model(train_x,train_y,test_x,test_y,col)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 730400, number of used features: 5
[LightGBM] [Info] Start training from score 47.000000
Training until validation scores don't improve for 50 rounds
[50]	training's mape: 0.149378	valid_1's mape: 0.150001
[100]	training's mape: 0.137493	valid_1's mape: 0.138531
[150]	training's mape: 0.134397	valid_1's mape: 0.13578
[200]	training's mape: 0.132948	valid_1's mape: 0.13467
[250]	training's mape: 0.131924	valid_1's mape: 0.134041
[300]	training's mape: 0.131156	valid_1's mape: 0.133594
[350]	training's mape: 0.130671	valid_1's mape: 0.133409
[400]	training's mape: 0.130251	valid_1's mape: 0.133271
[450]	training's mape: 0.129901	valid_1's mape: 0.133194
[500]	training's mape: 0

#### Save model

In [53]:
# Create the output directory if needed so the save does not depend on it pre-existing
model_path = '../ml/models/lgbm'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save_model(filename=model_path)

<lightgbm.basic.Booster at 0x2abbb4dd0>