In [1]:
import os
import gc
import time

import math
import datetime
from math import log, floor
from sklearn.neighbors import KDTree

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.utils import shuffle
from tqdm.notebook import tqdm as tqdm

import seaborn as sns
from matplotlib import colors
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import pywt
from statsmodels.robust import mad

import scipy
import statsmodels
from scipy import signal
import statsmodels.api as sm
from fbprophet import Prophet
from scipy.signal import butter, deconvolve
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

from typing import Union

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
import os
import gc
import warnings

import pandas as pd
from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from multiprocessing import cpu_count
from joblib import Parallel, parallel_backend
from joblib import delayed
import multiprocessing

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/32/64 and float16/32/64 columns are downcast.
        Columns of any other dtype are left untouched.
    verbose : bool, default True
        When true, print the resulting memory footprint and the reduction.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    * Range checks are inclusive, so a column whose extreme equals a dtype's
      limit (e.g. 127 for int8) still gets that dtype.
    * Float columns are chosen by value range only; float16 keeps roughly
      three significant decimal digits, so downcast floats lose precision.
    * Columns whose min/max is NaN (empty or all-NaN) fail every range check:
      integer columns keep their dtype, float columns become float64.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest integer type first; no match means the column is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def read_data(input_dir=None):
    """Load the four input CSV tables.

    Parameters
    ----------
    input_dir : str or path-like, optional
        Directory holding ``sell_prices.csv``, ``calendar.csv``,
        ``sales_train_evaluation.csv`` and ``sample_submission.csv``.
        Defaults to the module-level ``INPUT_DIR_PATH``, which is looked up
        at call time (so it only has to exist before the first call).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sell_prices_df, calendar_df, sales_train_validation_df, submission_df)``.
        Only the first two are passed through ``reduce_mem_usage``; the sales
        and submission tables are returned exactly as read.
    """
    if input_dir is None:
        input_dir = INPUT_DIR_PATH

    # os.path.join works whether or not the directory ends with a separator.
    sell_prices_df = pd.read_csv(os.path.join(input_dir, 'sell_prices.csv'))
    sell_prices_df = reduce_mem_usage(sell_prices_df)
    print('Sell prices has {} rows and {} columns'.format(sell_prices_df.shape[0], sell_prices_df.shape[1]))

    calendar_df = pd.read_csv(os.path.join(input_dir, 'calendar.csv'))
    calendar_df = reduce_mem_usage(calendar_df)
    print('Calendar has {} rows and {} columns'.format(calendar_df.shape[0], calendar_df.shape[1]))

    # Despite the variable name and the log message, this reads the *evaluation* file.
    sales_train_validation_df = pd.read_csv(os.path.join(input_dir, 'sales_train_evaluation.csv'))
    print('Sales train validation has {} rows and {} columns'.format(sales_train_validation_df.shape[0], sales_train_validation_df.shape[1]))

    submission_df = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))
    return sell_prices_df, calendar_df, sales_train_validation_df, submission_df

In [3]:
# Directory containing the input CSV files; read_data() reads this global.
INPUT_DIR_PATH = '../input/m5-forecasting-accuracy/'
# Load sell prices, calendar, daily sales and the sample submission (in that order).
prices, calendar, sales_train, submission_df = read_data()

Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Sales train validation has 30490 rows and 1947 columns


In [4]:
# Show the distinct store identifiers present in the price table.
prices['store_id'].unique()

array(['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1',
       'WI_2', 'WI_3'], dtype=object)

In [5]:
# Bind the sales table to the name `dataset` as well.
dataset = sales_train

# Only the name `sales_train` is removed here; the frame itself stays alive
# through `dataset`.
del sales_train
gc.collect()

55

## check data

In [6]:
from math import ceil

# Calendar parts derived from the parsed `date` column. As with item
# assignment, assign() keeps the position of a column that already exists
# and appends any new one at the end.
calendar_dates = pd.to_datetime(calendar['date'])
day_of_month = calendar_dates.dt.day
calendar = calendar.assign(
    day=day_of_month,
    month=calendar_dates.dt.month,
    year=calendar_dates.dt.year,
    # NOTE(review): Series.dt.week is deprecated in newer pandas releases;
    # kept unchanged so results match the pandas version this notebook ran on.
    week=calendar_dates.dt.week,
    # 1-based index of the 7-day block of the month the day falls into.
    tm_wm=day_of_month.apply(lambda dom: ceil(dom / 7)),
)

# Columns that are not used further on.
calendar = calendar.drop(columns=['date', 'weekday', 'd', 'snap_TX', 'snap_WI'])
calendar.head()

Unnamed: 0,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,day,week,tm_wm
0,11101,1,1,2011,,,,,0,29,4,5
1,11101,2,1,2011,,,,,0,30,4,5
2,11101,3,1,2011,,,,,0,31,5,5
3,11101,4,2,2011,,,,,1,1,5,1
4,11101,5,2,2011,,,,,1,2,5,1


In [7]:
# Keep only the two stores handled in this notebook. `.copy()` makes the result
# an independent frame, so the column assignments below do not operate on a
# slice of the full price table (pandas SettingWithCopyWarning).
prices = prices[prices['store_id'].isin(['CA_1', 'CA_2'])].copy()

# Basic aggregations of the sell price per (store, item); the grouping is built
# once and reused for every statistic.
item_prices = prices.groupby(['store_id', 'item_id'])['sell_price']
prices['price_max'] = item_prices.transform('max')
prices['price_min'] = item_prices.transform('min')
prices['price_std'] = item_prices.transform('std')
prices['price_mean'] = item_prices.transform('mean')

# Price normalization: price relative to the item's maximum price in that store
# (a division by the max, not a full min/max scaling).
prices['price_norm'] = prices['sell_price'] / prices['price_max']

# Some items can be inflation dependent
# and some items are very "stable"
prices['price_nunique'] = item_prices.transform('nunique')
prices['item_nunique'] = prices.groupby(['store_id', 'sell_price'])['item_id'].transform('nunique')

# I would like some "rolling" aggregations
# but would like months and years as "window":
# attach one month/year per price week (the first calendar row of each week).
calendar_prices = calendar[['wm_yr_wk', 'month', 'year']].drop_duplicates(subset=['wm_yr_wk'])
prices = prices.merge(calendar_prices, on=['wm_yr_wk'], how='left')
del calendar_prices

# Now we can add price "momentum" (some sort of):
# current price relative to the previous row of the same item,
# to the item's mean over the same month number,
# and to the item's mean over the same year.
# The merge above returned a new frame, so the grouping has to be rebuilt.
item_prices = prices.groupby(['store_id', 'item_id'])['sell_price']
prices['price_momentum'] = prices['sell_price'] / item_prices.transform(lambda x: x.shift(1))
prices['price_momentum_m'] = prices['sell_price'] / prices.groupby(['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
prices['price_momentum_y'] = prices['sell_price'] / prices.groupby(['store_id', 'item_id', 'year'])['sell_price'].transform('mean')
del item_prices

prices = prices.drop(columns=['month', 'year'])

# Build the series id in the same "<item>_<store>_evaluation" form as the sales
# table; it is appended last, so `id` ends up as the final column.
prices["id"] = prices['item_id'] + '_' + prices['store_id'] + '_evaluation'
prices = prices.drop(columns=['store_id', 'item_id'])
prices.tail()

Unnamed: 0,wm_yr_wk,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y,id
1335802,11617,1.0,1.0,1.0,0.0,1.0,1.0,1.0,125,1.0,1.0,1.0,FOODS_3_827_CA_2_evaluation
1335803,11618,1.0,1.0,1.0,0.0,1.0,1.0,1.0,125,1.0,1.0,1.0,FOODS_3_827_CA_2_evaluation
1335804,11619,1.0,1.0,1.0,0.0,1.0,1.0,1.0,125,1.0,1.0,1.0,FOODS_3_827_CA_2_evaluation
1335805,11620,1.0,1.0,1.0,0.0,1.0,1.0,1.0,125,1.0,1.0,1.0,FOODS_3_827_CA_2_evaluation
1335806,11621,1.0,1.0,1.0,0.0,1.0,1.0,1.0,125,1.0,1.0,1.0,FOODS_3_827_CA_2_evaluation


In [8]:
# Append the forecast placeholder columns of the sample submission (all columns
# but the first, first 30490 rows) to the right of the sales table, then keep
# only the CA_1 and CA_2 rows.
forecast_placeholders = submission_df.iloc[:30490, 1:]
train_dataset = pd.concat([dataset, forecast_placeholders], axis=1)
train_dataset = train_dataset[train_dataset['store_id'].isin(['CA_1', 'CA_2'])]
# train_dataset.drop(['store_id'], inplace = True, axis = 1)
train_dataset.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
6093,FOODS_3_823_CA_2_evaluation,FOODS_3_823,FOODS_3,FOODS,CA_2,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6094,FOODS_3_824_CA_2_evaluation,FOODS_3_824,FOODS_3,FOODS,CA_2,CA,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6095,FOODS_3_825_CA_2_evaluation,FOODS_3_825,FOODS_3,FOODS,CA_2,CA,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6096,FOODS_3_826_CA_2_evaluation,FOODS_3_826,FOODS_3,FOODS,CA_2,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6097,FOODS_3_827_CA_2_evaluation,FOODS_3_827,FOODS_3,FOODS,CA_2,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Transform: label-encode calendar, train and prices

In [9]:
from sklearn import preprocessing

def transform_calendar(data):
    """Fill and label-encode the four event columns of the calendar frame.

    Missing values in ``event_name_1``, ``event_type_1``, ``event_name_2`` and
    ``event_type_2`` are replaced by the string ``'unknown'``; each column is
    then replaced by the integer codes of its own, freshly fitted
    ``LabelEncoder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the four event columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    event_columns = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in event_columns:
        # Assign the filled column back explicitly: an in-place fillna on
        # `data[feature]` acts on a temporary object and is not guaranteed to
        # reach `data` (it does not under pandas copy-on-write).
        data[feature] = data[feature].fillna('unknown')
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
    return data

def transform_train(data):
    """Label-encode the six identifier columns of the sales frame in place.

    Each of ``id``, ``item_id``, ``dept_id``, ``cat_id``, ``store_id`` and
    ``state_id`` is replaced by the integer codes of its own, freshly fitted
    ``LabelEncoder``. The mutated frame is returned.
    """
    for column in ('id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'):
        data[column] = preprocessing.LabelEncoder().fit_transform(data[column])
    return data

def transform_prices(data):
    """Label-encode the ``id`` column of the price frame in place.

    ``id`` is replaced by the integer codes of a freshly fitted
    ``LabelEncoder``; the mutated frame is returned.
    """
    id_encoder = preprocessing.LabelEncoder()
    data['id'] = id_encoder.fit_transform(data['id'])
    return data

## train: label-encode the identifier columns of the sales table.
# NOTE(review): `id` is encoded here and, independently, in transform_prices below.
# The later lookup of price rows by encoded id presumably relies on both tables
# containing exactly the same set of ids — confirm.
train_dataset = transform_train(train_dataset)
train_dataset.head()

## price: label-encode the `id` column of the price table.
prices= transform_prices(prices)
prices.head()

## calendar: fill + encode the event columns, downcast, then fix the column order.
calendar = transform_calendar(calendar)
calendar = reduce_mem_usage(calendar)
# `wm_yr_wk` is kept first; it is the key used to merge prices onto the calendar later.
calendar = calendar[['wm_yr_wk','tm_wm','week','wday', 'year','month','day',
                     'event_name_1','event_type_1','event_name_2','event_type_2',
                     'snap_CA']]
calendar.tail()

Mem. usage decreased to  0.03 Mb (81.5% reduction)


Unnamed: 0,wm_yr_wk,tm_wm,week,wday,year,month,day,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA
1964,11620,3,24,5,2016,6,15,30,4,4,2,0
1965,11620,3,24,6,2016,6,16,30,4,4,2,0
1966,11620,3,24,7,2016,6,17,30,4,4,2,0
1967,11621,3,24,1,2016,6,18,30,4,4,2,0
1968,11621,3,24,2,2016,6,19,16,3,2,0,0


## Identifiers + prices + calendar features for each item

In [10]:
# pd.merge(calendar, prices,  on='wm_yr_wk', how='outer')

# For every series build one 2-D array: its encoded identifiers (without `id`)
# repeated for every calendar day, followed by the calendar and price columns in
# reversed column order. All rows except the last 27 go to list_cal_x, the last
# 27 rows go to list_cal_z.

prices_array = prices.values
# Column names and the position of `id` are taken from the frame itself so the
# lookup below cannot drift from the actual column layout.
price_columns = list(prices.columns)
price_id_pos = price_columns.index('id')
identifiers = train_dataset.iloc[:,:6].values
IDENTIFIER = train_dataset.iloc[:,0].values

n_out = 1

start = 0

# Loop-invariant: number of calendar rows every merged frame must have.
n_calendar_days = calendar.shape[0]

list_cal_x = []
list_cal_z = []

for ident in tqdm(identifiers):

    ID = ident[0]

    # Price rows of this series (matched on the encoded id), without the id column.
    prices_id = pd.DataFrame(prices_array[prices_array[:, price_id_pos] == ID],
                             columns=price_columns).drop('id', axis=1)

    calendar_price = pd.merge(calendar, prices_id, on='wm_yr_wk', how='outer').drop('wm_yr_wk', axis=1)
    if calendar_price.shape[0] != n_calendar_days:
        # Same condition that would make the hstack below fail on a row mismatch;
        # raised here with an explicit message instead.
        raise ValueError(
            'merging prices for id {} gave {} rows, expected {} calendar rows'.format(
                ID, calendar_price.shape[0], n_calendar_days))
    calendar_price = calendar_price[calendar_price.columns[::-1]].values.astype(np.float32)

    # Identifier columns (everything but `id`) repeated once per calendar day.
    ident_array = np.tile(ident[1:], (n_calendar_days, 1))
    calendar_price = np.hstack((ident_array, calendar_price))

    list_cal_x.append(calendar_price[start:-27, :])  # [start+1:-54,:]
    list_cal_z.append(calendar_price[-27:, :])

    del ident_array, calendar_price, prices_id
#     gc.collect()


del calendar, prices
gc.collect()

HBox(children=(FloatProgress(value=0.0, max=6098.0), HTML(value='')))




3

## Functions for Statistics Lags - Mean, Std

In [11]:
from scipy.ndimage.interpolation import shift
from scipy.stats import gmean
from scipy.stats import skew
from scipy.stats import kurtosis


# def embedded_recursive(data, calendar_x, n_out=1, n_in_0=1,
#                        n_in_1=1,
#                        n_in_2=1,
#                        n_in_3=1,
#                        n_in_4=1,dropnan=True):
    
# #     n_vars = 1 if type(data) is list else data.shape[1]
#     cols_pred, cols_0, cols_1, cols_2, cols_3, cols_4= list(),list(),list(),list(),list(),list()
    
#     for i in range(0, n_out):
#         shif_out = shift(data, -i, mode='constant', cval=-2000).astype(np.float16)
#         shif_out[shif_out ==-2000] = np.nan
#         cols_pred.append(shif_out)    
#     for i in range(28 + n_in_0, 28, -1):
#         shif_0 = shift(data, i, mode='constant', cval=-2000).astype(np.float16)
#         shif_0 [shif_0 ==-2000] = np.nan
#         cols_0.append(shif_0)        
#     for i in range(28 + n_in_1, 28,-1):
#         shif_1 = shift(data, i, mode='constant', cval=-2000).astype(np.float16)
#         shif_1 [shif_1 ==-2000] = np.nan
#         cols_1.append(shif_1)        
#     for i in range(28 + n_in_2, 28, -1):
#         shif_2 = shift(data, i, mode='constant', cval=-2000).astype(np.float16)
#         shif_2 [shif_2 ==-2000] = np.nan
#         cols_2.append(shif_2)        
#     for i in range(28 + n_in_3, 28, -1):
#         shif_3 = shift(data, i, mode='constant', cval=-2000).astype(np.float16)
#         shif_3 [shif_3 ==-2000] = np.nan
#         cols_3.append(shif_3)    
#     for i in range(28 + n_in_4, 28, -1):
#         shif_4 = shift(data, i, mode='constant', cval=-2000).astype(np.float32)
#         shif_4 [shif_4 ==-2000] = np.nan
#         cols_4.append(shif_4)        
     
   
     # put it all together
#     agg_out_pred = np.transpose(np.vstack(cols_pred))
#     agg_out_0 = np.transpose(np.vstack(cols_0))    
#     agg_out_1 = np.transpose(np.vstack(cols_1))    
#     agg_out_2 = np.transpose(np.vstack(cols_2))    
#     agg_out_3 = np.transpose(np.vstack(cols_3))    
#     agg_out_4 = np.transpose(np.vstack(cols_4))
    
#     med_0 = np.mean(agg_out_0,axis=1).reshape(-1,1)
#     std_0 = np.std(agg_out_0,axis=1).reshape(-1,1)
    
#     med_1 = np.mean(agg_out_1,axis=1).reshape(-1,1)
#     std_1 = np.std(agg_out_1,axis=1).reshape(-1,1)
    
#     med_2 = np.mean(agg_out_2,axis=1).reshape(-1,1)
#     std_2 = np.std(agg_out_2,axis=1).reshape(-1,1)
    
#     med_3 = np.mean(agg_out_3,axis=1).reshape(-1,1)
#     std_3 = np.std(agg_out_3,axis=1).reshape(-1,1)
    
#     med_4 = np.mean(agg_out_4,axis=1).reshape(-1,1)
#     std_4 = np.std(agg_out_4,axis=1).reshape(-1,1)


#     XY = np.hstack((std_4, std_3, std_2, std_1,std_0,
#                     med_4, med_3, med_2, med_1,med_0, 
#                     agg_out_pred))
    
#     del agg_out_1, agg_out_2,agg_out_3, agg_out_4,
#     std_4, std_3, std_2, std_1,std_0, med_4, med_3, med_2, med_1, med_0, agg_out_pred

#     # calendar(& prices) + statistics
#     XY_cal = np.hstack((calendar_x, XY))
    
#     XY_train = XY_cal[:-55,:]
    
#     XY_tr_val = XY_train[:-1,:]
#     X_test = XY_train[-1,:-1]
    
#     del XY, XY_cal, XY_train, calendar_x

#     return XY_tr_val, X_test

In [12]:
def _lag_matrix(series, first_shift, width, dtype):
    """Stack ``width`` lagged copies of a 1-D series as columns.

    Columns run from shift ``first_shift + width`` (leftmost) down to
    ``first_shift + 1`` (rightmost). Row ``k`` of the column for shift ``s``
    holds ``series[k - s]``; rows without a source value are NaN.
    """
    n_obs = series.shape[0]
    columns = []
    for lag in range(first_shift + width, first_shift, -1):
        column = np.full(n_obs, np.nan, dtype=dtype)
        if lag < n_obs:
            column[lag:] = series[:n_obs - lag]
        columns.append(column)
    # Like np.vstack in the previous implementation, this raises ValueError
    # when `width` is 0 and there is nothing to stack.
    return np.column_stack(columns)


def calendar_statistics_merge(data, calendar_x,
                       n_out=1,
                       n_lags = 1,
                       n_in_0=1,
                       n_in_1=1,
                       n_in_2=1,
                       n_in_3=1,
                       n_in_4=1,dropnan=True):
    """Build lag / rolling-statistic features for one series and join them to
    its calendar block.

    Lags are built by exact index shifting with NaN padding, so no sentinel
    value is needed and every value of ``data`` (including negative ones) is
    preserved.

    Parameters
    ----------
    data : 1-D array-like
        The series, one value per row of ``calendar_x``.
    calendar_x : 2-D array
        Per-row features placed to the left of the generated columns; must
        have as many rows as ``data`` has values.
    n_out : int, default 1
        Number of target columns: the series itself plus ``n_out - 1`` leads
        (lead ``i`` holds ``data[k + i]``, NaN past the end).
    n_lags : int, default 1
        Number of raw lag columns, covering shifts ``27 + n_lags`` .. ``28``.
        They are built as float16, as before.
    n_in_0, n_in_1, n_in_2, n_in_3, n_in_4 : int, default 1
        Window widths. All five are used for the mean/std of windows starting
        at shift 28; the first four are also used for the means of windows
        starting at shifts 1, 7 and 14.
    dropnan : bool, default True
        Unused; kept so existing calls keep working.

    Returns
    -------
    XY_tr_val : 2-D array
        Rows ``27 + n_in_4`` up to (excluding) the last row. Column order:
        ``calendar_x`` | window means for first shift 14, 7, 1 (within each:
        widths n_in_3, n_in_2, n_in_1, n_in_0) | (std, mean) pairs of the
        shift-28 windows for widths n_in_4 .. n_in_0 | raw lags (oldest
        first) | target column(s).
    X_test : 1-D array
        The last row without the target column(s).

    Notes
    -----
    Means and standard deviations propagate NaN: a row is NaN for a window as
    soon as one value of that window lies before the start of the series.
    With ``n_out > 1`` the trailing rows of ``XY_tr_val`` contain NaN targets.
    """
    DAY = 27  # every "lagged" feature group below starts at shift DAY + 1 = 28

    series = np.asarray(data)
    n_obs = series.shape[0]

    # Target column(s): the series and its leads.
    target_columns = []
    for lead in range(n_out):
        column = np.full(n_obs, np.nan, dtype=np.float32)
        if lead < n_obs:
            column[:n_obs - lead] = series[lead:]
        target_columns.append(column)
    targets = np.column_stack(target_columns)

    # Raw lags 27 + n_lags .. 28.
    raw_lags = _lag_matrix(series, DAY, n_lags, np.float16)

    # (std, mean) of the windows that start at shift 28, one pair per width.
    lagged_stats = []
    for width in (n_in_0, n_in_1, n_in_2, n_in_3, n_in_4):
        window = _lag_matrix(series, DAY, width, np.float32)
        lagged_stats.append((np.std(window, axis=1).reshape(-1, 1),
                             np.mean(window, axis=1).reshape(-1, 1)))

    # Means of the windows that start at shifts 1, 7 and 14.
    recent_means = []
    for first_shift in (0, 6, 13):
        for width in (n_in_0, n_in_1, n_in_2, n_in_3):
            window = _lag_matrix(series, first_shift, width, np.float32)
            recent_means.append(np.mean(window, axis=1).reshape(-1, 1))

    # The original code prepended every new feature, so the groups appear in
    # reverse order of computation.
    blocks = [calendar_x]
    blocks.extend(reversed(recent_means))
    for std_col, mean_col in reversed(lagged_stats):
        blocks.extend((std_col, mean_col))
    blocks.extend((raw_lags, targets))
    XY_train = np.hstack(blocks)

    # Skip the leading rows that cannot have a complete longest window, and keep
    # the last row aside as the prediction row.
    XY_tr_val = XY_train[DAY + n_in_4:-1, :]  # 27+n_in_4

    X_test = XY_train[-1, :-n_out]

    return XY_tr_val, X_test

## Construct X, Y, Z with identifiers, calendar and prices

In [13]:
# Arguments handed to calendar_statistics_merge: number of target columns,
# number of raw lags, and the five window widths.
n_out, n_lags = 1, 15
n_0, n_1, n_2, n_3, n_4 = 7, 14, 30, 60, 180

X_frame, Z_frame = [], []

# `t` tracks the position of the current series so its calendar/price block
# can be fetched from list_cal_x.
t = 0

for row in tqdm(train_dataset.iloc[:,(start+6):-27].values):

    X, Z = calendar_statistics_merge(row, list_cal_x[t],  # [1:,:]
                                     n_out, n_lags, n_0, n_1, n_2, n_3, n_4)

    # Both outputs are stored as float32.
    X_frame.append(X.astype(np.float32))
    Z_frame.append(Z.astype(np.float32))

    del X, Z, row
    t += 1

# Column slice of width n_4 that ends 28 columns before the end of the table.
window_dataset = train_dataset.iloc[:, -27 - n_4 - 28:-28].values

del list_cal_x, train_dataset
gc.collect()



HBox(children=(FloatProgress(value=0.0, max=6098.0), HTML(value='')))




23

In [14]:
# Stack the per-series training matrices row-wise into one array.
X_final = np.vstack(X_frame)
# Each entry of Z_frame is a single feature row, so this gives one row per series.
Z_final = np.vstack(Z_frame)

# The per-series lists are no longer needed once stacked.
del X_frame, Z_frame
gc.collect()

20

In [15]:
# pd.DataFrame(X)

In [16]:
# cols_pred = list()  
# for i in range(0, n_out):
#     shif_out = shift(row, -i, mode='constant', cval=-2000).astype(np.float32)
#     shif_out[shif_out ==-2000] = np.nan
#     cols_pred.append(shif_out)
# agg_out_pred = np.transpose(np.vstack(cols_pred))
# XY = np.hstack(agg_out_pred).reshape(-1,1)


# DAY = 1

# LAGGING = 7
# cols = list() 
# for i in range(LAGGING+DAY, DAY, -1):
#     shif = shift(row, i, mode='constant', cval=-2000).astype(np.float16)
#     shif[shif ==-2000] = np.nan
#     cols.append(shif)
# LAGGING_DAY = np.transpose(np.vstack(cols)) 
# XY = np.hstack((LAGGING_DAY, XY))
# pd.DataFrame(XY[-18:,:])

In [17]:
# row

In [18]:
# train_dataset.iloc[:,6:-56]

In [19]:
# pd.DataFrame(X_frame[0].astype(np.float32)).iloc[:,25:].tail()

In [20]:
# pd.DataFrame(X_frame[0].astype(np.float16)).iloc[:,25:].tail()

In [21]:
# pd.DataFrame(X_frame[0]).info()
# X_frame[0].shape
#18 -day
#19 -month
#20 - year

In [22]:
import pickle

# Save the four artefacts as pickle files in the current working directory.
# Training matrix (features + target column).
with open('1_X_CA.pkl','wb') as x:
        pickle.dump(X_final, x)

# One prediction feature row per series.
with open('1_Z_CA.pkl','wb') as z:
        pickle.dump(Z_final, z)

# Slice of the sales table taken after the feature-building loop.
with open('1_window_CA.pkl','wb') as w:
        pickle.dump(window_dataset, w)

# Last 27 calendar/price rows of every series, stacked into one array.
with open('1_list_cal_CA.pkl','wb') as cal:
        pickle.dump(np.array(list_cal_z), cal)

In [23]:
# import pickle

# with open("../input/1-stats-id/2_X_CA.pkl", "rb") as Z_WI:
#     X_final = Z_WI.read()
    
# with open("../input/1-stats-id/2_Z_CA.pkl", "rb") as window_WI:
#     Z_final = window_WI.read()
    
# with open("../input/1-stats-id/2_list_cal_CA.pkl", "rb") as calendar_WI:
#     calendar_3 = calendar_WI.read()

## reduce memory and save in pkl or npy, npz

In [24]:
# X = X_final #reduce_mem_usage(pd.DataFrame(X_final)).values
# Z = Z_final #reduce_mem_usage(pd.DataFrame(Z_final)).values

# del X_final,Z_final
# gc.collect()

In [25]:
# X = pickle.loads(X)
# Z = pickle.loads(Z)

## Divide in training and validation sets

In [26]:
# mask_XYtrain = np.logical_or(X[:,11]<=27,X[:,12]<=3,X[:,13]<=2016)

# # <= '2016-03-27' ~
# XY_train = X[mask_XYtrain]
# XY_val = X[~mask_XYtrain]
# X_train = XY_train[:,:-1]
# y_train = XY_train[:,-1]
# X_val = XY_val[:,:-1]
# y_val = XY_val[:,-1]

In [27]:
# # # define random hyperparammeters
# params = {'boosting_type': 'gbdt',
# #           'objective': 'tweedie',
# #           'tweedie_variance_power': 1.1,
#           'objective': 'regression',
#           'metric': 'rmse',
#            'subsample': 0.5,
#           'seed': 42,
#           'subsample_freq': 1,
#           'learning_rate': 0.03,
#           'num_leaves': 2**11-1,
#           'min_data_in_leaf': 2**12-1,
#           'feature_fraction': 0.6,
#           'max_bin': 100,
#         'n_estimators': 1400,
#         'boost_from_average': False,
#                 'verbose': -1,
#                 } 



# train_set = lgb.Dataset(X_train, y_train.reshape(-1))
# val_set = lgb.Dataset(X_val, y_val.reshape(-1))

# # del x_train, y_train

# # seed_everything(SEED)
# model = lgb.train(params, train_set,
#                   valid_sets = [train_set, val_set],
#                 num_boost_round = 2500,
#                   early_stopping_rounds = 60,
#                   verbose_eval = 100)

# # model = lgb.cv(params, train_set, nfold=5,
# #                   num_boost_round = 2500,
# #                   early_stopping_rounds = 20,
# #                   verbose_eval = 100)

# model.save_model('2_model_CA.txt')


## Feature Importance

In [28]:
# fig, ax = plt.subplots(figsize=(10, 7))
# lgb.plot_importance(model, max_num_features=10, ax=ax)
# plt.title("LightGBM - Feature Importance");

## Function to update statistics

In [29]:
# def update_statistics(data, n_in_0=1, n_in_1=1, n_in_2=1, n_in_3=1,n_in_4=1, n_in_5=1, n_in_6=1,dropnan=True):
    
# #     n_vars = 1 if type(data) is list else data.shape[1]
#     cols_0, cols_1, cols_2, cols_3, cols_4 , cols_5, cols_6 = list(),list(),list(), list(), list(),list(), list()
    
#     for i in range(n_in_0, 0,-1):
#         shif_0 = shift(data, i, mode='constant', cval=-2000).astype(float)
#         shif_0 [shif_0 ==-2000] = np.nan
#         cols_0.append(shif_0)
        
#     for i in range(n_in_1,0,-1):
#         shif_1 = shift(data, i, mode='constant', cval=-2000).astype(float)
#         shif_1 [shif_1 ==-2000] = np.nan
#         cols_1.append(shif_1)
        
#     for i in range(n_in_2, 0, -1):
#         shif_2 = shift(data, i, mode='constant', cval=-2000).astype(float)
#         shif_2 [shif_2 ==-2000] = np.nan
#         cols_2.append(shif_2)
        
#     for i in range(n_in_3, 0, -1):
#         shif_3 = shift(data, i, mode='constant', cval=-2000).astype(float)
#         shif_3 [shif_3 ==-2000] = np.nan
#         cols_3.append(shif_3)
    
#     for i in range(n_in_4, 0, -1):
#         shif_4 = shift(data, i, mode='constant', cval=-2000).astype(float)
#         shif_4 [shif_4 ==-2000] = np.nan
#         cols_4.append(shif_4)
        
#     for i in range(n_in_5, 0, -1):
#         shif_5 = shift(data, i, mode='constant', cval=-2000).astype(float)
#         shif_5 [shif_5 ==-2000] = np.nan
#         cols_5.append(shif_5)
    
#     for i in range(n_in_6, 0, -1):
#         shif_6 = shift(data, i, mode='constant', cval=-2000).astype(float)
#         shif_6 [shif_6 ==-2000] = np.nan
#         cols_6.append(shif_6)
        
        
#     agg_out_0 = np.transpose(np.vstack(cols_0))
#     agg_out_1 = np.transpose(np.vstack(cols_1))
#     agg_out_2 = np.transpose(np.vstack(cols_2))
#     agg_out_3 = np.transpose(np.vstack(cols_3))
#     agg_out_4 = np.transpose(np.vstack(cols_4))
#     agg_out_5 = np.transpose(np.vstack(cols_5))
#     agg_out_6 = np.transpose(np.vstack(cols_6))
    
#     med_0 = np.median(agg_out_0,axis=1).reshape(-1,1)
#     med_1 = np.median(agg_out_1,axis=1).reshape(-1,1)
#     med_2 = np.median(agg_out_2,axis=1).reshape(-1,1)
#     med_3 = np.median(agg_out_3,axis=1).reshape(-1,1)
#     med_4 = np.median(agg_out_4,axis=1).reshape(-1,1)
#     med_5 = np.median(agg_out_5,axis=1).reshape(-1,1)
#     med_6 = np.median(agg_out_6,axis=1).reshape(-1,1)
        
# #     gmean_0 = gmean(agg_out_0,axis=1).reshape(-1,1)
# #     gmean_1 = gmean(agg_out_1,axis=1).reshape(-1,1)
# #     gmean_2 = gmean(agg_out_2,axis=1).reshape(-1,1)
# #     gmean_3 = gmean(agg_out_3,axis=1).reshape(-1,1)
# #     gmean_4 = gmean(agg_out_4,axis=1).reshape(-1,1)
# #     gmean_5 = gmean(agg_out_5,axis=1).reshape(-1,1)
# #     gmean_6 = gmean(agg_out_6,axis=1).reshape(-1,1)
    
# #     skew_6 = skew(agg_out_6,axis=1).reshape(-1,1)
# #     kurt_6 = kurtosis(agg_out_6,axis=1).reshape(-1,1)

#     statistics = np.hstack((#kurt_6, skew_6, gmean_6, gmean_5, gmean_4, gmean_3, gmean_2, gmean_1, gmean_0,
#                     med_6, med_5, med_4, med_3, med_2, med_1, med_0))


#     updated_stat = statistics[-1,:]#.reshape(1,-1)
    
#     return updated_stat