### Training method: Neural Network, XGBoost, LGBoosting, Liniar Regression
### The most important features are: 

### How long it takes to train your model

In [1]:
pip freeze


The following command must be run outside of the IPython shell:

    $ pip freeze

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
from workalendar.europe import Russia
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import gc
from itertools import product
from datetime import date
import time

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression, BayesianRidge, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from xgboost import XGBRegressor
from xgboost import plot_importance

from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras import optimizers
from keras.models import load_model

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

def mean_enc(all_data, group_cols, target_col, name, fill_c):
    gb = all_data[group_cols + [target_col]].groupby(group_cols,as_index=False).agg({target_col:{name + '_mean':'mean'}})
    #gb.reset_index(inplace=True)
    gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
    all_data = pd.merge(all_data, gb, how='left', on=group_cols).fillna(fill_c)
    return all_data
  
def mean_KFenc(data, enc_col, target_col, fill_c, n_splits):
    data= data.copy()
    y= data[target_col]; X= data[enc_col]
    new_col= enc_col + '_mean_skf'
    data[new_col]= fill_c
    skf = StratifiedKFold(n_splits=n_splits, shuffle= False, random_state=0)
    for train_index, test_index in skf.split(X, y):
        enc= data.loc[train_index].groupby(enc_col)[target_col].mean()
        #enc.reset_index(inplace=True)
        data[new_col].loc[test_index]= data[enc_col].loc[test_index].map(enc)    
    data[new_col].fillna(fill_c, inplace=True)
    return data

def feature_lags(all_data, time_col, inx_cols, name, lags, fill_c):  
    for lag in lags:
        d= all_data[inx_cols + [name]].copy().drop_duplicates(subset=inx_cols)
        d[time_col] += lag
        d= d.rename(columns= {name: name + '_previous_' + str(lag)})
        all_data = pd.merge(all_data, d, how='left', on=inx_cols).fillna(fill_c)
    del d
    return all_data

def first_last(all_data, p_cols, reference, name, fl):
    gb= all_data[p_cols + [reference]].groupby(p_cols, as_index=False)
    #gb.reset_index(inplace=True)
    if fl== 'first': 
        a=gb.first()
    if fl== 'last':
        a=gb.last()
    a= a.rename(columns={reference:fl + '_' + name})
    all_data = pd.merge(all_data, a, how='left', on=p_cols)
    if fl== 'first': 
        all_data['delta_' + fl + '_' + name]= (all_data[reference] - all_data[fl + '_' + name]).fillna(0)
    if fl== 'last': 
        all_data['delta_' + fl + '_' + name]= (all_data[reference] - all_data[fl + '_' + name]).fillna(33)
    return all_data

fill_c= 0

In [3]:
#read_data
path= 'Data/'
t          = pd.read_csv(path + 'sales_train.csv.gz')
test            = pd.read_csv(path + 'test.csv')
items           = pd.read_csv(path + 'items.csv')
item_cats = pd.read_csv(path + 'item_categories.csv')
shops           = pd.read_csv(path + 'shops.csv')

t['date']= pd.to_datetime(t['date'], format= '%d.%m.%Y')
t= t.sort_values('date')

# EDA

## Sales

In [5]:
# Checking whether there ares new items in test data.
col= 'item_id'
new_items=len(list(set(test[col]) - set(test[col]).intersection(set(t[col]))))
new_items

363

In [6]:
#plt.scatter(t['item_id'], t['item_price'], s=0.5); plt.show()

In [7]:
#plt.scatter(t['item_id'], t['item_cnt_day'], s=0.5); plt.show()

## Shops
#### There seems to be a duplications of shops 0 with 57, 1 with 58 & 10 with 11.

In [8]:
# Checking wether there are new shops in test data.
col= 'shop_id'
new_shops=list(set(test[col]) - set(test[col]).intersection(set(shops[col])))
new_shops

[]

In [9]:
gb = shops.groupby('shop_name')
 
gb.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6
7,"Воронеж ТРЦ ""Максимир""",7
8,"Воронеж ТРЦ Сити-Парк ""Град""",8
9,Выездная Торговля,9


## items

In [10]:
#gb = items.groupby('item_name')
#gb.head()

## item_category

In [11]:
#item_cats.head(10)

# Clean

In [4]:
# Cleaning and ordering based on EDA

# removing outliers
t = t[t['item_price']<50000]
t = t[t['item_cnt_day']<500]

# 

t.loc[t['shop_id'] == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57

t.loc[t['shop_id'] == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58

t.loc[t['shop_id'] == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

items.drop(['item_name'], axis=1, inplace=True)



item_cats['split'] = item_cats['item_category_name'].str.split('-')
item_cats['type'] = item_cats['split'].map(lambda x: x[0].strip())
item_cats['type_code'] = LabelEncoder().fit_transform(item_cats['type'])
# if subtype is nan then type
item_cats['subtype'] = item_cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_cats['subtype_code'] = LabelEncoder().fit_transform(item_cats['subtype'])
item_cats = item_cats[['item_category_id','type_code', 'subtype_code']]

In [5]:
tr_shops= list(t['shop_id'].unique())
ts_shops= list(test['shop_id'].unique())
tr_items= list(t['item_id'].unique())
ts_items= list(test['item_id'].unique())

# Check leakedges

In [13]:
#test.columns

In [14]:
#plt.scatter(t.index, t['item_cnt_day'], s=0.5); plt.show()

In [15]:
#plt.scatter(t['shop_id'], t['item_cnt_day'], s=0.5); plt.show()

In [16]:
#plt.scatter(test['shop_id'], test['ID'], s=0.5);plt.show()

In [17]:
#plt.figure(figsize=(15,15))
#plt.scatter(test['shop_id'], test['item_id'], s=0.5);plt.show()

In [18]:
#plt.figure(figsize=(15,15))
#plt.scatter(test['item_id'], test['ID'], s=0.5);plt.show()

#### There are some patterns that require further investigation in the future. 

# Create basic dataset for processing
#### For each item in each shop I create an instance for each month. This way there will be a continious data of each specific item for creating features and training the model.  
#### Test data will be concatenated to train data for easier processing.

In [6]:
# Create grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

grid = [] 
for block_num in t['date_block_num'].unique():
    cur_shops = t.loc[t['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = t.loc[t['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols, dtype=np.int32)


grid['ID']= -1

grid= grid[grid['shop_id'].isin(ts_shops)]
grid= grid[grid['item_id'].isin(ts_items)]


g= grid.shape
print('grid', g)


# Preparing test set
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

keep= len(grid)+len(test)
print('keep: ', keep)

grid (2781025, 4)
keep:  2995225


In [7]:
# Concatenating grid & test data
all_data= pd.concat([grid, test], axis=0, ignore_index=True)#, sort=False)
all_data.fillna(0, inplace=True)
del grid

# Merging Shops, Categories, items 
all_data = pd.merge(all_data, shops, on=['shop_id'], how='left')
all_data = pd.merge(all_data, items, on=['item_id'], how='left')
all_data = pd.merge(all_data, item_cats, on=['item_category_id'], how='left')
all_data['item_category_id'] = all_data['item_category_id'].astype(np.int8)
all_data['city_code'] = all_data['city_code'].astype(np.int8)
all_data['type_code'] = all_data['type_code'].astype(np.int8)
all_data['subtype_code'] = all_data['subtype_code'].astype(np.int8)

print(all_data.shape)

(2995225, 8)


### Calculate target data
#### I will clip target to 0-20 for better convergance of the model.

In [8]:
# Target of current month
group_cols= ['shop_id', 'item_id', 'date_block_num']
gb = t.groupby(group_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=group_cols).fillna(0)

# Clipping
all_data['target']= all_data['target'].clip(0, 20, axis=0).astype(np.int8)
print(all_data.shape)

(2995225, 9)


# Features
#### * Months & year.
#### * Months since firs & last sales.
#### * Mean price for each month, price lags & trends.
#### * Working days and holydays in each month.
#### * Mean encoding using skf strategy.
#### * Target lags, trends & estimations.

In [9]:
# Calculate month & year
all_data['month']= (all_data['date_block_num'] % 12 + 1).astype(np.int32)
all_data['year']= (all_data['date_block_num'] // 12 + 2013).astype(np.int32)

# months since first & last sale
reference= 'date_block_num'

p_cols=['shop_id', 'item_id']; name= 'shop_item'; fl= 'first'
all_data= first_last(all_data, p_cols, reference, name, fl)

p_cols=['shop_id', 'item_id']; name= 'shop_item'; fl= 'last'
all_data= first_last(all_data, p_cols, reference, name, fl)

p_cols=['shop_id']; name= 'shop'; fl= 'first'
all_data= first_last(all_data, p_cols, reference, name, fl)

p_cols=['shop_id']; name= 'shop'; fl= 'last'
all_data= first_last(all_data, p_cols, reference, name, fl)

p_cols=['item_id']; name= 'item'; fl= 'first'
all_data= first_last(all_data, p_cols, reference, name, fl)

p_cols=['item_id']; name= 'item'; fl= 'last'
all_data= first_last(all_data, p_cols, reference, name, fl) 

In [10]:
# Prices
time_col= 'date_block_num'
inx_cols= ['shop_id', 'item_id', 'date_block_num']
lags= [1, 2, 3, 4, 6, 9, 12]
gb= t[inx_cols + ['item_price']].groupby(inx_cols, as_index=False).mean()
all_data= pd.merge(all_data, gb, how= 'left', on=inx_cols) 
all_data= feature_lags(all_data, time_col, inx_cols, 'item_price', lags, fill_c)
for l in lags[1:]:
    all_data['item_price_trend_' + str(l)]= all_data['item_price_previous_1'] - all_data['item_price_previous_' + str(l)]
all_data.drop('item_price', axis=1, inplace=True)
print('prices')

prices


In [11]:
#Time features    
days_in_month = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
all_data['month'] -=1
all_data['days'] = all_data['month'].map(days_in_month).astype(np.int8)
all_data['month'] +=1
cal = Russia()

y= 2014
all_data['w_days']= 0

for m, last in enumerate(days_in_month):
    all_data['w_days'][all_data['month']==m+1]= cal.get_working_days_delta(date(y, m+1, 1), date(y, m+1, last))
all_data['h_days']= all_data['days'] - all_data['w_days'] - 8
gc.collect();
print(all_data.shape)


all_data['date_block_num'] += 1

print('time')
print(all_data.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(2995225, 39)
time
(2995225, 39)


In [12]:
# SKF Mean Encoding    
inx_cols= ['shop_id', 'item_id', 'date_block_num']
time_col= 'date_block_num'
target_col= 'target'

skf_cols= ['shop_id', 'item_id', 'item_category_id', 'type_code', 'subtype_code', 'city_code']
data1= all_data[all_data['ID']==-1]
data2= all_data[all_data['ID']>-1]
for skf_col in skf_cols:
    data1= mean_KFenc(data1, skf_col, target_col, fill_c, n_splits=5)
    data2= pd.merge(data2, data1[[skf_col, skf_col + '_mean_skf']].drop_duplicates(subset=skf_col, keep='last'), how='left', on=skf_col)
all_data= pd.concat([data1, data2], axis=0)

print('skf_enc')
print('all_data shape: ', all_data.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFram

skf_enc
all_data shape:  (2995225, 45)


In [None]:
# Pairs Mean Encoding
group_cols= ['shop_id', 'item_id']; name= 'target_shop_item'
all_data= mean_enc(all_data, group_cols, target_col, name, fill_c)

group_cols= ['shop_id', 'city_code']; name= 'target_shop_city'
all_data= mean_enc(all_data, group_cols, target_col, name, fill_c)

group_cols= ['shop_id', 'item_category_id']; name= 'target_shop_cat'
all_data= mean_enc(all_data, group_cols, target_col, name, fill_c)

group_cols= ['shop_id', 'type_code']; name= 'target_shop_type'
all_data= mean_enc(all_data, group_cols, target_col, name, fill_c)

group_cols= ['shop_id', 'subtype_code']; name= 'target_shop_subtype'
all_data= mean_enc(all_data, group_cols, target_col, name, fill_c)

print('mean encoding')
print('all_data shape: ', all_data.shape)

mean encoding
all_data shape:  (2995225, 50)


In [None]:
# Target lags
group_cols= ['shop_id', 'item_id', 'date_block_num']
lags= [1, 2, 3, 4, 5, 6, 9, 12]
for i in lags:       
    d= all_data[group_cols + ['target']].copy().drop_duplicates()
    d['date_block_num'] += i
    d= d.rename(columns= {'target':'target_previous_' + str(i)})
    all_data = pd.merge(all_data, d, how='left', on=group_cols).fillna(0)
del d
print('target lags')
print('all_data shape: ', all_data.shape)

# Target trend
inx_cols= ['shop_id', 'item_id', 'date_block_num']
time_col= 'date_block_num'
target_col= 'target'
trend_lags= [2, 3, 4, 5, 6, 9, 12]
for l in trend_lags:
    all_data['target_trend_' + str(l)]= all_data['target_previous_1'] - all_data['target_previous_' + str(l)]
    all_data['target_trend_' + str(l)].fillna(-1)
name= 'target_trend_2'

all_data= feature_lags(all_data, time_col, inx_cols, name, [11, 12, 13], fill_c)

print('target trends')
print('all_data shape', all_data.shape)

#Trying to estimate
all_data['tar11']= all_data['target_previous_1'] * (1 + all_data['target_trend_2_previous_11'] / all_data['target_previous_12']).replace([np.inf, -np.inf, np.nan], 1)
all_data['tar12']= all_data['target_previous_1'] * (1 + all_data['target_trend_2_previous_12'] / all_data['target_previous_12']).replace([np.inf, -np.inf, np.nan], 1)
all_data['tar13']= all_data['target_previous_1'] * (1 + all_data['target_trend_2_previous_13'] / all_data['target_previous_12']).replace([np.inf, -np.inf, np.nan], 1)
all_data['tar111']= all_data['target_previous_1'] + all_data['target_trend_2_previous_11']
all_data['tar112']= all_data['target_previous_1'] + all_data['target_trend_2_previous_12']
all_data['tar113']= all_data['target_previous_1'] + all_data['target_trend_2_previous_13']
    
print('estimate')
print('all_data shape: ', all_data.shape)
    

In [None]:
all_data = downcast_dtypes(all_data)
print('all_data shape: ', all_data.shape);
print(all_data.info())

In [None]:
all_data.to_csv('all_data_1.csv', index=False);

# Advanced features
#### * Selecting data from month 14
#### * Split between data for modeling and data for submission.
#### * Extract RF leaves.

In [2]:
name= 'all_data_1'
all_data= pd.read_csv(name + '.csv')

In [3]:
all_data.columns

Index(['ID', 'date_block_num', 'item_id', 'shop_id', 'city_code',
       'item_category_id', 'type_code', 'subtype_code', 'target', 'month',
       'year', 'first_shop_item', 'delta_first_shop_item', 'last_shop_item',
       'delta_last_shop_item', 'first_shop', 'delta_first_shop', 'last_shop',
       'delta_last_shop', 'first_item', 'delta_first_item', 'last_item',
       'delta_last_item', 'item_price_previous_1', 'item_price_previous_2',
       'item_price_previous_3', 'item_price_previous_4',
       'item_price_previous_6', 'item_price_previous_9',
       'item_price_previous_12', 'item_price_trend_2', 'item_price_trend_3',
       'item_price_trend_4', 'item_price_trend_6', 'item_price_trend_9',
       'item_price_trend_12', 'days', 'w_days', 'h_days', 'shop_id_mean_skf',
       'item_id_mean_skf', 'item_category_id_mean_skf', 'type_code_mean_skf',
       'subtype_code_mean_skf', 'city_code_mean_skf', 'target_shop_item_mean',
       'target_shop_city_mean', 'target_shop_cat_mean',


In [4]:
cut= 14
train_data= all_data[all_data['ID']==-1]

y= train_data[['target', 'date_block_num']][train_data['date_block_num'] >= cut]

X= train_data.drop(['ID', 'target'], axis=1)
X= X[X['date_block_num'] >= cut]
dbn= X['date_block_num']
X= X.drop('date_block_num', axis=1)


test_data=  all_data[all_data['ID'] > -1].drop(['target', 'date_block_num'], axis=1)
ID= test_data['ID']
test_data.drop('ID', axis=1, inplace=True)

del all_data

In [13]:
X.to_csv('X_data.csv', index=False);
y.to_csv('y_data.csv', header='y', index=False);
#test_data.to_csv('test_data.csv', index=False);
#dbn.to_csv('date_block_num.csv', header='dbn',index=False);

In [6]:
# RF features
RFmodel = RandomForestRegressor(max_depth=17, min_samples_split=1000, random_state=0,n_estimators=3)
RFmodel.fit(X, y)

a= RFmodel.apply(X)
b= RFmodel.apply(test_data)

RFX= pd.DataFrame(data=a, columns=['RF1', 'RF2', 'RF3'])
RFtest= pd.DataFrame(data=b, columns=['RF1', 'RF2', 'RF3'])
print('RF')
#group_cols= ['shop_id', 'item_id', 'month']
#test_data= pd.merge(test_data, X[group_cols + ['RF1', 'RF2', 'RF3']].drop_duplicates(), how='left', on=group_cols).fillna(0)

RF


In [None]:
RFX.to_csv('RFX.csv')
RFtest.to_csv('RFtest.csv')

In [8]:
cols= X.columns.tolist()
scaler = MinMaxScaler()
#scaler = StandardScaler()
XS= scaler.fit_transform(X)
test_dataS= scaler.transform(test_data)

scaler1 = MinMaxScaler(feature_range=(0, 1))
yS= scaler1.fit_transform(y.values.reshape(-1, 1))


XS= pd.DataFrame(data=XS, columns=cols)
test_dataS= pd.DataFrame(data=test_dataS, columns=cols)
yS= pd.DataFrame(data=yS, columns=['target'])

  return self.partial_fit(X, y)


In [14]:
XS.to_csv('XS_data.csv', index=False);
yS.to_csv('yS_data.csv', header='target', index=False);
test_dataS.to_csv('test_data.csv', index=False);

In [None]:
KNNmodel= KNeighborsRegressor(n_neighbors=5)
KNNmodel.fit(XS, y)
distX= KNNmodel.kneighbors(XS)
dist_test= KNNmodel.kneighbors(test_dataS)

knncols=[]
for i in distX.shape[1]:
    knncols.append('knn5' + str(i))
    
distX= pd.DataFrame(data=distX, columns= knncols)
dist_test= pd.DataFrame(data=dist_test, columns=knncols)
print('knn')

In [None]:
distX.to_csv('distX.csv')
dist_test.to_csv('dist_test.csv')

# Pre-Training
#### * Split between train and validation data.
#### * Modeling with Light Gradient Boosting.

In [3]:
X= pd.read_csv('X_data.csv');
y= pd.read_csv('y_data.csv');
test_data= pd.read_csv('test_data.csv');
dbn= pd.read_csv('date_block_num.csv')
RFX= pd.read_csv('RFX.csv')
RFtest= pd.read_csv('RFtest.csv')
#distX= pd.read_csv('distX.csv')
#dist_test= pd.read_csv('dist_test.csv')

In [None]:
X= pd.read_csv('X_data.csv');
y= pd.read_csv('y_data.csv');
test_data= pd.read_csv('test_data.csv');

In [None]:
Xrf= pd.concat([X, RFX], axis=1)
test_data_rf= pd.concat([test_data, RFtest], axis=1)

In [5]:
def ens_models(X_train, y_train, X_valid):
# 3 LGBoost trees

    tlevel1= []; vlevel1= []
    print('lgb')
    lgb_params1 = {'feature_fraction': 0.7,'metric': 'rmse', 'nthread':12, 'min_data_in_leaf': 2**15, 
                      'bagging_fraction': 0.7, 'learning_rate': 0.05, 'objective': 'rmse',
                      'bagging_seed': 2**9, 'num_leaves': 2**9,'bagging_freq':1,'verbose':0}

    lgb_params2 = {'feature_fraction': 0.7,'metric': 'rmse', 'nthread':12, 'min_data_in_leaf': 2**11, 
                  'bagging_fraction': 0.7, 'learning_rate': 0.07, 'objective': 'rmse',
                  'bagging_seed': 2**7, 'num_leaves': 2**9,'bagging_freq':1,'verbose':0}

    lgb_params3 = {'feature_fraction': 0.7,'metric': 'rmse', 'nthread':12, 'min_data_in_leaf': 2**7, 
                  'bagging_fraction': 0.7, 'learning_rate': 0.05, 'objective': 'rmse',
                  'bagging_seed': 2**5, 'num_leaves': 2**11,'bagging_freq':1,'verbose':0}

    for n, lgb_params in enumerate ([lgb_params1, lgb_params2, lgb_params3]):
        model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 30)
        #ax = lgb.plot_importance(model, figsize=(15, 15));plt.show()
        
        t_p= model.predict(X_train)#.clip(0,20);
        tlevel1.append(t_p)
        v_p= model.predict(X_valid)#.clip(0,20);
        vlevel1.append(v_p)
    
    print('RF')
    RF1= RandomForestRegressor(max_depth= 3, min_samples_split= 1000, min_samples_leaf= 50, random_state= 0, n_estimators= 10)
    #RF2= RandomForestRegressor(max_depth= 4, min_samples_split= 1000, min_samples_leaf= 50, random_state= 0, n_estimators= 50)
    #RF3= RandomForestRegressor(max_depth= 5, min_samples_split= 1000, min_samples_leaf= 50, random_state= 0, n_estimators= 50)
    
    for n, RFmodel in enumerate ([RF1]): #, RF2, RF3]):
        RFmodel.fit(X_train, y_train)
     
    
        t_p= model.predict(X_train)#.clip(0,20);
        tlevel1.append(t_p)
        v_p= RFmodel.predict(X_valid)#.clip(0,20);
        vlevel1.append(v_p)
    
    print('LR')    
    LRmodel= LinearRegression()
    Lmodel= Lasso()
    Rmodel= Ridge()
    BRmodel= BayesianRidge()
    for n, model in enumerate ([LRmodel, Lmodel, Rmodel, BRmodel]):
        model.fit(X_train, y_train)
    
        t_p= model.predict(X_train)#.clip(0,20);
        tlevel1.append(t_p.squeeze())
        v_p= model.predict(X_valid)#.clip(0,20);
        vlevel1.append(v_p.squeeze())
    
    return vlevel1, tlevel1

In [55]:
XS.columns

Index(['item_id', 'shop_id', 'city_code', 'item_category_id', 'type_code',
       'subtype_code', 'month', 'year', 'first_shop_item',
       'delta_first_shop_item', 'last_shop_item', 'delta_last_shop_item',
       'first_shop', 'delta_first_shop', 'last_shop', 'delta_last_shop',
       'first_item', 'delta_first_item', 'last_item', 'delta_last_item',
       'item_price_previous_1', 'item_price_previous_2',
       'item_price_previous_3', 'item_price_previous_4',
       'item_price_previous_6', 'item_price_previous_9',
       'item_price_previous_12', 'item_price_trend_2', 'item_price_trend_3',
       'item_price_trend_4', 'item_price_trend_6', 'item_price_trend_9',
       'item_price_trend_12', 'days', 'w_days', 'h_days', 'shop_id_mean_skf',
       'item_id_mean_skf', 'item_category_id_mean_skf', 'type_code_mean_skf',
       'subtype_code_mean_skf', 'city_code_mean_skf', 'target_shop_item_mean',
       'target_shop_city_mean', 'target_shop_cat_mean',
       'target_shop_type_mean', 't

In [162]:
cols= ['target_shop_item_mean', 'target_shop_city_mean', 'target_shop_cat_mean','target_shop_type_mean', 
       'target_shop_subtype_mean',]
X_modeling= XS.drop(cols, axis=1)
test_modeling= test_dataS.drop(cols, axis=1)

In [None]:
predicted_train= XS
predicted_train['date_bulk']= dbn.values.squeeze()
predicted_test= test_dataS
yy= y.copy()
yy.columns= ['target', 'date_bulk']
#yy['date_bulk']= dbn.values.squeeze()
last_bulk= predicted_train['date_bulk'].max()

for n, k in enumerate ([ 30, last_bulk-1, last_bulk]): #[30, last_bulk-1, last_bulk]
    print(k)
    XX= predicted_train.copy()
    df_test= predicted_test.copy()

    predicted_train= pd.DataFrame()
    #k= 24    
    for m in range(k, last_bulk + 1):
        print(m)
        chunk= np.arange(XX['date_bulk'].min(), m + 1)
        X_train= XX[XX['date_bulk'].isin(chunk)]
        y_train= yy[yy['date_bulk'].isin(chunk)]

        if m < last_bulk:
            X_valid= XX[XX['date_bulk']== m + 1]

            vlevel1, _= ens_models(X_train.drop('date_bulk', axis=1), y_train.drop('date_bulk', axis=1), X_valid.drop('date_bulk', axis=1))

            df= pd.DataFrame(data= np.array(vlevel1).T, columns= np.arange(1, len(vlevel1) + 1))
            df['date_bulk']= m + 1

            predicted_train= pd.concat([predicted_train, df], axis=0)
        else:
            X_valid= df_test

            vlevel1, tlevel1= ens_models(X_train.drop('date_bulk', axis=1), y_train.drop('date_bulk', axis=1), X_valid)
            vlevel1= np.array(vlevel1).T
            tlevel1= np.array(tlevel1).T
            predicted_test= pd.DataFrame(data= vlevel1, columns= np.arange(1, vlevel1.shape[1] + 1))


BRmodel1= BayesianRidge()
BRmodel1.fit(tlevel1, y_train.drop('date_bulk', axis=1))
pr1= BRmodel1.predict(tlevel1).clip(0,20)
pr2= BRmodel1.predict(vlevel1).clip(0,20)
            
print('dbn 34 before final level (mean) rmse: ', np.sqrt(mean_squared_error(y['target'][yy['date_bulk']==34], df.drop('date_bulk', axis=1).mean(axis=1).clip(0,20))));
print('dbn 34 before final level rmse: ', np.sqrt(mean_squared_error(y['target'][yy['date_bulk']==34], df[4].clip(0,20))));

print('all modeling data rmse: ', np.sqrt(mean_squared_error(y['target'].iloc[-len(pr1):], pr1)));
print('dbn 34 rmse: ', np.sqrt(mean_squared_error(y['target'][yy['date_bulk']==34], pr1[-len(y['target'][yy['date_bulk']==34]):])));

30
30
lgb
RF




LR


  linalg.lstsq(X, y)
  y = column_or_1d(y, warn=True)


31
lgb
RF




LR


  y = column_or_1d(y, warn=True)


32
lgb
RF




LR


  y = column_or_1d(y, warn=True)


33
lgb
RF




LR


  y = column_or_1d(y, warn=True)


34
lgb
RF




In [171]:
d= df.copy()
d['target']= y['target'][yy['date_bulk']==34].values
d.head(40)

Unnamed: 0,1,2,3,4,5,6,7,8,date_bulk,target
0,0.080975,0.040767,0.078555,0.101167,-0.044547,0.368733,-0.046647,-0.04469,34,0
1,0.083267,0.046821,0.079323,0.101167,-0.019245,0.368733,-0.021498,-0.019399,34,0
2,1.641245,2.802875,2.573291,2.927199,3.006155,0.368733,3.009848,3.006432,34,3
3,1.641245,6.679131,13.174539,13.652454,17.372078,0.368733,17.430754,17.376337,34,20
4,1.641245,6.679131,13.174539,13.652454,17.522201,0.368733,17.579438,17.526357,34,20
5,0.080708,0.041061,0.080151,0.101167,-0.047377,0.368733,-0.049513,-0.047522,34,0
6,0.082556,0.043852,0.084194,0.101167,-0.034273,0.368733,-0.036279,-0.034409,34,0
7,1.641245,1.961071,2.315777,1.847836,2.177071,0.368733,2.168458,2.17646,34,1
8,0.107501,0.059441,0.082449,0.101167,-0.015625,0.368733,-0.017457,-0.015749,34,0
9,1.641245,6.679131,13.174539,13.652454,17.663673,0.368733,17.698487,17.666207,34,20


In [172]:
predicted_test.head(20)

Unnamed: 0,1,2,3,4,5,6,7,8
0,0.817708,0.682206,0.651135,0.710525,0.97011,0.326568,0.840195,0.840341
1,0.817708,0.494642,0.498563,0.448047,0.524902,0.326568,0.387973,0.388181
2,0.817708,1.306023,1.35039,1.507322,1.474712,0.326568,1.35396,1.353683
3,0.817708,0.326817,0.362932,0.274682,0.530305,0.326568,0.392829,0.393039
4,0.817708,4.253409,3.795412,5.688313,5.103971,0.326568,5.008914,5.006646
5,0.817708,0.740783,0.517316,0.774383,0.9478,0.326568,0.822132,0.822397
6,0.817708,0.939559,0.870884,0.942323,1.348728,0.326568,1.152174,1.152647
7,0.181496,0.2018,0.223733,0.088265,0.306953,0.326568,0.177749,0.177936
8,0.817708,0.878358,0.876221,0.774383,1.091692,0.326568,1.057189,1.056474
9,0.817708,0.44102,0.45013,0.40411,0.646515,0.326568,0.502615,0.502851


# Pridiction

In [173]:
test_data['ID']= ID.values
ID= test_data[['ID']];
ID['ID']= ID['ID'].astype(int) 

#ID['item_cnt_month']= predicted_test.mean(axis=1).values;
#ID['item_cnt_month']= predicted_test[2].values; #!!!!!
ID['item_cnt_month']= pr2
ID['item_cnt_month']= ID['item_cnt_month'].clip(0,20)
ID.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,ID,item_cnt_month
2781025,0,0.909075
2781026,1,0.598054
2781027,2,1.79096
2781028,3,0.446586
2781029,4,6.004537
2781030,5,0.851972
2781031,6,1.261256
2781032,7,0.226491
2781033,8,1.015073
2781034,9,0.585622


In [174]:
ID.to_csv('SUBMISSION03MAY_1.csv', index=False)