Take a layer (in the model 3 case, the concatenated layer of embeddings and real inputs) from the trained neural network and use activations as features for an XGBoost model, either with or (in the model 3 case) without the original features.

In [None]:
# Selects the XGBoost training budget set further down: True picks the
# shorter schedule (fewer rounds, higher learning rate), False the longer one.
ON_KAGGLE = True

In [None]:
# First date held out of training; visits on or after it become the
# validation "actuals" and trailing means are frozen at this date.
VALIDATION_START = '2017-03-12'
# Subtracted from the sample-submission dates to map them onto the
# validation period.
VALIDATION_DELTA = '42 days'
# Validation dates before this are scored as "public", the rest as "private".
PRIVATE_START = '2017-03-19'

In [None]:
# Name of the network layer whose activations are meant to feed XGBoost.
# NOTE(review): confirm the activation-extraction cells actually read this
# constant rather than a hard-coded layer name.
LAYER_TO_USE = 'cat_layer' # embed='cat_layer', middle='hidden1', top='hidden2'
BOTH = True               # Whether to include raw features in XGB along with NN activations
OUTFILE = 'embed_both_val'      # to use for name of output file with out-of-sample predictions

In [None]:
# XGBoost budget as (boosting rounds, learning rate, early-stopping patience).
# The Kaggle setting trades a higher learning rate for fewer rounds.
NROUNDS, LR, EARLY_STOP = (2000, .04, 100) if ON_KAGGLE else (4000, .015, 700)

In [None]:
# XGBoost hyper-parameters; each is passed through unchanged in xgb_params.
SUBSAMPLE = 0.8   # 'subsample'
COLSAMPLE = .75   # 'colsample_bytree'
MAX_DEPTH = 9     # 'max_depth'
ALPHA = .15       # 'alpha'
LAMBDA = .05      # 'lambda'
GAMMA = 1e-3      # 'gamma'

In [None]:
# Window lengths, in days, of the trailing visitor means; each one becomes a
# 'mean<N>' feature column (mean15, mean57, mean36).
LOOKBACK = 15
LOOKBACK1 = 57
LOOKBACK2 = 36
# Days by which the trailing means are shifted forward before being joined
# to a visit date.
LOOKAHEAD = 28

In [None]:
"""
Contributions from:
DSEverything - Mean Mix - Math, Geo, Harmonic (LB 0.493) 
https://www.kaggle.com/dongxu027/mean-mix-math-geo-harmonic-lb-0-493
JdPaletto - Surprised Yet? - Part2 - (LB: 0.503)
https://www.kaggle.com/jdpaletto/surprised-yet-part2-lb-0-503
hklee - weighted mean comparisons, LB 0.497, 1ST
https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st
tunguz - Surprise Me 2!
https://www.kaggle.com/tunguz/surprise-me-2/code

Also all comments for changes, encouragement, and forked scripts rock

Keep the Surprise Going
"""

import glob, re
import numpy as np
import pandas as pd
from sklearn import *
from datetime import datetime
from xgboost import XGBRegressor
import xgboost as xgb

from keras.layers import Embedding, Input, Dense, GaussianDropout
import keras
import keras.backend as K
from keras.optimizers import Adam

import matplotlib.pyplot as plt

In [None]:
# Based on Philippe Rémy's GitHub  https://github.com/philipperemy

def get_activations(model, model_inputs, layer_name=None):
    """Evaluate layer outputs of a Keras model on the given inputs.

    Args:
        model: Keras model whose layers are inspected.
        model_inputs: input data; a sequence with one entry per model input
            when the model has several inputs, otherwise a single array.
        layer_name: name of the layer to evaluate, or None for every layer.

    Returns:
        A list with one activation array per selected layer. The list is
        empty when no layer carries ``layer_name``.
    """
    model_input = model.input
    has_multiple_inputs = isinstance(model_input, list)
    # A single-input model exposes a bare tensor; wrap it so both cases match.
    input_tensors = model_input if has_multiple_inputs else [model_input]

    selected_outputs = [
        layer.output
        for layer in model.layers
        if layer_name is None or layer.name == layer_name
    ]

    # The trailing 0. feeds the Keras learning-phase placeholder appended to
    # the function inputs below (0 selects inference behaviour).
    if has_multiple_inputs:
        feed = list(model_inputs) + [0.]
    else:
        feed = [model_inputs, 0.]

    evaluators = [
        K.function(input_tensors + [K.learning_phase()], [tensor])
        for tensor in selected_outputs
    ]
    return [evaluate(feed)[0] for evaluate in evaluators]

In [None]:
# Raw competition tables, keyed by a short alias.
input_files = [
    ('tra', '../input/air_visit_data.csv'),
    ('as', '../input/air_store_info.csv'),
    ('hs', '../input/hpg_store_info.csv'),
    ('ar', '../input/air_reserve.csv'),
    ('hr', '../input/hpg_reserve.csv'),
    ('id', '../input/store_id_relation.csv'),
    ('tes', '../input/sample_submission.csv'),
    ('hol', '../input/date_info.csv'),
]
data = {alias: pd.read_csv(path) for alias, path in input_files}

# The holiday calendar is joined on the visit date later, so align its key name.
data['hol'] = data['hol'].rename(columns={'calendar_date':'visit_date'})

# Keep only HPG reservations that map to an AIR store id.
data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])


In [None]:
# Create n-day trailing-mean feature(s) of log1p(visitors) per restaurant.

lookbacks = [LOOKBACK, LOOKBACK1, LOOKBACK2]

train0 = data['tra'].copy()
train0['visitors'] = train0.visitors.apply(np.log1p)
train0['visit_datetime'] = pd.to_datetime(train0['visit_date'])

# Make data frame with one entry for each restaurant-by-date combination.
# Frames are collected in a list and concatenated once; concatenating inside
# the loop copies the growing frame on every iteration.
resta = train0.air_store_id.unique()
date_frames = []
for d in pd.date_range(start='2016-01-01',end='2017-05-31'):
    r = pd.DataFrame(resta, columns=['air_store_id'])
    r['agg_date'] = d
    date_frames.append(r)
means_df = pd.concat(date_frames, axis=0)

validation_start = pd.to_datetime(VALIDATION_START)
visits = train0[['air_store_id','visit_datetime','visitors']]

# For each lookback period (one 'mean<N>' column per entry of `lookbacks`)
for n in lookbacks:
    # For each date, calculate each restaurant's trailing average over the
    # window [date - n days, date] (both ends inclusive).
    ndays = str(n)
    offset = pd.Timedelta(ndays+' days')
    name = 'mean' + ndays
    daily_means = []
    for d in pd.date_range(start='2016-01-01',end='2017-05-24'):
        # Dates past the validation start reuse the window ending at the
        # validation start, so no held-out visits enter the mean.
        da = min(d, validation_start)
        in_window = visits.visit_datetime.between(da-offset, da)
        # Average only the visitors column: averaging the whole frame would
        # also average the datetime column on recent pandas versions.
        the_mean = (visits.loc[in_window]
                    .groupby('air_store_id')['visitors'].mean()
                    .rename(name)
                    .reset_index())
        the_mean['agg_date'] = d
        daily_means.append(the_mean)
    mean_df = pd.concat(daily_means, axis=0)
    # Inner join: restaurant/date pairs without any visit in the window drop out.
    means_df = means_df.merge(mean_df, on=['air_store_id','agg_date'])

# Shift the aggregation date forward so that a visit date is paired with the
# means computed LOOKAHEAD days earlier.
lookforward = LOOKAHEAD
ndays = str(lookforward)
offset = pd.Timedelta(ndays+' days')
means_merge = means_df.copy()
means_merge['visit_date'] = ( means_merge.agg_date + offset ).dt.date
means_merge = means_merge.drop(['agg_date'],axis=1)

In [None]:
def _aggregate_reservations(reservations):
    """Return per-store, per-visit-date sums (rs1, rv1) and means (rs2, rv2)
    of the reservation lead time and the number of reserved visitors."""
    grouped = reservations.groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']]
    totals = grouped.sum().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs1', 'reserve_visitors':'rv1'})
    averages = grouped.mean().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs2', 'reserve_visitors':'rv2'})
    return pd.merge(totals, averages, how='inner', on=['air_store_id','visit_date'])


# `df` and `df_test` are deliberately left as notebook-level names.
for df in ['ar','hr']:
    df_test = df + '_test'
    reservations = data[df]

    visit_ts = pd.to_datetime(reservations['visit_datetime'])
    reserve_ts = pd.to_datetime(reservations['reserve_datetime'])

    reservations['visit_dow'] = visit_ts.dt.dayofweek
    reservations['visit_datetime'] = visit_ts.dt.date
    reservations['reserve_datetime'] = reserve_ts.dt.date
    # Whole calendar days between booking and visit, computed column-wise
    # instead of with a row-by-row apply.
    reservations['reserve_datetime_diff'] = (visit_ts.dt.normalize() - reserve_ts.dt.normalize()).dt.days

    # Validation features: only reservations made before the validation start.
    made_before_validation = reserve_ts < pd.to_datetime(VALIDATION_START)
    data[df_test] = _aggregate_reservations(reservations[made_before_validation])

    # Training features: keep only reservations made more than
    # (35 + visit day-of-week) days ahead of the visit.
    far_ahead = reservations['reserve_datetime_diff'] > reservations['visit_dow'] + 35
    data[df] = _aggregate_reservations(reservations[far_ahead])

In [None]:
# Parse the visit dates once, then set aside every visit on or after the
# validation start as ground truth, keyed by '<store id>_<YYYY-MM-DD>'.
data['tra']['visit_date'] = pd.to_datetime(data['tra']['visit_date'])
in_validation = data['tra']['visit_date'] >= pd.to_datetime(VALIDATION_START)
actuals = data['tra'].loc[in_validation].copy()
actuals['id'] = actuals['air_store_id'] + '_' + actuals['visit_date'].dt.strftime('%Y-%m-%d')

actuals = actuals.drop(['air_store_id','visit_date'], axis=1)
actuals.head()

In [None]:
# Keep only visits before the validation start for training. The explicit
# copy makes the column assignments below act on an independent frame
# rather than on a filtered slice (avoids SettingWithCopyWarning).
tra = data['tra'][data['tra']['visit_date'] < pd.to_datetime(VALIDATION_START)].copy()
tra['dow'] = tra['visit_date'].dt.dayofweek
tra['year'] = tra['visit_date'].dt.year
tra['month'] = tra['visit_date'].dt.month
# Store plain dates so the column can be joined against the date-keyed tables.
tra['visit_date'] = tra['visit_date'].dt.date
data['tra'] = tra

In [None]:

# Submission ids look like '<prefix>_<hash>_<date>': the third piece is the
# visit date and the first two pieces form the store id.
tes = data['tes']
id_parts = tes['id'].str.split('_')
# Replace with validation data: shift the submission dates back onto the
# held-out period.
tes['visit_date'] = pd.to_datetime(id_parts.str[2]) - pd.Timedelta(VALIDATION_DELTA)
tes['air_store_id'] = id_parts.str[:2].str.join('_')
is_public = tes['visit_date'] < PRIVATE_START

shifted_dates = pd.to_datetime(tes['visit_date'])
tes['dow'] = shifted_dates.dt.dayofweek
tes['year'] = shifted_dates.dt.year
tes['month'] = shifted_dates.dt.month
tes['visit_date'] = shifted_dates.dt.date
tes['id'] = tes['air_store_id'] + '_' + shifted_dates.dt.strftime('%Y-%m-%d')

# Align the ground truth with the validation rows. Rows without an observed
# visit get weight 0 and a placeholder value so they never affect a score.
actuals = tes[['id']].merge(actuals, on='id', how='left')
weights = 1 - 1*actuals['visitors'].isnull()

actuals = actuals.fillna(-0.99)
actuals.head()


In [None]:
# Split the 0/1 row weights into the public and private validation periods.
public_mask = is_public.values
public_weights = weights * public_mask
private_weights = weights * (1 - public_mask)

In [None]:
# One row per (store, day-of-week) pair for every store in the validation frame.
unique_stores = data['tes']['air_store_id'].unique()
per_dow_frames = [
    pd.DataFrame({'air_store_id': unique_stores, 'dow': [i]*len(unique_stores)})
    for i in range(7)
]
stores = pd.concat(per_dow_frames, axis=0, ignore_index=True)

In [None]:
# Visitor statistics per (store, day-of-week), added in a fixed column order.
store_dow_keys = ['air_store_id','dow']
store_stats = [('min', 'min_visitors'), ('mean', 'mean_visitors'),
               ('median', 'median_visitors'), ('max', 'max_visitors'),
               ('count', 'count_observations')]
for stat, feature in store_stats:
    grouped = data['tra'].groupby(store_dow_keys, as_index=False)['visitors']
    tmp = getattr(grouped, stat)().rename(columns={'visitors':feature})
    stores = pd.merge(stores, tmp, how='left', on=store_dow_keys)

# genre-by-area features: the same statistics over all stores sharing an
# area and genre, mapped back onto each store.
genareas = data['as'][['air_store_id','air_area_name','air_genre_name']]
tragear = data['tra'].merge( genareas, on='air_store_id' )
gear_keys = ['air_area_name','air_genre_name']
gear_stats = [('min', 'min_gear_visitors'), ('mean', 'mean_gear_visitors'),
              ('median', 'median_gear_visitors'), ('max', 'max_gear_visitors'),
              ('count', 'count_gear_observations')]
for stat, feature in gear_stats:
    grouped = tragear.groupby(gear_keys + ['dow'], as_index=False)['visitors']
    tmp = getattr(grouped, stat)().rename(columns={'visitors':feature})
    tmp = genareas.merge( tmp, on=gear_keys ).drop(gear_keys, axis=1)
    stores = pd.merge(stores, tmp, how='left', on=store_dow_keys)

# Finally attach the static store attributes.
stores = pd.merge(stores, data['as'], how='left', on=['air_store_id'])

In [None]:
# NEW FEATURES FROM Georgii Vyshnia
# Split genre and area names into words and label-encode each word position.
stores['air_genre_name'] = stores['air_genre_name'].map(lambda x: str(x).replace('/',' '))
stores['air_area_name'] = stores['air_area_name'].map(lambda x: str(x).replace('-',' '))
lbl = preprocessing.LabelEncoder()
for i in range(10):
    # i-th space-separated word of the name, or '' when the name is shorter.
    stores['air_genre_name'+str(i)] = lbl.fit_transform(stores['air_genre_name'].map(lambda x: str(x).split(' ')[i] if len(str(x).split(' '))>i else ''))
    stores['air_area_name'+str(i)] = lbl.fit_transform(stores['air_area_name'].map(lambda x: str(x).split(' ')[i] if len(str(x).split(' '))>i else ''))
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date'])
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])
data['hol']['visit_date'] = data['hol']['visit_date'].dt.date
train = pd.merge(data['tra'], data['hol'], how='left', on=['visit_date'])
test = pd.merge(data['tes'], data['hol'], how='left', on=['visit_date'])

train = pd.merge(train, stores, how='inner', on=['air_store_id','dow'])
test = pd.merge(test, stores, how='left', on=['air_store_id','dow'])

# Join the AIR ('ar') and HPG ('hr') reservation aggregates; pandas suffixes
# the clashing columns with _x (AIR) and _y (HPG). The validation-side key is
# derived inside the loop: relying on a `df_test` left over from an earlier
# cell would merge the HPG table twice and never the AIR one.
for df in ['ar','hr']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','visit_date'])
    test = pd.merge(test, data[df + '_test'], how='left', on=['air_store_id','visit_date'])

train['id'] = train['air_store_id'].astype(str) + '_' + train['visit_date'].astype(str)

train['total_reserv_sum'] = train['rv1_x'] + train['rv1_y']
train['total_reserv_mean'] = (train['rv2_x'] + train['rv2_y']) / 2
train['total_reserv_dt_diff_mean'] = (train['rs2_x'] + train['rs2_y']) / 2

test['total_reserv_sum'] = test['rv1_x'] + test['rv1_y']
test['total_reserv_mean'] = (test['rv2_x'] + test['rv2_y']) / 2
test['total_reserv_dt_diff_mean'] = (test['rs2_x'] + test['rs2_y']) / 2

# NEW FEATURES FROM JMBULL
train['date_int'] = train['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
test['date_int'] = test['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
# NOTE(review): each frame subtracts from its own maximum, so train and test
# share a scale only if both contain the same extreme store - confirm intended.
train['var_max_lat'] = train['latitude'].max() - train['latitude']
train['var_max_long'] = train['longitude'].max() - train['longitude']
test['var_max_lat'] = test['latitude'].max() - test['latitude']
test['var_max_long'] = test['longitude'].max() - test['longitude']

# NEW FEATURES FROM Georgii Vyshnia
train['lon_plus_lat'] = train['longitude'] + train['latitude']
test['lon_plus_lat'] = test['longitude'] + test['latitude']

# Integer store code; transform() raises if the validation frame contains a
# store that never appears in training.
lbl = preprocessing.LabelEncoder()
train['air_store_id2'] = lbl.fit_transform(train['air_store_id'])
test['air_store_id2'] = lbl.transform(test['air_store_id'])

In [None]:
# N-DAY TRAILING MEANS
merge_keys = ['air_store_id','visit_date']
train = train.merge(means_merge, on=merge_keys, how='left')
test = test.merge(means_merge, on=merge_keys, how='left')


def _add_gold(frame, week_before, week_of, week_after):
    """Add the Golden Week flag in place: 2 for the week before, 3 for the
    week itself, 1 for the week after and 0 otherwise. Each week is given as
    an inclusive (first day, last day) pair."""
    dates = pd.to_datetime(frame.visit_date)
    frame['gold'] = 0
    for (first_day, last_day), code in ((week_before, 2), (week_of, 3), (week_after, 1)):
        in_week = (dates >= pd.to_datetime(first_day)) & (dates <= pd.to_datetime(last_day))
        frame.loc[in_week, 'gold'] = code


# Golden week feature
_add_gold(train, ('2016-04-24', '2016-04-30'), ('2016-05-01', '2016-05-07'),
          ('2016-05-08', '2016-05-14'))
_add_gold(test, ('2017-04-23', '2017-04-29'), ('2017-04-30', '2017-05-06'),
          ('2017-05-07', '2017-05-13'))

# Model features: every training column except identifiers and the target.
excluded = ['id', 'air_store_id', 'visit_date','visitors']
col = [c for c in train if c not in excluded]
# Missing values are encoded as -1.
train = train.fillna(-1)
test = test.fillna(-1)

In [None]:
def RMSLE(y, pred, sample_weight=1):
    """Return the (optionally weighted) root mean squared error of ``pred``.

    The callers pass log1p-transformed targets and predictions, which is what
    makes the result a root mean squared *logarithmic* error.

    Args:
        y: true values.
        pred: predicted values.
        sample_weight: per-sample weights. A size-1 value (the default ``1``)
            means "unweighted".
    """
    if np.size(sample_weight)==1:
        return metrics.mean_squared_error(y, pred)**0.5
    # sample_weight must be passed by keyword: it is keyword-only in current
    # scikit-learn, and the keyword form also works on older releases.
    return metrics.mean_squared_error(y, pred, sample_weight=sample_weight)**0.5

In [None]:
# Real-valued inputs of the network (min-max scaled below).
value_col = ['holiday_flg','min_visitors','mean_visitors','median_visitors','max_visitors',
'count_observations', 'min_gear_visitors',  'mean_gear_visitors', 
'median_gear_visitors', 'max_gear_visitors', 'count_gear_observations', 'mean15', 'mean57', 'mean36',
'rs1_x','rv1_x','rs2_x','rv2_x','rs1_y','rv1_y','rs2_y','rv2_y','total_reserv_sum','total_reserv_mean',
'total_reserv_dt_diff_mean','date_int','var_max_lat','var_max_long','lon_plus_lat']

# All network inputs. The order must match the model's input list, because
# the columns are fed positionally.
nn_col = value_col + ['dow', 'year', 'month', 'gold', 'air_store_id2', 'air_area_name', 'air_genre_name',
'air_area_name0', 'air_area_name1', 'air_area_name2', 'air_area_name3', 'air_area_name4',
'air_area_name5', 'air_area_name6', 'air_genre_name0', 'air_genre_name1',
'air_genre_name2', 'air_genre_name3', 'air_genre_name4']


X = train.copy()
X_test = test[nn_col].copy()

# Scale each real-valued column to the training range; the validation column
# reuses the scaling fitted on the training column.
value_scaler = preprocessing.MinMaxScaler()
for vcol in value_col:
    X[vcol] = value_scaler.fit_transform(X[vcol].values.astype(np.float64).reshape(-1, 1))
    X_test[vcol] = value_scaler.transform(X_test[vcol].values.astype(np.float64).reshape(-1, 1))

# One array per input column. `.values` replaces DataFrame.as_matrix(),
# which was removed in pandas 1.0, and returns the same array.
X_train = list(X[nn_col].T.values)
Y_train = np.log1p(X['visitors']).values
nn_train = [X_train, Y_train]
nn_test = [list(X_test[nn_col].T.values)]
print("Train and test data prepared")

In [None]:
def get_nn_complete_model(train, hidden1_neurons=35, hidden2_neurons=15):
    """Build and compile the embedding-based regression network.

    Input:
        train:           train dataframe (used to size the embedding layers)
        hidden1_neurons: number of neurons in the first hidden layer
        hidden2_neurons: number of neurons in the second hidden layer
    Output:
        return 'keras neural network model'

    The model's inputs are ordered as: the 29 real-valued features, then
    dow, year, month, gold, air_store_id, air_area_code, air_genre_code,
    then the seven area-word inputs and the five genre-word inputs.
    """
    K.clear_session()

    def _embedded_input(name, vocab_size, width):
        """Integer input `name` plus its flattened embedding of size `width`."""
        tensor = Input(shape=(1,), dtype='int32', name=name)
        emb = Embedding(vocab_size, width, input_shape=(1,), name=name + '_emb')(tensor)
        emb = keras.layers.Flatten(name=name + '_emb_flatten')(emb)
        return tensor, emb

    air_store_id, air_store_id_emb = _embedded_input(
        'air_store_id', len(train['air_store_id2'].unique()) + 1, 15)
    dow, dow_emb = _embedded_input('dow', 8, 3)
    # NOTE(review): `gold` stays a model input so the input list lines up with
    # the caller's columns, but its embedding is not connected to the output.
    # An earlier date block that included it was overwritten before use, and
    # that dead, duplicate-named layer has been removed. Wire `gold_emb` into
    # the date block below if the feature is wanted.
    gold, gold_emb = _embedded_input('gold', 4, 3)
    month, month_emb = _embedded_input('month', 13, 3)

    # One small embedding per word position of the area name (7 positions)
    # and of the genre name (5 positions).
    air_area_name, air_genre_name = [], []
    air_area_name_emb, air_genre_name_emb = [], []
    for i in range(7):
        area_name_col = 'air_area_name' + str(i)
        tensor, emb = _embedded_input(area_name_col, len(train[area_name_col].unique()), 3)
        air_area_name.append(tensor)
        air_area_name_emb.append(emb)

        if i > 4:
            continue
        area_genre_col = 'air_genre_name' + str(i)
        tensor, emb = _embedded_input(area_genre_col, len(train[area_genre_col].unique()), 3)
        air_genre_name.append(tensor)
        air_genre_name_emb.append(emb)

    air_genre_name_emb = keras.layers.concatenate(air_genre_name_emb)
    air_genre_name_emb = Dense(4, activation='sigmoid', name='final_air_genre_emb')(air_genre_name_emb)

    air_area_name_emb = keras.layers.concatenate(air_area_name_emb)
    air_area_name_emb = Dense(4, activation='sigmoid', name='final_air_area_emb')(air_area_name_emb)

    air_area_code, air_area_code_emb = _embedded_input(
        'air_area_code', len(train['air_area_name'].unique()), 8)
    air_genre_code, air_genre_code_emb = _embedded_input(
        'air_genre_code', len(train['air_genre_name'].unique()), 5)

    # Real-valued inputs, in the order they are fed and concatenated.
    value_names = (
        'holiday_flg', 'min_visitors', 'mean_visitors', 'median_visitors', 'max_visitors',
        'count_observations', 'min_gear_visitors', 'mean_gear_visitors',
        'median_gear_visitors', 'max_gear_visitors', 'count_gear_observations',
        'mean15', 'mean57', 'mean36',
        'rs1_x', 'rv1_x', 'rs2_x', 'rv2_x', 'rs1_y', 'rv1_y', 'rs2_y', 'rv2_y',
        'total_reserv_sum', 'total_reserv_mean', 'total_reserv_dt_diff_mean',
        'date_int', 'var_max_lat', 'var_max_long', 'lon_plus_lat',
    )
    value_inputs = [Input(shape=(1,), dtype='float32', name=value_name)
                    for value_name in value_names]
    holiday_flg = value_inputs[0]
    year = Input(shape=(1,), dtype='float32', name='year')

    # Calendar block: day-of-week and month embeddings plus year and holiday flag.
    date_emb = keras.layers.concatenate([dow_emb, month_emb, year, holiday_flg])
    date_emb = Dense(5, activation='sigmoid', name='date_merged_emb')(date_emb)

    cat_layer = keras.layers.concatenate(
        value_inputs + [date_emb, air_area_name_emb, air_genre_name_emb,
                        air_area_code_emb, air_genre_code_emb, air_store_id_emb],
        name='cat_layer')

    drop_layer = keras.layers.GaussianDropout(.3)(cat_layer)
    m = Dense(hidden1_neurons, name='hidden1',
             kernel_initializer=keras.initializers.RandomNormal(mean=0.0,
                            stddev=0.05, seed=None),
             kernel_regularizer=keras.regularizers.l2(1e-5))(drop_layer)
    m = keras.layers.PReLU()(m)
    m = keras.layers.BatchNormalization()(m)
    m = keras.layers.GaussianDropout(.25)(m)

    m1 = Dense(hidden2_neurons, name='hidden2',
               kernel_regularizer=keras.regularizers.l2(1e-5))(m)
    m1 = keras.layers.PReLU()(m1)
    m1 = keras.layers.GaussianDropout(.15)(m1)
    m = Dense(1, activation='relu',
              kernel_regularizer=keras.regularizers.l2(1e-5))(m1)

    inp_ten = value_inputs + [dow, year, month, gold, air_store_id, air_area_code, air_genre_code]
    inp_ten += air_area_name
    inp_ten += air_genre_name
    model = keras.Model(inp_ten, m)
    # NOTE(review): 'acc' is reported alongside an MSE regression loss;
    # confirm it is wanted before relying on it.
    model.compile(loss='mse', optimizer=Adam(lr=5e-3, decay=2e-3), metrics=['acc'])

    return model

In [None]:
nn_model = get_nn_complete_model(train, hidden1_neurons=32, hidden2_neurons=11)
X_val = nn_test[0]
y_val = np.log1p(actuals['visitors']).values

# Three training stages with a growing batch size; the validation rows are
# weighted so that rows without ground truth do not count.
for n_epochs, batch in ((10, 256), (8, 512), (8, 1024)):
    nn_model.fit(nn_train[0], nn_train[1], epochs=n_epochs, verbose=1,
                 batch_size=batch, shuffle=True,
                 validation_data=(X_val, y_val, weights))

print("NN model trained")

# In-sample fit; predictions are clipped to [0, 6.8] on the log1p scale.
train_raw = nn_model.predict(nn_train[0]).reshape(-1)
nn_preds = pd.Series(train_raw).clip(0, 6.8).values

print('\nTraining:')
print('RMSE NeuralNetwork: ', RMSLE(np.log1p(train['visitors'].values), nn_preds))

# Out-of-sample fit on the validation rows, clipped the same way.
valid_raw = nn_model.predict(nn_test[0]).reshape(-1)
nn_preds = pd.Series(valid_raw).clip(0, 6.8).values

print('\nFull Validation:')
validation_score = RMSLE(np.log1p(actuals['visitors'].values), nn_preds,
                         sample_weight=weights.values)
print('RMSE NeuralNetwork: ', validation_score)

In [None]:
# Activations of the configured layer for the training rows. Each input
# column is reshaped to (n_rows, 1) before being fed to the network.
# The layer comes from LAYER_TO_USE so the setting at the top of the
# notebook takes effect; previously the name was hard-coded.
inp = [arr.reshape(-1, 1) for arr in nn_train[0]]
train_act = pd.DataFrame( get_activations(nn_model, inp, layer_name=LAYER_TO_USE)[0] )
train_act.to_csv('train_activations.csv', index=False)
print(train_act.shape)
train_act.head()

In [None]:
# Activations of the configured layer for the validation rows. Each input
# column is reshaped to (n_rows, 1) before being fed to the network.
# The layer comes from LAYER_TO_USE so the setting at the top of the
# notebook takes effect; previously the name was hard-coded.
inp = [arr.reshape(-1, 1) for arr in nn_test[0]]
valid_act = pd.DataFrame( get_activations(nn_model, inp, layer_name=LAYER_TO_USE)[0] )
valid_act.to_csv('valid_activations.csv', index=False)
print(valid_act.shape)
valid_act.head()

In [None]:
# XGBoost target: log1p of the training visitor counts.
y_train = np.log1p(train['visitors'].values)
# Mean target, used as the booster's starting prediction ('base_score').
y_mean = np.mean(y_train)

# Booster configuration assembled from the constants defined at the top.
xgb_params = {
    'eta': LR,
    'max_depth': MAX_DEPTH, 
    'subsample': SUBSAMPLE,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'lambda': LAMBDA,
    'alpha': ALPHA,
    'colsample_bytree': COLSAMPLE,
    'base_score': y_mean,
    'gamma': GAMMA,
    'seed': 3,
    'silent': 1
}


In [None]:
# Validation target: only rows that have an observed visitor count.
y_valid = np.log1p(actuals['visitors'][weights>0].values)

# Variant 1: XGBoost on the network activations alone.
if not BOTH:
    has_actual = weights > 0
    X_train = train_act
    X_valid = valid_act[has_actual]
    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
    evals = [(dtrain,'train'),(dvalid,'eval')]
    model = xgb.train(xgb_params, dtrain,
                      num_boost_round=NROUNDS,
                      evals=evals,
                      early_stopping_rounds=EARLY_STOP,
                      verbose_eval=20)

    # Best tree count, rounded to the nearest hundred.
    ntrees = round(model.best_ntree_limit,-2)
    print( ntrees )
    preds = model.predict(dtrain, ntree_limit=ntrees)
    print('\nActivations only:')
    train_score = RMSLE(np.log1p(train['visitors'].values), preds)
    print('Train RMSE XGBRegressor: ', train_score)
    # Score every validation row later, including those without ground truth.
    X_test = valid_act

In [None]:
# Variant 2: XGBoost on the raw features side by side with the activations.
if BOTH:
    has_actual = weights > 0
    train_both = pd.concat([train[col],train_act],axis=1)
    valid_both = pd.concat([test[col],valid_act],axis=1)
    X_train = train_both
    X_valid = valid_both[has_actual]
    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
    evals = [(dtrain,'train'),(dvalid,'eval')]
    model = xgb.train(xgb_params, dtrain,
                      num_boost_round=NROUNDS,
                      evals=evals,
                      early_stopping_rounds=EARLY_STOP,
                      verbose_eval=20)
    # Best tree count, rounded to the nearest hundred.
    ntrees = round(model.best_ntree_limit,-2)
    print( ntrees )
    preds = model.predict(dtrain, ntree_limit=ntrees)
    print('\nRaw data plus activations:')
    train_score = RMSLE(np.log1p(train['visitors'].values), preds)
    print('Train RMSE XGBRegressor: ', train_score)
    # Score every validation row later, including those without ground truth.
    X_test = valid_both

In [None]:
dtest = xgb.DMatrix(X_test)
preds = model.predict(dtest, ntree_limit=ntrees)

# Weighted error over the full validation period and over its public and
# private parts.
log_actuals = np.log1p(actuals['visitors'].values)
reports = (
    ('\nFull Validation:', weights.values),
    ('\nPublic Validation:', public_weights),
    ('\nPrivate Validation:', private_weights),
)
for heading, row_weights in reports:
    print(heading)
    print('RMSE XGBRegressor: ', RMSLE(log_actuals, preds, sample_weight=row_weights))

# Back-transform from the log1p scale to visitor counts, floored at zero.
test['visitors'] = preds
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test.loc[:, ['id','visitors']].copy()

In [None]:
# Predictions next to the actual values (suffixes _pred / _actual), saved as
# the out-of-sample prediction file.
predicted = sub1[['id', 'visitors']]
sub_val = pd.merge(predicted, actuals, on='id', suffixes=['_pred', '_actual'])
sub_val.to_csv(OUTFILE+'.csv', index=False)

In [None]:
# Preview the first rows of the saved prediction/actual table.
sub_val.head()

In [None]:
# Sanity check: recompute the private-period error from the back-transformed
# predictions that were written out.
log_actual_visitors = np.log1p(actuals['visitors'].values)
log_predicted_visitors = np.log1p(sub1['visitors'].values)
print('Private RMSE: ', RMSLE(log_actual_visitors, log_predicted_visitors,
                              sample_weight=private_weights))