# NN Embedding using Keras Functional API

Recent research explores the idea of predicting outcomes on tabular data using deep neural networks (DNNs) and embeddings. Although this approach is very common for image classification and NLP, tabular data has not seen its fair share of neural-network approaches.

My inspiration to explore this approach came from the following:

- https://arxiv.org/pdf/1703.02596.pdf
- https://arxiv.org/ftp/arxiv/papers/1701/1701.06624.pdf
- https://arxiv.org/pdf/1807.04098.pdf
- https://medium.com/tensorflow/predicting-the-price-of-wine-with-the-keras-functional-api-and-tensorflow-a95d1c2c1b03

In [None]:
#This is my first competition and first kernel in Kaggle.
#Any constructive criticism/ suggestion/ feedback will be appreciated :)
#This was done as part of a Capstone project

In [None]:
import sys
# Record the interpreter version this notebook was run with.
print(sys.version)

In [None]:
#Some of these packages are left over from experimenting with various methods

%time

import numpy as np
import pandas as pd
import seaborn as sbn
import matplotlib
import matplotlib.pyplot as plt
import os
import json
from pandas.io.json import json_normalize
import datetime
from datetime import datetime
from ast import literal_eval
import copy
import pydot
import warnings
import gc

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn import neighbors
from sklearn.exceptions import DataConversionWarning

from keras import models
from keras import optimizers
from keras.models import *
from keras.layers.recurrent import LSTM
from keras.layers import Input, Dense, Activation, Reshape, Concatenate, Flatten, concatenate
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import plot_model

from bokeh.core.properties import value
from bokeh.io import show, output_file, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, LinearInterpolator, CategoricalColorMapper
from bokeh.transform import dodge
from bokeh.resources import INLINE
TOOLS = 'crosshair,save,pan,box_zoom,reset,wheel_zoom'
output_notebook()

from plotly import __version__
import plotly.offline
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode()

print('Package import complete')

In [None]:
#Directory the input CSV files are read from (relative to the notebook's working directory)
DIR = '../input/'

In [None]:
gc.enable()

# Flattened column names kept from the raw data; nested JSON fields use the
# "<json column>.<key>" naming produced when the JSON columns are flattened.
features = [
    # visit identifiers and timing
    'channelGrouping',
    'date',
    'fullVisitorId',
    'visitId',
    'visitNumber',
    'visitStartTime',
    # device
    'device.browser',
    'device.deviceCategory',
    'device.isMobile',
    'device.operatingSystem',
    # geography
    'geoNetwork.city',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.metro',
    'geoNetwork.networkDomain',
    'geoNetwork.region',
    'geoNetwork.subContinent',
    # per-visit totals (includes the revenue target)
    'totals.bounces',
    'totals.hits',
    'totals.newVisits',
    'totals.pageviews',
    'totals.transactionRevenue',
    # traffic source
    'trafficSource.adContent',
    'trafficSource.campaign',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.medium',
    'trafficSource.referralPath',
    'trafficSource.source',
    'customDimensions',
]

def load_df(csv_path):
    """Read a CSV in chunks, flatten its JSON columns and keep only `features`.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row, restricted to the module-level `features`
        columns, with a fresh 0..n-1 index. An empty DataFrame is returned
        when the file yields no chunks.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    chunks = pd.read_csv(csv_path, sep=',',
            converters={column: json.loads for column in JSON_COLUMNS},
            dtype={'fullVisitorId': 'str'}, # Important!! keeps the id as text
            chunksize=100000)

    kept = []
    for df in chunks:
        # json_normalize returns a 0-based index, so the chunk must be
        # re-indexed the same way for the index-on-index merge below.
        df.reset_index(drop=True, inplace=True)
        for column in JSON_COLUMNS:
            column_as_df = json_normalize(df[column])
            column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
            df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

        # A nested key that never occurs within a chunk produces no column at
        # all; reindex fills such columns with NaN instead of raising KeyError.
        kept.append(df.reindex(columns=features))
        del df
        gc.collect()

    if not kept:
        return pd.DataFrame()

    # Concatenate once at the end: re-concatenating the growing result on
    # every iteration copies all previously loaded rows again for each chunk.
    return pd.concat(kept, axis=0, ignore_index=True)

In [None]:
%%time
#Import training data
# load_df reads the file in chunks and flattens the JSON columns.
csvfile=DIR +'train_v2.csv'
df_train=load_df(csvfile)
df_train.shape

In [None]:
%%time
#Import test data
# Same loader as for the training file, so both frames share one column layout.
csvfile=DIR +'test_v2.csv'
df_test=load_df(csvfile)
df_test.shape

In [None]:
#Keep handles on the raw (flattened) frames and print their dimensions
# NOTE: plain assignment does not copy -- train_backup/test_backup refer to
# the very same objects as df_train/df_test until those names are rebound.
train_backup = df_train
test_backup = df_test

print('Training file dimension: ',df_train.shape)
print('Test file dimension: ',df_test.shape)

In [None]:
#Create a function to format datasets
def format_data(df):
    """Clean a flattened visits frame and derive calendar features.

    Steps, in order:
      1. placeholder strings such as '(not set)' become NaN;
      2. a null-free copy of the revenue column is stored as
         'totals_transactionRevenue';
      3. selected numeric columns are filled with 0 and selected categorical
         columns with 'Unknown';
      4. a missing-data table is printed and every column with more than 50%
         missing values is dropped;
      5. Hour/Day/Day_of_week/Month/Week_number/Year are derived from
         'visitStartTime' (epoch seconds), then 'date' and 'visitStartTime'
         are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Flattened frame containing the columns referenced below. It is not
        modified; all work happens on the copy returned by `replace`.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    #Replace redundant words with NaN
    word_replace=['(not set)', '(none)', '(direct)', '(not provided)','not available in demo dataset', 'unknown.unknown']
    df = df.replace(word_replace, np.nan)

    #Replace null values in transactionrevenue column with '0'
    df['totals_transactionRevenue']=df['totals.transactionRevenue'].fillna(0)

    #Replace null value in numerical data with '0' and in categorical data with 'Unknown'
    fillna_num = ['totals.bounces',
              'totals.newVisits',
              'totals.pageviews',
              'trafficSource.isTrueDirect']

    fillna_cat = ['trafficSource.keyword',
            'trafficSource.referralPath',
            'trafficSource.adContent',
            'trafficSource.source',
            'trafficSource.medium',
            'device.operatingSystem',
            'geoNetwork.networkDomain',
            'geoNetwork.subContinent',
            'geoNetwork.country',
            'geoNetwork.continent',
            'device.browser']

    for col in fillna_num:
        df[col] = df[col].fillna(0)

    for col in fillna_cat:
        df[col] = df[col].fillna('Unknown')

    #Print number and percentage of NaN in the dataset
    null_mask = df.isnull()  # computed once and reused for both statistics
    count = null_mask.sum().sort_values(ascending = False)
    percentage = ((null_mask.sum()/null_mask.count())*100).sort_values(ascending = False)
    unique = df.nunique()
    missing_data = pd.concat([count, percentage, unique], axis=1, keys=['Count', 'Percentage', 'Unique'], sort=False)
    print('Table showing percentage of missing data: \n',missing_data, '\n')

    #Delete columns that have more than 50% data missing
    # `columns=` is explicit; passing the axis positionally is not accepted
    # by current pandas releases.
    sparse_columns = missing_data.index[missing_data['Percentage'] > 50]
    df = df.drop(columns=sparse_columns)

    #Extract feature from "visitStartTime" column, and then add back columns to the dataframe
    # The result is already datetime64, so no second parsing pass is needed.
    df['Date_time'] = pd.to_datetime(df['visitStartTime'].astype(int), unit='s')
    df['Hour'] = df['Date_time'].dt.hour
    df['Day'] = df['Date_time'].dt.day
    df['Day_of_week'] = df['Date_time'].dt.dayofweek
    df['Month'] = df['Date_time'].dt.month
    # strftime yields zero-padded strings ('01'..'53'), not integers
    df['Week_number'] = df['Date_time'].dt.strftime('%V')
    df['Year'] = df['Date_time'].dt.year

    df = df.drop(columns=['date', 'visitStartTime'])

    print('Columns with more than 50% of missing data have been removed \n')
    print('Shape of formatted dataframe: ',df.shape, '\n')
    print('Highest count of null values in formatted dataframe: ', df.isnull().sum().max())

    return df

In [None]:
%%time
#Generate clean training data
# NOTE(review): format_data picks the sparse columns to drop from the frame
# it is given, so train and test are filtered independently -- confirm both
# end up with the same columns.
df_train = format_data(train_backup)

In [None]:
%%time
#Generate clean test data
# NOTE(review): the >50%-missing filter is evaluated on the test frame alone
# here -- confirm the surviving columns match the training frame.
df_test = format_data(test_backup)

In [None]:
# Persist the cleaned training frame to the working directory (header row, no index column).
df_train.to_csv('df_train_clean.csv',header = True, index=False)
print('df_train export complete')

In [None]:
# Persist the cleaned test frame to the working directory (header row, no index column).
df_test.to_csv('df_test_clean.csv',header = True, index=False)
print('df_test export complete')

In [None]:
#Count the columns present in both dataframes; the frames only share all
#columns if this count equals each frame's own column count
(df_train.columns.intersection(df_test.columns)).shape

In [None]:
#Preview the first rows of the training data set after formatting
df_train.head()

In [None]:
#Preview the first rows of the test data set after formatting
df_test.head()

In [None]:
#Convert from boolean/object to int, then log-transform the revenue target

col_to_int = ['device.isMobile', 'Week_number', 'totals.bounces','totals.hits', 'totals.newVisits', 'totals.pageviews']

# Train and test receive exactly the same treatment, so handle them in one loop.
for frame in (df_train, df_test):
    frame[col_to_int] = frame[col_to_int].astype(int)

    #Revenue is cast to float64 first so log1p operates on numeric values
    revenue = frame['totals_transactionRevenue'].astype('float64')

    #Log transform revenue values: log(1 + revenue)
    frame['totals_transactionRevenue'] = np.log1p(revenue)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after each report.
df_train.info()
df_test.info()

## Exploratory Data Analysis

In [None]:
#Stack train and test rows into one frame used only for exploratory analysis
df_combined = pd.concat([df_train, df_test], axis=0)

#Flag revenue contributors: 1 when the (log) revenue is positive, 0 otherwise
has_revenue = df_combined['totals_transactionRevenue'] > 0
df_combined['revenue_contributor'] = has_revenue.astype(int).values

In [None]:
#Set style context for Seaborn
# NOTE(review): no named context is passed here -- check the seaborn docs on
# whether font_scale has any effect in that case.
sbn.set_context(font_scale = 2, rc = {"font.size":15, "axes.labelsize":20, "grid.linewidth": 0.1})
sbn.set_style("whitegrid")
# Custom ten-colour palette installed as the default colour cycle below.
mypalette = ["#57606f","#3742fa","#5352ed","#1e90ff","#70a1ff",
             "#2ed573","#ffa502","#ff6348","#ff4757","#ced6e0" ]
# NOTE(review): set_palette is called for its side effect; `color` presumably
# ends up as None -- confirm before using it anywhere.
color = sbn.set_palette(mypalette)

In [None]:
#Analyze geospatial distribution by visit volume and by revenue

#Reference: https://www.kaggle.com/arthurtok/generation-unemployed-interactive-plotly-visuals

# Colour stops shared by both maps; most stops sit in the lowest 10% of the
# normalised value range.
colorscale = [[0, 'rgb(102,194,165)'], [0.005, 'rgb(102,194,165)'],
              [0.01, 'rgb(171,221,164)'], [0.02, 'rgb(230,245,152)'],
              [0.04, 'rgb(255,255,191)'], [0.05, 'rgb(254,224,139)'],
              [0.10, 'rgb(253,174,97)'], [0.25, 'rgb(213,62,79)'], [1.0, 'rgb(158,1,66)']]


def _country_choropleth(locations, values, title):
    """Return a plotly figure dict: a per-country choropleth on an orthographic globe.

    locations -- country names, one per region to colour
    values    -- numbers aligned with `locations`; used for colour and hover text
    title     -- figure title
    """
    trace = dict(
        type = 'choropleth',
        autocolorscale = False,
        colorscale = colorscale,
        showscale = True,
        locations = locations,
        z = values,
        locationmode = 'country names',
        text = values,
        marker = dict(line = dict(color = '#fff', width = 0.25)),
    )
    axis_style = dict(showgrid = False, gridcolor = 'rgb(102, 102, 102)')
    figure_layout = dict(
        height = 500,
        title = title,
        geo = dict(
            showframe = True,
            showocean = True,
            oceancolor = 'rgb(28,107,160)',
            projection = dict(
                type = 'orthographic',
                rotation = dict(lon = 60, lat = 10),
            ),
            # separate dict copies so the two axes never share one object
            lonaxis = dict(axis_style),
            lataxis = dict(axis_style),
        ),
    )
    return dict(data=[trace], layout=figure_layout)


#Map 1: number of visits (rows) per country
tmp = df_combined["geoNetwork.country"].value_counts()
fig = _country_choropleth(tmp.index, tmp.values, 'Number of visits by Country')
iplot(fig)

#Map 2: mean of the log-transformed revenue column per country
tmp = df_combined.groupby("geoNetwork.country").agg({"totals_transactionRevenue" : "mean"}).reset_index()
fig = _country_choropleth(tmp['geoNetwork.country'],
                          tmp['totals_transactionRevenue'],
                          'Average revenue distribution by Country')
iplot(fig)

# Kept as module-level names, as they were before this cell used a helper.
data, layout = fig['data'], fig['layout']

In [None]:
#Explore revenue distribution across months over the years
# The revenue column is summed per (Year, Month); the y-axis label below
# describes it as a sum of log revenue.
columns = ['Year', 'Month', 'totals_transactionRevenue']
group = df_combined[columns].groupby(['Year', 'Month']).sum().reset_index()

print(group)

# One bar per month, coloured by year (three colours supplied).
plot = sbn.catplot(x = 'Month', y = 'totals_transactionRevenue', hue = 'Year', 
                       palette = ["#3498db","#2ecc71","#f1c40f"], data = group, kind = "bar",
                      height = 6, aspect = 2)

plot.set(xlabel='Month', ylabel='Sum of Log Revenue', title = 'Revenue distribution accross months over the years')

# Bare expression so the notebook shows the cell result.
plot

In [None]:
#Top 10 (browser, isMobile) combinations by number of visits
columns3 = ['device.browser','device.isMobile']

# size() counts the rows of each group directly. The previous version asked
# the groupby to aggregate 'device.browser', which is itself a grouping key,
# and that is fragile across pandas versions.
group3 = (df_combined[columns3]
          .groupby(columns3)
          .size()
          .reset_index(name='browser.count'))
group3 = group3.sort_values(by = 'browser.count', ascending = False).head(10)

print('Table showing the top 10 browsers preferred by users:\n')
print(group3)

#Plot top 10 browsers by usage
plot3 = sbn.catplot(x = 'device.browser', y = 'browser.count', hue = 'device.isMobile',
                       palette = ["#3498db","#f1c40f"], data = group3, kind = "bar",
                      height = 6, aspect = 2)

plot3.set(xlabel='Device Browser', ylabel='Count of usage', title = 'Top 10 browsers preferred by users')

print(plot3)


## Neural Network Model with Embeddings

In [None]:
# Columns that are label-encoded and fed to the model as categorical inputs.
# NOTE(review): 'fullVisitorId' is an identifier and 'trafficSource.isTrueDirect'
# a flag -- confirm treating them as categories is intended.
cat_cols = [
    'channelGrouping',
    'fullVisitorId',
    'device.browser',
    'device.deviceCategory',
    'device.operatingSystem',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.networkDomain',
    'geoNetwork.subContinent',
    'trafficSource.adContent',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.medium',
    'trafficSource.referralPath',
    'trafficSource.source',
    'customDimensions',
]

# Columns that are treated as continuous inputs.
cont_cols = [
    'visitId',
    'visitNumber',
    'device.isMobile',
    'totals.bounces',
    'totals.hits',
    'totals.newVisits',
    'totals.pageviews',
    'Hour',
    'Day',
    'Day_of_week',
    'Month',
    'Week_number',
    'Year',
]

# Regression target column.
target = ['totals_transactionRevenue']

print(f'Number of categorical variable columns = {len(cat_cols)}')
print(f'Number of continuous variable columns = {len(cont_cols)}')

In [None]:
#Normalize continuous variables

warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Fit the scaler on the training data only.
min_max_scaler = preprocessing.MinMaxScaler()
df_train_cont_scaled = min_max_scaler.fit_transform(df_train[cont_cols].values)
df_train_cont = pd.DataFrame(df_train_cont_scaled, columns = cont_cols)

# Apply the *same* fitted scaler to the test data. Fitting a second scaler on
# the test set would map its own min/max to 0/1, so identical raw values would
# be encoded differently in train and test. Test values outside the training
# range simply fall outside [0, 1].
df_test_cont_scaled = min_max_scaler.transform(df_test[cont_cols].values)
df_test_cont = pd.DataFrame(df_test_cont_scaled, columns = cont_cols)

In [None]:
#Define function for label encoding

def data_le(df):
    
    les = []
    les_num_classes = []

    print('Number of classes in the dataframe: \n')
    
    for i in range(len(cat_cols)):

        encoder = LabelEncoder()
        encoder.fit(df[cat_cols[i]])
        encoded_column = encoder.transform(df[cat_cols[i]])
        les.append(encoded_column)
        num_classes = np.max(les[i])+1
        les_num_classes.append(num_classes)
    
        print('Column '+str(cat_cols[i])+' number of classes -->'+str(num_classes))
    
    return les, les_num_classes #returns array of label encoded data and number classes for each column of data

In [None]:
df_train_cat, les_num_classes_train = data_le(df_train)

df_train_cat = pd.DataFrame(df_train_cat).astype('int32').transpose() #restore shape of array after label encoding
df_train_cat.columns = cat_cols

les_num_classes_train = pd.DataFrame(les_num_classes_train, columns = ['Num_classes']).astype('int32')

In [None]:
df_test_cat, les_num_classes_test = data_le(df_test)

df_test_cat = pd.DataFrame(df_test_cat).astype('int32').transpose()
df_test_cat.columns = cat_cols
les_num_classes_test = pd.DataFrame(les_num_classes_test, columns = ['Num_classes']).astype('int32')

In [None]:
#Prepare input dimension to be used in the embedding layer
# Put the train and test class counts side by side, one row per categorical column.
num_classes = pd.concat([les_num_classes_train, les_num_classes_test], axis = 1)
# Take the larger count per column and add 1; since the counts come from
# data_le, this leaves one spare embedding row per column.
input_dimension = np.array(num_classes.max(axis = 1)+1)
input_dimension

In [None]:
#Train test split

# 80% of the training rows (rounded down) are used for fitting; the
# remaining rows form the validation set.
train_size = int(len(df_train) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df_test)))
print ("Validation size: %d" % (len(df_train) - train_size))

In [None]:
# Positional split (no shuffling): the first train_size rows are used for
# training, the rest for validation.
X_train_cont = df_train_cont.iloc[:train_size]
X_train_cat = df_train_cat.iloc[:train_size]
y_train = df_train[target][:train_size]

X_val_cont = df_train_cont.iloc[train_size:]
X_val_cat = df_train_cat.iloc[train_size:]
y_val = df_train[target][train_size:]

# The test frames are used whole.
X_test_cont = df_test_cont
X_test_cat = df_test_cat
y_test = df_test[target]

print('Shape of X_train_cont: '+str(X_train_cont.shape))
print('Shape of X_train_cat: '+str(X_train_cat.shape))
print('Shape of y_train: '+str(y_train.shape), '\n')

print('Shape of X_val_cont: '+str(X_val_cont.shape))
print('Shape of X_val_cat: '+str(X_val_cat.shape))
print('Shape of y_val: '+str(y_val.shape), '\n')

print('Shape of X_test_cont: '+str(X_test_cont.shape))
print('Shape of X_test_cat: '+str(X_test_cat.shape))
print('Shape of y_test: '+str(y_test.shape), '\n')

In [None]:
#Create the model

#Define input layer

#Input for the non-embedded branch: one vector holding every scaled continuous column
input_cont = Input(shape=(len(cont_cols),))

#Inputs for the embedding branch: one scalar label-encoded id per categorical column
cat_inputs = [Input(shape=(1,)) for _ in cat_cols]

#Order matters: arrays passed to fit/predict/evaluate must follow this order
deep_inputs = [input_cont] + cat_inputs

#Define embedding layers: each categorical column gets its own 1-dimensional
#embedding sized by that column's entry in input_dimension
embedded = [Embedding(output_dim = 1, input_dim = n_classes, input_length = 1)(cat_input)
            for cat_input, n_classes in zip(cat_inputs, input_dimension)]

#Flatten each embedding output so it can be concatenated with the dense branch
embedded = [Flatten()(embedding) for embedding in embedded]

#Continuous branch is squeezed down to a single value before merging
cont_branch = Dense(500, activation = 'relu')(input_cont)
cont_branch = Dense(1, activation = 'linear')(cont_branch)

#Merge embedded layers and the continuous branch
merged_layer = concatenate([cont_branch] + embedded)

#Define hidden layer
hidden1 = Dense(1000, activation = 'relu')(merged_layer)
hidden1 = Dropout(0.2)(hidden1)
hidden2 = Dense(500, activation = 'relu')(hidden1)

#Define output layer: single linear unit for the regression target
output = Dense(1, activation = 'linear')(hidden2)

#Define model
nnembedding_model = Model(inputs = deep_inputs, outputs = output)

#Compile model
# NOTE: 'accuracy' is kept only so that a second metric is still reported;
# it is not a meaningful score for a regression target.
nnembedding_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])

# Checkpoint
# Keep the weights of the epoch with the lowest validation loss. Selecting by
# validation accuracy made no sense for a regression model trained on MSE.
filepath="weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [None]:
# Print the layer-by-layer summary of the assembled model.
nnembedding_model.summary()

In [None]:
#Train the model

# Model inputs: the continuous block first, then one id column per
# categorical feature, in cat_cols order.
train_inputs = [X_train_cont] + [X_train_cat[col] for col in cat_cols]
val_inputs = [X_val_cont] + [X_val_cat[col] for col in cat_cols]

nnembedding_model.fit(train_inputs,
                      y_train,
                      epochs=10, batch_size=128, verbose = 1, callbacks=callbacks_list,
                      validation_data = (val_inputs, y_val))

In [None]:
# Restore the checkpointed weights before scoring.
nnembedding_model.load_weights("weights.best.hdf5")

df_val = y_val.copy()

# Continuous block first, then one id column per categorical feature.
val_features = [X_val_cont] + [X_val_cat[col] for col in cat_cols]
df_val['pred'] = nnembedding_model.predict(val_features)

# Error columns; the percentage column is infinite/NaN wherever the actual value is 0.
actual = df_val['totals_transactionRevenue']
df_val['diff'] = df_val['pred'] - actual
df_val['abs_diff'] = np.abs(df_val['diff'])
df_val['abs_perc_diff'] = (df_val['abs_diff']/actual)*100
RMSE = ((df_val['diff'] ** 2).mean()) ** .5

print('RMSE for validation set: '+str(RMSE))

In [None]:
# Restore the checkpointed weights before scoring.
nnembedding_model.load_weights("weights.best.hdf5")

df_test_copy = y_test.copy()

# Continuous block first, then one id column per categorical feature.
test_features = [X_test_cont] + [X_test_cat[col] for col in cat_cols]
df_test_copy['pred'] = nnembedding_model.predict(test_features)

# Error columns; the percentage column is infinite/NaN wherever the actual value is 0.
actual = df_test_copy['totals_transactionRevenue']
df_test_copy['diff'] = df_test_copy['pred'] - actual
df_test_copy['abs_diff'] = np.abs(df_test_copy['diff'])
df_test_copy['abs_perc_diff'] = (df_test_copy['abs_diff']/actual)*100
RMSE = ((df_test_copy['diff'] ** 2).mean()) ** .5

print('RMSE for test set: '+str(RMSE))

In [None]:
# Restore the checkpointed weights before evaluating.
nnembedding_model.load_weights("weights.best.hdf5")

cvscores = []

# Continuous block first, then one id column per categorical feature.
eval_features = [X_test_cont] + [X_test_cat[col] for col in cat_cols]
scores = nnembedding_model.evaluate(eval_features, y_test, verbose=1)

# scores[0] is the loss; index 1 is the first compiled metric, shown as a percentage.
metric_name = nnembedding_model.metrics_names[1]
metric_pct = scores[1]*100
print("%s: %.2f%%" % (metric_name, metric_pct))
cvscores.append(metric_pct)
# Only one score is collected, so the reported spread is 0.
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [None]:
#Prepare file for submission
submission = df_test[['fullVisitorId']].copy()

# The model predicts log1p(revenue) for a single visit. Logs cannot be summed
# to get a user's total, so convert back to revenue first. Negative outputs of
# the linear layer are clipped to zero revenue.
visit_revenue = np.expm1(df_test_copy['pred'].astype('float64')).clip(lower=0)
submission['PredictedLogRevenue'] = visit_revenue

# Sum the revenue of all visits of a user, then take log1p of the total.
submission = submission[['fullVisitorId', 'PredictedLogRevenue']].groupby('fullVisitorId').sum().reset_index()
submission['PredictedLogRevenue'] = np.log1p(submission['PredictedLogRevenue'])

submission.to_csv('Shujon_submission.csv',header = True, index=False)
submission.shape

## Random Forest Regressor Model

In [None]:
# The random forest takes a single feature matrix: the label-encoded
# categorical columns side by side with the scaled continuous columns.
df_train_rf = pd.concat([df_train_cat, df_train_cont], axis = 1, sort = False)
df_test_rf = pd.concat([df_test_cat, df_test_cont], axis = 1, sort = False)

In [None]:
# Positional split at train_size: first rows for training, the rest for validation.
X_train_rf = df_train_rf.iloc[:train_size]
y_train_rf = df_train[target][:train_size]

X_val_rf = df_train_rf.iloc[train_size:]
y_val_rf = df_train[target][train_size:]

# X_test_rf refers to the same object as df_test_rf (no copy is made).
X_test_rf = df_test_rf
y_test_rf = df_test[target]


print('Shape of X_train_rf: '+str(X_train_rf.shape))
print('Shape of y_train_rf: '+str(y_train_rf.shape), '\n')

print('Shape of X_val_rf: '+str(X_val_rf.shape))
print('Shape of y_val_rf: '+str(y_val_rf.shape), '\n')

print('Shape of X_test_rf: '+str(X_test_rf.shape))
print('Shape of y_test_rf: '+str(y_test_rf.shape), '\n')

In [None]:
%%time

# Instantiate model with 1000 decision trees
# (fixed random_state for repeatable runs)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42, verbose = 2)

# Train the model on training data
# NOTE(review): y_train_rf is a one-column DataFrame rather than a 1-D array
# -- confirm the estimator handles it as intended.
rf.fit(X_train_rf, y_train_rf)

In [None]:
# Use the forest's predict method on the test data
predictions_rf = rf.predict(X_test_rf)
predictions_rf = pd.DataFrame(predictions_rf, columns = ['Prediction_rf'])

# Calculate the absolute errors
# Work on the raw arrays: subtracting two DataFrames aligns them on column
# labels, and 'Prediction_rf' vs the target column share none, so the
# DataFrame difference would consist of NaN only.
errors_rf = pd.Series(
    np.abs(predictions_rf['Prediction_rf'].values - y_test_rf[target[0]].values),
    name='abs_error')

# Print out the mean absolute error (mae), in units of the log-transformed target
print('Mean Absolute Error:', round(float(errors_rf.mean()), 2))

In [None]:
# Score the random forest on the validation rows.
df_val_rf = y_val_rf.copy()

df_val_rf['pred'] = rf.predict(X_val_rf)

# Error columns; the percentage column is infinite/NaN wherever the actual value is 0.
df_val_rf['diff'] = df_val_rf['pred'] - df_val_rf['totals_transactionRevenue']
df_val_rf['abs_diff'] = np.abs(df_val_rf['diff'])
df_val_rf['abs_perc_diff'] = (df_val_rf['abs_diff']/df_val_rf['totals_transactionRevenue'])*100
# Root of the mean squared difference.
RMSE = ((df_val_rf['diff'] ** 2).mean()) ** .5

print('RMSE for validation set (Random Forest Regressor): '+str(RMSE))

In [None]:
# Score the random forest on the test rows.
# NOTE: this rebinds df_test_rf from the test feature matrix to the results
# frame. X_test_rf still points at the features, but re-running the earlier
# split cell after this one would pick up the wrong frame.
df_test_rf = y_test_rf.copy()

df_test_rf['pred'] = rf.predict(X_test_rf)

# Error columns; the percentage column is infinite/NaN wherever the actual value is 0.
df_test_rf['diff'] = df_test_rf['pred'] - df_test_rf['totals_transactionRevenue']
df_test_rf['abs_diff'] = np.abs(df_test_rf['diff'])
df_test_rf['abs_perc_diff'] = (df_test_rf['abs_diff']/df_test_rf['totals_transactionRevenue'])*100
RMSE = ((df_test_rf['diff'] ** 2).mean()) ** .5

# This is the test-set score; the message previously said "validation set".
print('RMSE for test set (Random Forest Regressor): '+str(RMSE))