In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../utils')
from dataPiping import *

import numpy as np
import pandas as pd
from math import exp, fabs, sqrt, log, pi
from random import random
import datetime

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import Callback, LambdaCallback, TensorBoard, ReduceLROnPlateau, EarlyStopping
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

import seaborn as sns

Using TensorFlow backend.


In [2]:
# Fix the seed of NumPy's global random generator so NumPy-driven randomness
# repeats between runs.
# NOTE(review): this does not seed Python's `random` module or the Keras
# backend — confirm whether those need seeding for reproducible training.
seed = 42
np.random.seed(seed)

In [3]:
# Load the pickled DataFrame used throughout the notebook (later cells read
# its startUserTime, customerId, returnTime and sessionLength columns).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../../data/mergedSessionDF.pkl')

In [4]:
# Start and end dates of the observation window, and the same two bounds
# converted with makeunixtime (presumably provided by dataPiping — confirm).
trainPeriod = ["2015-02-01", "2016-02-01"]
unixTrainPeriod = [makeunixtime(day) for day in trainPeriod]

In [5]:
# Keep only the sessions whose startUserTime lies inside the training window
# (start inclusive, end exclusive), reusing the bounds already stored in
# unixTrainPeriod instead of converting the dates a second time.
obs_df = df[
    (df.startUserTime >= unixTrainPeriod[0])
    & (df.startUserTime < unixTrainPeriod[1])
]

In [6]:
def removeNan(df):
    """Drop every session of any customer that has a missing returnTime.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``customerId`` and ``returnTime`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose customerId never occurs on a row with a null
        returnTime. Index labels and row order are left untouched.
    """
    has_missing = df.returnTime.isnull()
    flagged_customers = df.loc[has_missing, 'customerId'].unique()
    keep = ~df.customerId.isin(flagged_customers)
    return df[keep]

In [7]:
# Drop every customer that has at least one session with a missing returnTime.
# NOTE: this rebinds obs_df, so the unfiltered window is no longer reachable
# under that name after this cell runs.
obs_df = removeNan(obs_df)

In [8]:
def _load_data():
    n_sessions = 31
    active_users = obs_df.groupby('customerId').filter(lambda x: len(x) >= n_sessions)
    act_df = active_users.groupby('customerId').tail(n_sessions)
    #     return active_users

    x_df = act_df.groupby('customerId').head(30)
    y_df = act_df.groupby('customerId').tail(1)

    X_train = np.zeros(len(x_df)).reshape((-1,30))
    y_train = np.zeros(len(y_df))

    for i in range(0, len(X_train)):
        j = 30*i
        X_train[i] = x_df.iloc[j:j+30].returnTime.as_matrix()
        y_train[i] = y_df.iloc[i].returnTime
    
    return X_train.reshape(-1,30,1), y_train.reshape(-1,1)

In [9]:
from keras.models import Sequential  
from keras.layers.core import Dense, Activation  
from keras.layers.recurrent import LSTM

# Single-feature LSTM regressor: one value per time step in, one value out.
in_out_neurons = 1
hidden_neurons = 30

# Variable-length sequences are accepted (time dimension is None); only the
# LSTM's final state is passed on to the linear output layer.
model = Sequential([
    LSTM(hidden_neurons, return_sequences=False,
         input_shape=(None, in_out_neurons)),
    Dense(in_out_neurons, input_dim=hidden_neurons),
    Activation("linear"),
])

# The plain "adam" optimizer string was used previously; the explicit Adam
# instance is kept so the learning rate can be set.
# NOTE(review): the fit cell logs to 'rnn_max_30_lr_2000' while lr here is
# 200 — confirm which value is intended.
model.compile(loss="mean_squared_error", optimizer=Adam(lr=200))

In [10]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 30)                3840      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31        
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 3,871.0
Trainable params: 3,871.0
Non-trainable params: 0.0
_________________________________________________________________


In [11]:
# Build the single-feature training arrays from obs_df:
# X_train is reshaped to (n_samples, 30, 1) and y_train to (n_samples, 1).
X_train, y_train = _load_data()

KeyboardInterrupt: 

In [None]:
# Train the single-feature model, logging to TensorBoard under ../../logs/.
# NOTE(review): the directory name says lr_2000 while `model` is compiled
# with Adam(lr=200) — confirm which is intended.
log_dir = 'rnn_max_30_lr_2000'
model.fit(
    X_train,
    y_train,
    batch_size=10000,
    epochs=5000,
    validation_split=0.2,
    verbose=0,
    callbacks=[
        TensorBoard(log_dir='../../logs/{}'.format(log_dir), histogram_freq=100),
        # EarlyStopping on val_loss (min_delta=0, patience=100) is currently
        # disabled, so all 5000 epochs run.
    ],
)

In [None]:
# Predict on the same X_train that was passed to fit(), so these predictions
# cover both the rows Keras trained on and the rows it held out through
# validation_split.
predicted = model.predict(X_train)  

In [None]:
# Root-mean-squared error between the targets and the predictions; it has the
# same unit as y_train.
rmse = sqrt(mean_squared_error(y_train, predicted))

In [None]:
# Show the RMSE as a human-readable duration, interpreting it as seconds.
str(datetime.timedelta(seconds=rmse))

### Validation error: 46.5 days

In [None]:
# 16639000000000 cannot sensibly be a duration in seconds (it would be roughly
# half a million years). It is presumably a mean-squared-error loss, i.e. in
# seconds**2 — TODO confirm where the value was read from. Take the square
# root to obtain an RMSE in seconds before formatting it as a duration.
loss_rmse_seconds = sqrt(16639000000000)
str(datetime.timedelta(seconds=loss_rmse_seconds))

# Using more features

## Session length

In [12]:
def _load_data_sessionLen():
    n_sessions = 31
    active_users = obs_df.groupby('customerId').filter(lambda x: len(x) >= n_sessions)
    act_df = active_users.groupby('customerId').tail(n_sessions)
    #     return active_users

    x_df = act_df.groupby('customerId').head(30)
    y_df = act_df.groupby('customerId').tail(1)

    X_train = np.zeros(len(x_df)*2).reshape((-1,30,2))
    y_train = np.zeros(len(y_df))

    for i in range(0, len(X_train)):
        j = 30*i
        X_train[i] = x_df[['returnTime','sessionLength']].iloc[j:j+30].as_matrix()
        y_train[i] = y_df.iloc[i].returnTime
    
    return X_train.reshape(-1,30,2), y_train.reshape(-1,1)

In [13]:
def sessionLenModel(in_neurons=2, out_neurons=1, hidden_neurons=30, lr=200):
    """Build and compile an LSTM regressor over per-session feature sequences.

    The defaults reproduce the original hard-coded configuration, so
    ``sessionLenModel()`` behaves as before; the parameters let the same
    builder serve other feature counts or learning rates.

    Parameters
    ----------
    in_neurons : int, optional
        Number of features per time step (default 2).
    out_neurons : int, optional
        Size of the linear output layer (default 1).
    hidden_neurons : int, optional
        Number of LSTM units (default 30).
    lr : float, optional
        Learning rate passed to the Adam optimizer (default 200).

    Returns
    -------
    keras.models.Sequential
        Model compiled with mean-squared-error loss.
    """
    model = Sequential()
    # Time dimension is None so sequences of any length are accepted; only the
    # final LSTM state is forwarded.
    model.add(LSTM(hidden_neurons, return_sequences=False,
                   input_shape=(None, in_neurons)))
    # The input size of a non-first layer is inferred from the previous
    # layer, so no explicit input_dim is needed here.
    model.add(Dense(out_neurons))
    model.add(Activation("linear"))
    model.compile(loss="mean_squared_error", optimizer=Adam(lr=lr))

    return model

In [15]:
# Instantiate the model built by sessionLenModel (2 input features per step).
# NOTE(review): this cell's execution count (15) precedes the next cell's
# (14), so the notebook was last run out of order — verify it survives
# Restart & Run All.
sl_model = sessionLenModel()

In [14]:
# Build the two-feature training arrays:
# X_train is reshaped to (n_samples, 30, 2) and y_train to (n_samples, 1).
# NOTE(review): this rebinds X_train / y_train, which earlier cells pass to
# the single-feature `model`; re-running those cells afterwards would feed
# that model 2-feature input.
X_train, y_train = _load_data_sessionLen()

In [17]:
# Train the session-length model, logging to TensorBoard under ../../logs/.
# NOTE(review): the directory name says lr_2000 while sessionLenModel compiles
# with Adam(lr=200) — confirm which is intended.
log_dir = 'rnn_sessionLen_max_30_lr_2000'
sl_model.fit(
    X_train,
    y_train,
    batch_size=10000,
    epochs=5000,
    validation_split=0.2,
    verbose=0,
    callbacks=[
        TensorBoard(log_dir='../../logs/{}'.format(log_dir), histogram_freq=100),
        # EarlyStopping on val_loss (min_delta=0, patience=100) is currently
        # disabled, so all 5000 epochs run.
    ],
)

INFO:tensorflow:Summary name lstm_2/kernel:0 is illegal; using lstm_2/kernel_0 instead.
INFO:tensorflow:Summary name lstm_2/recurrent_kernel:0 is illegal; using lstm_2/recurrent_kernel_0 instead.
INFO:tensorflow:Summary name lstm_2/bias:0 is illegal; using lstm_2/bias_0 instead.
INFO:tensorflow:Summary name dense_2/kernel:0 is illegal; using dense_2/kernel_0 instead.
INFO:tensorflow:Summary name dense_2/bias:0 is illegal; using dense_2/bias_0 instead.


KeyboardInterrupt: 