In [1]:
import gc
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from livelossplot import PlotLossesKeras
from scipy.stats import norm, probplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, CuDNNLSTM, Dropout, Activation, Bidirectional, TimeDistributed

Using TensorFlow backend.


In [2]:
# Relative paths of the input CSV files. Only IN_TRAIN is read by the cells
# visible in this notebook (presumably IN_TEST is for a later submission step).
IN_TRAIN = 'in/train-wrangled.csv'
IN_TEST = 'in/test-wrangled.csv'

# Name of the target column; it is read back by name when true revenue is summed per visitor.
LABEL = 'totals.transactionRevenue'

In [3]:
# Load the training frame. fullVisitorId is forced to str so the long numeric
# ids are kept verbatim (presumably to avoid precision loss - TODO confirm).
# low_memory=False makes pandas parse the file in one pass instead of chunks.
df = pd.read_csv(IN_TRAIN, dtype={'fullVisitorId': 'str'}, low_memory=False)
df.head()

Unnamed: 0,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.isMobile,totals.bounces,totals.hits,totals.newVisits,...,geoNetwork.subContinent Western Asia,geoNetwork.subContinent Western Europe,trafficSource.source (direct),trafficSource.source Other,trafficSource.source Partners,trafficSource.source analytics.google.com,trafficSource.source google,trafficSource.source mall.googleplex.com,trafficSource.source youtube.com,totals.transactionRevenue
0,0.0101,1131660440785968503,1131660440785968503_1472830385,0.088405,0.0,0.088405,0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,
1,0.0101,377306020877927890,377306020877927890_1472880147,0.089979,0.0,0.089979,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
2,0.0101,3895546263509774583,3895546263509774583_1472865386,0.089512,0.0,0.089512,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
3,0.0101,4763447161404445595,4763447161404445595_1472881213,0.090012,0.0,0.090012,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
4,0.0101,27294437909732085,27294437909732085_1472822600,0.088159,0.002538,0.088159,0,0.0,0.0,,...,0,0,0,0,0,0,1,0,0,


In [4]:
# NOTE(review): this counts every column of df (id columns and label included).
# The name is reassigned from a sample's shape in a later cell before the model
# builders read it, so this value looks unused - confirm before relying on it.
num_features = df.shape[1]

def gen_Xy(data):
    """Split one visitor's 2-D value matrix into features and targets.

    NaNs are replaced via ``np.nan_to_num`` first.  The last column is treated
    as the label and every other column as a feature.

    Returns a 3-tuple ``(X, y_log, is_positive)``:
      * ``X`` - all columns but the last, one row per input row;
      * ``y_log`` - ``log1p`` of the summed label column;
      * ``is_positive`` - 1 when ``y_log`` is greater than 0, otherwise 0.
    """
    cleaned = np.nan_to_num(data)
    label_col = cleaned.shape[1] - 1
    features = cleaned[:, :label_col]
    log_total = np.log1p(cleaned[:, label_col].sum())
    has_revenue = int(log_total > 0)
    return (features, log_total, has_revenue)

# Order the rows by date and visit start time, then bucket them per visitor.
# sort=False keeps the groups in order of first appearance.
time_sorted_df = df.sort_values(['date', 'visitStartTime'])
visitor_grouped_df = time_sorted_df.groupby('fullVisitorId', axis=0, sort=False)

# One (X, log-revenue, class) sample per visitor; the two id columns are
# dropped so they are not fed to the model as features.
samples = [
    gen_Xy(visitor_group.drop(['fullVisitorId', 'sessionId'], axis=1).values)
    for _, visitor_group in visitor_grouped_df
]

samples[0]

(array([[4.00000000e-04, 1.22194783e-02, 0.00000000e+00, 1.22194783e-02,
         0.00000000e+00, 0.00000000e+00, 2.80561122e-02, 0.00000000e+00,
         2.56410256e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000

In [5]:
# Unshuffled split: the trailing 20% of the samples list becomes the test set.
train_set, test_set = train_test_split(
    samples, test_size=.2, random_state=1, shuffle=False
)

# Order each split by number of timesteps so sequences of equal length sit
# next to each other when the batch generators walk the list.
train_set.sort(key=lambda sample: sample[0].shape[0])
test_set.sort(key=lambda sample: sample[0].shape[0])

# Feature count = width of a sample's X matrix (label column already removed).
num_features = samples[0][0].shape[1]
num_features

157

In [34]:
def calc_num_batches(data, batch_size):
    """Count the batches one pass over ``data`` produces.

    Batches are counted by hand because sequences have variable length and a
    batch may only contain sequences of one length.  A new batch is counted
    whenever a sample is longer than every sample seen so far, or whenever the
    current run has filled ``batch_size`` slots.  ``data`` is expected to be
    ordered by ascending sequence length (``sample[0].shape[0]``).
    """
    longest_seen = 1
    filled_slots = 0
    total_batches = 0
    for sample in data:
        timesteps = sample[0].shape[0]
        if timesteps > longest_seen:
            # Longer sequence: the running batch is closed and a new one opens.
            longest_seen = timesteps
            filled_slots = 0
        if filled_slots % batch_size == 0:
            total_batches += 1
        filled_slots += 1
    return total_batches
    
def batch_iter(data, batch_size):
    """Build an endless batch generator for the revenue regressor.

    Only samples whose log-revenue target (``sample[1]``) is greater than 0
    are used.  Each batch holds at most ``batch_size`` consecutive samples that
    share the same number of timesteps, so ``data`` should already be sorted
    by sequence length (``sample[0].shape[0]``) for the step count to match.

    Args:
        data: sequence of ``(X, y_log, y_class)`` tuples where ``X`` is a 2-D
            array of shape ``(timesteps, features)``.
        batch_size: maximum number of samples per batch; must be >= 1.

    Returns:
        ``(generator, num_batches_per_epoch)``.  The generator yields
        ``(X_batch, y_batch)`` with shapes ``(n, timesteps, features)`` and
        ``(n, 1)`` and restarts from the first sample after every full pass.
        ``num_batches_per_epoch`` counts batches over the positive-target
        samples only, i.e. exactly what one pass of the generator yields.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1, or - on the first
            ``next()`` - if no sample has a positive target.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Filter once up front: skipping inside the batching loop previously
    # stalled on the first non-positive target and dropped samples.
    positive_samples = [sample for sample in data if sample[1] > 0]
    num_batches_per_epoch = calc_num_batches(positive_samples, batch_size)

    def data_generator():
        data_size = len(positive_samples)
        if data_size == 0:
            # Without this guard the epoch loop would spin forever without yielding.
            raise ValueError('batch_iter needs at least one sample with a positive target')
        while True:  # one iteration per epoch
            start_index = 0
            while start_index < data_size:
                # The first sample fixes the sequence length of the whole batch.
                batch_length = positive_samples[start_index][0].shape[0]
                limit = min(start_index + batch_size, data_size)
                end_index = start_index
                while (end_index < limit
                       and positive_samples[end_index][0].shape[0] == batch_length):
                    end_index += 1
                batch = positive_samples[start_index:end_index]
                # (n, timesteps, features) and (n, 1), the layout the model expects
                X_ndarr = np.stack([sample[0] for sample in batch], axis=0)
                y_ndarr = np.asarray([sample[1] for sample in batch]).reshape(-1, 1)
                yield X_ndarr, y_ndarr
                # Resume exactly where this batch stopped so nothing is skipped.
                start_index = end_index

    return data_generator(), num_batches_per_epoch

In [35]:
def batch_iter_class(data, batch_size):
    """Build an endless batch generator for the revenue / no-revenue classifier.

    Each batch holds at most ``batch_size`` consecutive samples that share the
    same number of timesteps, so ``data`` should already be sorted by sequence
    length (``sample[0].shape[0]``) for the step count to match.

    Args:
        data: sequence of ``(X, y_log, y_class)`` tuples where ``X`` is a 2-D
            array of shape ``(timesteps, features)``; ``y_class`` (index 2) is
            used as the label.
        batch_size: maximum number of samples per batch; must be >= 1.

    Returns:
        ``(generator, num_batches_per_epoch)``.  The generator yields
        ``(X_batch, y_batch)`` with shapes ``(n, timesteps, features)`` and
        ``(n, 1)`` and restarts from the first sample after every full pass.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1, or - on the first
            ``next()`` - if ``data`` is empty.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    num_batches_per_epoch = calc_num_batches(data, batch_size)

    def data_generator():
        data_size = len(data)
        if data_size == 0:
            raise ValueError('batch_iter_class needs at least one sample')
        while True:  # one iteration per epoch
            index = 0
            while index < data_size:
                # The first sample fixes the sequence length of the whole batch.
                # Comparing against a running maximum instead produced an empty
                # batch whenever a batch began on a longer sequence.
                batch_length = data[index][0].shape[0]
                limit = min(index + batch_size, data_size)
                end_index = index
                while end_index < limit and data[end_index][0].shape[0] == batch_length:
                    end_index += 1
                batch = data[index:end_index]
                # (n, timesteps, features) and (n, 1), the layout the model expects
                X_ndarr = np.stack([sample[0] for sample in batch], axis=0)
                y_ndarr = np.asarray([sample[2] for sample in batch]).reshape(-1, 1)
                yield X_ndarr, y_ndarr
                index = end_index

    return data_generator(), num_batches_per_epoch

In [None]:
batch_size = 128
train_batches, train_steps = batch_iter(train_set, batch_size)
train_batches_c, train_steps_c = batch_iter_class(train_set, batch_size)

# Sanity check of the regression generator: print the shape of every batch in
# one epoch. The second dimension (timesteps) should never decrease.
# The loop is bounded by train_steps, the step count that belongs to
# train_batches; zipping with range() also stops cleanly when that count is 0
# without pulling an extra batch from the endless generator.
for i, train_batch in zip(range(train_steps), train_batches):
    print(i, train_batch[0].shape)

In [None]:
def build_model_class(neurons=128, activ_func='relu', dropout=.3, loss='mean_squared_error', optimizer='adam'):
    """Assemble and compile the two-class sequence classifier.

    Architecture: two stacked CuDNNLSTM layers (the second returns only its
    last output), followed by Dense(neurons) and Dense(24) hidden layers and a
    2-unit softmax head, with Dropout after every layer except the head.
    The input shape is ``(None, num_features)``; ``num_features`` is read from
    the notebook's globals when the function is called.

    NOTE(review): batch_iter_class yields integer labels of shape (batch, 1)
    while this head emits 2 softmax units; the default 'mean_squared_error'
    loss presumably rejects that pairing - confirm, and consider passing
    loss='sparse_categorical_crossentropy' at the call site.
    """
    model = Sequential([
        CuDNNLSTM(neurons, return_sequences=True, input_shape=(None, num_features)),
        Dropout(dropout),
        CuDNNLSTM(neurons, return_sequences=False),
        Dropout(dropout),
        Dense(neurons, kernel_initializer='normal', activation=activ_func),
        Dropout(dropout),
        Dense(24, kernel_initializer='normal', activation=activ_func),
        Dropout(dropout),
        Dense(2, kernel_initializer='normal', activation='softmax'),
    ])
    model.compile(loss=loss, optimizer=optimizer, metrics=['categorical_crossentropy'])
    return model

In [None]:
# batch_iter_class yields integer 0/1 labels of shape (batch, 1), while the
# classifier ends in a 2-unit softmax. The default MSE loss requires targets
# shaped like the output, so the sparse cross-entropy loss is selected here.
model_c = build_model_class(loss='sparse_categorical_crossentropy')
# Summarise the classifier just built; `model` is not defined until a later cell.
model_c.summary()

In [None]:
gc.collect() # clean up the memory
num_epochs = 15

# Fresh classification generators so training starts at the first batch.
train_batches_c, train_steps_c = batch_iter_class(train_set, batch_size)
test_batches_c, test_steps_c = batch_iter_class(test_set, batch_size)

# Train the classifier on its own generators. The regression objects
# (model, train_batches, test_batches, test_steps) belong to later cells and
# do not exist yet when the notebook is run top to bottom.
model_c.fit_generator(train_batches_c, train_steps_c,
          epochs=num_epochs,
          validation_data=test_batches_c, validation_steps=test_steps_c)

In [29]:
def build_model(neurons=128, activ_func='relu', dropout=.3, loss='mean_squared_error', optimizer='adam'):
    """Assemble and compile the sequence regressor.

    Architecture: two stacked CuDNNLSTM layers (the second returns only its
    last output), followed by Dense(neurons) and Dense(24) hidden layers and a
    single linear output unit, with Dropout after every layer except the
    output.  The input shape is ``(None, num_features)``; ``num_features`` is
    read from the notebook's globals when the function is called.  The model
    is compiled with the given loss/optimizer and reports 'mse' as a metric.
    """
    model = Sequential([
        CuDNNLSTM(neurons, return_sequences=True, input_shape=(None, num_features)),
        Dropout(dropout),
        CuDNNLSTM(neurons, return_sequences=False),
        Dropout(dropout),
        Dense(neurons, kernel_initializer='normal', activation=activ_func),
        Dropout(dropout),
        Dense(24, kernel_initializer='normal', activation=activ_func),
        Dropout(dropout),
        Dense(1, kernel_initializer='normal'),
    ])
    model.compile(loss=loss, optimizer=optimizer, metrics=['mse'])
    return model

In [30]:
# Build the regression network with its default hyper-parameters and print
# the per-layer output shapes and parameter counts.
model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_11 (CuDNNLSTM)    (None, None, 128)         146944    
_________________________________________________________________
dropout_11 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
cu_dnnlstm_12 (CuDNNLSTM)    (None, 128)               132096    
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_13 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 24)                1560      
__________

In [31]:
gc.collect()  # free memory before training starts
num_epochs = 10

# Fresh generators so training starts from the first batch of each split.
train_batches, train_steps = batch_iter(train_set, batch_size)
test_batches, test_steps = batch_iter(test_set, batch_size)

# Fit the regression model, validating on the held-out split every epoch.
model.fit_generator(
    generator=train_batches,
    steps_per_epoch=train_steps,
    epochs=num_epochs,
    validation_data=test_batches,
    validation_steps=test_steps,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc065583630>

In [32]:
# Predict revenue visitor by visitor and keep it next to the true revenue.
# samples was built by iterating the same grouped frame, so position i - 1 in
# samples belongs to the i-th visitor group.
num_visitor_ids = len(visitor_grouped_df)
predicted_revenues = {}
i = 0  # stays 0 when there are no visitor groups
for i, (visitor_id, visitor_group) in enumerate(visitor_grouped_df, start=1):
    X = samples[i - 1][0]
    num_timesteps = X.shape[0]
    model_input = X.reshape(1, num_timesteps, num_features)  # batch of one sequence
    # NOTE(review): a constant 2 is added to the network output before expm1;
    # its origin is not shown in this notebook - confirm it is intended.
    log_prediction = 2 + model.predict(model_input)
    predicted_revenue = np.expm1(log_prediction).sum()
    true_revenue = visitor_group[LABEL].sum()

    predicted_revenues[visitor_id] = (predicted_revenue, true_revenue)

    # progress report every 1000 visitors
    if i % 1000 == 0:
        print(i, 'of', num_visitor_ids)

1000 of 714167
2000 of 714167
3000 of 714167
4000 of 714167
5000 of 714167
6000 of 714167
7000 of 714167
8000 of 714167
9000 of 714167
10000 of 714167
11000 of 714167
12000 of 714167
13000 of 714167
14000 of 714167
15000 of 714167
16000 of 714167
17000 of 714167
18000 of 714167
19000 of 714167
20000 of 714167
21000 of 714167
22000 of 714167
23000 of 714167
24000 of 714167
25000 of 714167
26000 of 714167
27000 of 714167
28000 of 714167
29000 of 714167
30000 of 714167
31000 of 714167
32000 of 714167
33000 of 714167
34000 of 714167
35000 of 714167
36000 of 714167
37000 of 714167
38000 of 714167
39000 of 714167
40000 of 714167
41000 of 714167
42000 of 714167
43000 of 714167
44000 of 714167
45000 of 714167
46000 of 714167
47000 of 714167
48000 of 714167
49000 of 714167
50000 of 714167
51000 of 714167
52000 of 714167
53000 of 714167
54000 of 714167
55000 of 714167
56000 of 714167
57000 of 714167
58000 of 714167
59000 of 714167
60000 of 714167
61000 of 714167
62000 of 714167
63000 of 714167
6

490000 of 714167
491000 of 714167
492000 of 714167
493000 of 714167
494000 of 714167
495000 of 714167
496000 of 714167
497000 of 714167
498000 of 714167
499000 of 714167
500000 of 714167
501000 of 714167
502000 of 714167
503000 of 714167
504000 of 714167
505000 of 714167
506000 of 714167
507000 of 714167
508000 of 714167
509000 of 714167
510000 of 714167
511000 of 714167
512000 of 714167
513000 of 714167
514000 of 714167
515000 of 714167
516000 of 714167
517000 of 714167
518000 of 714167
519000 of 714167
520000 of 714167
521000 of 714167
522000 of 714167
523000 of 714167
524000 of 714167
525000 of 714167
526000 of 714167
527000 of 714167
528000 of 714167
529000 of 714167
530000 of 714167
531000 of 714167
532000 of 714167
533000 of 714167
534000 of 714167
535000 of 714167
536000 of 714167
537000 of 714167
538000 of 714167
539000 of 714167
540000 of 714167
541000 of 714167
542000 of 714167
543000 of 714167
544000 of 714167
545000 of 714167
546000 of 714167
547000 of 714167
548000 of 7141

In [33]:
from sklearn import metrics

# Unzip the (predicted, true) revenue pairs collected per visitor.
revenue_pairs = list(predicted_revenues.values())
pred = [predicted for predicted, _ in revenue_pairs]
true = [actual for _, actual in revenue_pairs]

# Root mean squared error between log1p(predicted) and log1p(true) revenue.
log_rmse = np.sqrt(metrics.mean_squared_error(np.log1p(pred), np.log1p(true)))
print(log_rmse)

2.8002688934921003


In [13]:
out_
for visitor_id, rev in predicted_revenues.items():
    

SyntaxError: unexpected EOF while parsing (<ipython-input-13-17719a05e519>, line 3)