In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMModel
from os import path
from sklearn.metrics import mean_squared_error

from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential

%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.10f' % x)

Using TensorFlow backend.


In [2]:
data_dir = '../datasets'

if not path.exists(data_dir):
    raise Exception('{} directory not found.'.format(data_dir))

train_file = '{}/{}'.format(data_dir, 'train.zip')
print('\nTrain file: {}'.format(train_file))

val_file = '{}/{}'.format(data_dir, 'val.zip')
print('\nValidation file: {}'.format(val_file))

pred_val_file = '{}/{}'.format(data_dir, 'lstm_pred_val.zip')
print('\nValidation Prediction file: {}'.format(pred_val_file))

test_file = '{}/{}'.format(data_dir, 'test.zip')
print('\nTest file: {}'.format(test_file))

pred_test_file = '{}/{}'.format(data_dir, 'lstm_pred_test.zip')
print('\nTest Prediction file: {}'.format(pred_test_file))

imp_features_file = '{}/{}'.format(data_dir, 'lstm_importances-01.png')
print('\nImportant Features file: {}'.format(imp_features_file))


Train file: ../datasets/train.zip

Validation file: ../datasets/val.zip

Validation Prediction file: ../datasets/lstm_pred_val.zip

Test file: ../datasets/test.zip

Test Prediction file: ../datasets/lstm_pred_test.zip

Important Features file: ../datasets/lstm_importances-01.png


In [3]:
def load_data(zip_path):
    df = pd.read_csv(
        zip_path,
        dtype={'fullVisitorId': 'str'},
        compression='zip'
    )
    
    [rows, columns] = df.shape

    print('Loaded {} rows with {} columns from {}.'.format(
        rows, columns, zip_path
    ))
    
    return df

In [4]:
%%time

train_df = load_data(train_file)
val_df = load_data(val_file)
test_df = load_data(test_file)

print()

Loaded 765707 rows with 26 columns from ../datasets/train.zip.
Loaded 137946 rows with 26 columns from ../datasets/val.zip.
Loaded 804684 rows with 25 columns from ../datasets/test.zip.

CPU times: user 8.69 s, sys: 645 ms, total: 9.34 s
Wall time: 10.1 s


In [5]:
train_id = train_df['fullVisitorId'].values
val_id = val_df['fullVisitorId'].values
test_id = test_df['fullVisitorId'].values

train_y = train_df['totals.transactionRevenue'].values
train_log_y = np.log1p(train_y)

val_y = val_df['totals.transactionRevenue'].values
val_log_y = np.log1p(val_y)

train_X = train_df.drop(['totals.transactionRevenue', 'fullVisitorId'], axis=1).values
val_X = val_df.drop(['totals.transactionRevenue', 'fullVisitorId'], axis=1).values
test_X = test_df.drop(['fullVisitorId'], axis=1).values

In [6]:
header = pd.MultiIndex.from_product(
    [['Raw','Transformed'], ['Rows', 'Columns']],
    names=['Type','Dataset']
)

shape_df = pd.DataFrame(
    [train_df.shape + train_X.shape, val_df.shape + val_X.shape, test_df.shape + test_X.shape], 
    index=['Train', 'Validation', 'Test'], 
    columns=header
)

shape_df.style.set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'center')]}
])

Type,Raw,Raw,Transformed,Transformed
Dataset,Rows,Columns,Rows,Columns
Train,765707,26,765707,24
Validation,137946,26,137946,24
Test,804684,25,804684,24


In [7]:
shaped_train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
shaped_val_X = val_X.reshape((val_X.shape[0], 1, val_X.shape[1]))
shaped_test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

header = pd.MultiIndex.from_product(
    [['Rows', 'Batch', 'Columns']],
    names=['Dataset']
)

shape_df = pd.DataFrame(
    [shaped_train_X.shape, shaped_val_X.shape, shaped_test_X.shape], 
    index=['Train', 'Validation', 'Test'], 
    columns=header
)

shape_df.head()

Dataset,Rows,Batch,Columns
Train,765707,1,24
Validation,137946,1,24
Test,804684,1,24


In [8]:
def lstm_model(train_X, train_y, val_X, val_y):
    model = Sequential()

    model.add(
        Bidirectional(
            LSTM(
                256,
                recurrent_dropout=0.2, 
                kernel_initializer='lecun_normal', 
                return_sequences=True,
                input_shape = (shaped_train_X.shape[1], shaped_train_X.shape[2])
            )
        )
    )
    model.add(
        Bidirectional(
            LSTM(
                128,
                recurrent_dropout=0.2, 
                kernel_initializer='lecun_normal'
            )
        )
    )
    model.add(Dense(50, activation='sigmoid'))
    model.add(Dropout(0.1))
    model.add(Dense(20,activation='relu'))
    model.add(Dense(1,activation='linear'))

    model.compile(optimizer='adam', loss='mse')

    model.fit(
        shaped_train_X, 
        train_log_y, 
        epochs=5, 
        batch_size=64, 
        validation_data=(shaped_val_X, val_log_y), 
        validation_freq=2,
        verbose=1,
        shuffle=False
    )
    
    return model

In [9]:
model = lstm_model(shaped_train_X, train_log_y, shaped_val_X, val_log_y)

Train on 765707 samples, validate on 137946 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
pred_val = model.predict(shaped_val_X)

In [11]:
pred_val = pred_val.reshape(-1)

pred_val[pred_val < 0] = 0

pred_val_data = {
    'fullVisitorId': val_id,
    'transactionRevenue': val_y,
    'predictedRevenue': np.expm1(pred_val)
}

pred_val_df = pd.DataFrame(pred_val_data)

pred_val_df = pred_val_df.groupby('fullVisitorId')['transactionRevenue', 'predictedRevenue'].sum().reset_index()

pred_val_df.head()

Unnamed: 0,fullVisitorId,transactionRevenue,predictedRevenue
0,62267706107999,0.0,0.0081076892
1,85059828173212,0.0,0.0081044147
2,26722803385797,0.0,0.0089559462
3,436683523507380,0.0,1.3142076731
4,450371054833295,0.0,0.00810102


In [12]:
rsme_val = np.sqrt(
    mean_squared_error(
        np.log1p(pred_val_df['transactionRevenue'].values),
        np.log1p(pred_val_df['predictedRevenue'].values)
    )
)

print('\nRSME for validation data set: {:.4f}'.format(rsme_val))


RSME for validation data set: 1.7806


In [13]:
pred_test = model.predict(shaped_test_X)

In [14]:
pred_test = pred_test.reshape(-1)

pred_test[pred_test < 0] = 0

pred_test_data = {
    'fullVisitorId': test_id,
    'predictedRevenue': np.expm1(pred_test)
}

pred_test_df = pd.DataFrame(pred_test_data)

pred_test_df = pred_test_df.groupby('fullVisitorId')['predictedRevenue'].sum().reset_index()

pred_test_df.head()

Unnamed: 0,fullVisitorId,predictedRevenue
0,259678714014,45.4991645813
1,49363351866189,0.0243037809
2,53049821714864,0.0081150206
3,59488412965267,0.0243103914
4,85840370633780,0.0118847825


In [15]:
pred_val_df.to_csv(pred_val_file, index=False, compression='zip')
pred_test_df.to_csv(pred_test_file, index=False, compression='zip')

In [29]:
# Do feature importances

from eli5.sklearn import PermutationImportance

from sklearn.inspection import permutation_importance
from sklearn.metrics import SCORERS

perm = PermutationImportance(model, random_state=1, scoring='neg_mean_squared_error').fit(shaped_train_X, train_log_y)
# eli5.show_weights(perm, feature_names = X.columns.tolist())
# result = permutation_importance(model, shaped_train_X, train_log_y, random_state=1, scoring='neg_mean_squared_error')
# base_score, score_decreases = get_score_importances(model, train_X, train_log_y, random_state=1)


print(result)
print()
# print(score_decreases)

ValueError: Found array with dim 3. Estimator expected <= 2.