In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMModel
from os import path
from sklearn.metrics import mean_squared_error

%matplotlib inline
# Render every float in DataFrame output with exactly ten decimal places.
pd.set_option('display.float_format', '{:.10f}'.format)

In [6]:
# Directory that holds the zipped prediction CSVs, relative to the notebook's working directory.
data_dir = '../datasets'

# Fail fast with a clear message instead of a later, less obvious read error.
# FileNotFoundError is still an Exception subclass, so broad handlers keep working.
if not path.exists(data_dir):
    raise FileNotFoundError('{} directory not found.'.format(data_dir))

# Build and echo the path of each prediction archive (validation and test, per model).
lgbm_val_file = '{}/{}'.format(data_dir, 'lgbm_pred_val.zip')
print('\nLGBM Val Prediction file: {}'.format(lgbm_val_file))

lgbm_test_file = '{}/{}'.format(data_dir, 'lgbm_pred_test.zip')
# Label corrected: this is the LGBM *test* file (it was previously printed as "Val").
print('\nLGBM Test Prediction file: {}'.format(lgbm_test_file))

lstm_val_file = '{}/{}'.format(data_dir, 'lstm_pred_val.zip')
print('\nLSTM Val Prediction file: {}'.format(lstm_val_file))

lstm_test_file = '{}/{}'.format(data_dir, 'lstm_pred_test.zip')
print('\nLSTM Test Prediction file: {}'.format(lstm_test_file))


LGBM Val Prediction file: ../datasets/lgbm_pred_val.zip

LGBM Val Prediction file: ../datasets/lgbm_pred_test.zip

LSTM Val Prediction file: ../datasets/lstm_pred_val.zip

LSTM Test Prediction file: ../datasets/lstm_pred_test.zip


In [4]:
def load_data(zip_path):
    """Read a zip-compressed CSV into a DataFrame and print its dimensions.

    Args:
        zip_path: Location of the zip archive containing the CSV.

    Returns:
        pandas.DataFrame: The parsed data, with ``fullVisitorId`` read as
        ``str``; all other columns use the dtypes pandas infers.
    """
    frame = pd.read_csv(zip_path, compression='zip', dtype={'fullVisitorId': 'str'})

    # DataFrame.shape is (row count, column count); report both with the source path.
    n_rows, n_cols = frame.shape
    summary = 'Loaded {} rows with {} columns from {}.'.format(n_rows, n_cols, zip_path)
    print(summary)

    return frame

In [7]:
%%time

lstm_val_df = load_data(lstm_val_file)
lstm_test_df = load_data(lstm_test_file)

lgbm_val_df = load_data(lgbm_val_file)
lgbm_test_df = load_data(lgbm_test_file)

print()

Loaded 109687 rows with 3 columns from ../datasets/lstm_pred_val.zip.
Loaded 616642 rows with 2 columns from ../datasets/lstm_pred_test.zip.
Loaded 110252 rows with 3 columns from ../datasets/lgbm_pred_val.zip.
Loaded 617242 rows with 2 columns from ../datasets/lgbm_pred_test.zip.

CPU times: user 1.07 s, sys: 117 ms, total: 1.19 s
Wall time: 1.25 s


In [8]:
# Preview the first five rows of the LSTM validation predictions.
lstm_val_df.head(n=5)

Unnamed: 0,fullVisitorId,transactionRevenue,predictedRevenue
0,62267706107999,0.0,0.0
1,85059828173212,0.0,0.0
2,26722803385797,0.0,0.0
3,436683523507380,0.0,1.7182818285
4,450371054833295,0.0,0.0


In [9]:
# Preview the first five rows of the LSTM test predictions.
lstm_test_df.head(n=5)

Unnamed: 0,fullVisitorId,predictedRevenue
0,259678714014,1.7182818285
1,49363351866189,0.0
2,53049821714864,0.0
3,59488412965267,0.0
4,85840370633780,0.0


In [10]:
# Preview the first five rows of the LGBM validation predictions.
lgbm_val_df.head(n=5)

Unnamed: 0,fullVisitorId,transactionRevenue,predictedRevenue
0,62267706107999,0.0,0.0
1,85059828173212,0.0,0.0
2,26722803385797,0.0,0.0
3,436683523507380,0.0,1.0636517332
4,450371054833295,0.0,0.0


In [11]:
# Preview the first five rows of the LGBM test predictions.
lgbm_test_df.head(n=5)

Unnamed: 0,fullVisitorId,predictedRevenue
0,259678714014,2.4062848919
1,49363351866189,0.0
2,53049821714864,0.0
3,59488412965267,0.0
4,85840370633780,0.0417426113


In [15]:
# Join the two models' test predictions per visitor (default inner join).
# Both frames carry a `predictedRevenue` column, so pandas applies its default
# suffixes: `_x` for the LGBM (left) frame and `_y` for the LSTM (right) frame.
# NOTE(review): the load log reports different row counts for the two test files,
# so an inner join may drop visitors that appear in only one of them - confirm
# this is intended before using the result downstream.
merged_test = lgbm_test_df.merge(lstm_test_df, on='fullVisitorId')

In [16]:
# Display the merged test predictions: `predictedRevenue_x` comes from the LGBM
# (left) frame and `predictedRevenue_y` from the LSTM (right) frame of the merge.
merged_test

Unnamed: 0,fullVisitorId,predictedRevenue_x,predictedRevenue_y
0,0000000259678714014,2.4062848919,1.7182818285
1,0000049363351866189,0.0000000000,0.0000000000
2,0000053049821714864,0.0000000000,0.0000000000
3,0000059488412965267,0.0000000000,0.0000000000
4,0000085840370633780,0.0417426113,0.0000000000
5,0000091131414287111,0.0000000000,0.0000000000
6,0000117255350596610,534.8072686676,1.7182818285
7,0000118334805178127,0.0000000000,0.0000000000
8,0000130646294093000,0.0000000000,0.0000000000
9,0000150005271820273,0.0000000000,0.0000000000
