In [1]:
from tqdm import tqdm
import pathlib
import json
import numpy as np
import pandas as pd

In [2]:
# Folder holding the competition's input files; the cells below read
# 'train.jsonl' and 'test.jsonl' from it.
dataset_directory = pathlib.Path('../input/otto-recommender-system')

In [3]:
# Integer codes used for the three event types, both as the stored 'type'
# column value and as the keys of the per-event 'labels' dictionaries.
type_dict = {
    'clicks': 0,
    'carts': 1,
    'orders': 2
}

def create_ground_truth(events):
    
    """
    Create ground-truth labels from given list of event dictionaries
    
    The events are walked from last to first, so that each event is labelled
    only with what happens after it in the list:
    
    - clicks: aid of the nearest later click event
    - carts: set of aids of all later cart events
    - orders: set of aids of all later order events
    
    A label is omitted when there is no later event of that type. The labels
    are written in place under the 'labels' key of every event, keyed by the
    integer codes of type_dict.
    
    Parameters
    ----------
    events: list of shape (n_events)
        List of event dictionaries, each with at least 'aid' and 'type' keys
        ('type' being one of 'clicks', 'carts' or 'orders'; any other value
        contributes nothing to the labels)
 
    Returns
    -------
    events: list of shape (n_events)
        The same list object, with a 'labels' dictionary added to every event
    """
    
    next_click = None
    later_carts = set()
    later_orders = set()

    for event in reversed(events):

        labels = {}

        # Compare against None explicitly: a plain truthiness test would
        # discard a next click whose aid is 0.
        if next_click is not None:
            labels[type_dict['clicks']] = next_click
        # Snapshot the sets, because they keep growing while earlier events
        # are processed.
        if later_carts:
            labels[type_dict['carts']] = later_carts.copy()
        if later_orders:
            labels[type_dict['orders']] = later_orders.copy()

        event['labels'] = labels

        # The current event only becomes a label for the events before it,
        # hence the update happens after its own labels were assigned.
        event_type = event['type']
        if event_type == 'clicks':
            next_click = event['aid']
        elif event_type == 'carts':
            later_carts.add(event['aid'])
        elif event_type == 'orders':
            later_orders.add(event['aid'])

    return events


def create_dataframe(json_file_path, chunk_size):
    
    """
    Create pandas.DataFrame from given json_file_path
    
    The file is read as line-delimited JSON in chunks. Every line is expected
    to carry a 'session' value and an 'events' list of dictionaries with
    'aid', 'ts' and 'type' keys; each event becomes one row of the result.
    
    Parameters
    ----------
    json_file_path: path-like str
        Path of the json file
        
    chunk_size: int
        Size of chunks (number of lines) while reading the json file
        
    Returns
    -------
    df: pandas.DataFrame of shape (n_samples, 4)
        Columns are 'session' (uint32), 'aid' (uint32), 'ts' (datetime parsed
        from millisecond timestamps) and 'type' (uint8 code from type_dict),
        with a fresh 0..n-1 index
    """
    
    # Chunk frames are collected and concatenated once at the end; growing a
    # DataFrame with pd.concat inside the loop would re-copy every previously
    # read row on each iteration.
    chunk_frames = []
    
    # The context manager guarantees the underlying file handle is released
    # even if parsing a chunk raises.
    with pd.read_json(json_file_path, lines=True, chunksize=chunk_size) as chunks:
        
        for chunk_idx, chunk in enumerate(chunks):
            
            print(f'Reading Chunk {chunk_idx} ({chunk_size * chunk_idx}-{(chunk_size * (chunk_idx + 1))})')
            
            event_dict = {
                'session': [],
                'aid': [],
                'ts': [],
                'type': []
            }
            
            for session, events in tqdm(zip(chunk['session'].tolist(), chunk['events'].tolist()), total=len(chunk)):
                
                # Flatten: one output row per event, repeating the session id
                for event in events:
                    
                    event_dict['session'].append(session)
                    event_dict['aid'].append(event['aid'])
                    event_dict['ts'].append(event['ts'])
                    event_dict['type'].append(type_dict[event['type']])
    
            chunk_session = pd.DataFrame(event_dict)
            # Downcast per chunk so the collected frames stay compact
            chunk_session['session'] = chunk_session['session'].astype(np.uint32)
            chunk_session['aid'] = chunk_session['aid'].astype(np.uint32)
            chunk_session['ts'] = pd.to_datetime(chunk_session['ts'], unit='ms')
            chunk_session['type'] = chunk_session['type'].astype(np.uint8)
            chunk_frames.append(chunk_session)
    
    if not chunk_frames:
        # pd.concat rejects an empty list; if the reader yielded no chunks,
        # return an empty frame that still has the documented columns.
        return pd.DataFrame({
            'session': pd.Series(dtype=np.uint32),
            'aid': pd.Series(dtype=np.uint32),
            'ts': pd.Series(dtype='datetime64[ns]'),
            'type': pd.Series(dtype=np.uint8)
        })
    
    # ignore_index=True yields the same 0..n-1 index as a later reset_index(drop=True)
    df = pd.concat(chunk_frames, ignore_index=True)
        
    return df


In [4]:
# Flatten the training sessions into one row per event, reading the file
# 100000 lines at a time; the bare name on the last line displays the frame.
df_train = create_dataframe(json_file_path=str(dataset_directory / 'train.jsonl'), chunk_size=100000)
df_train

Reading Chunk 0 (0-100000)


100%|██████████| 100000/100000 [00:03<00:00, 26652.03it/s]


Reading Chunk 1 (100000-200000)


100%|██████████| 100000/100000 [00:03<00:00, 27253.85it/s]


Reading Chunk 2 (200000-300000)


100%|██████████| 100000/100000 [00:03<00:00, 29369.75it/s]


Reading Chunk 3 (300000-400000)


100%|██████████| 100000/100000 [00:03<00:00, 31763.85it/s]


Reading Chunk 4 (400000-500000)


100%|██████████| 100000/100000 [00:03<00:00, 32285.64it/s]


Reading Chunk 5 (500000-600000)


100%|██████████| 100000/100000 [00:02<00:00, 33810.14it/s]


Reading Chunk 6 (600000-700000)


100%|██████████| 100000/100000 [00:02<00:00, 34725.73it/s]


Reading Chunk 7 (700000-800000)


100%|██████████| 100000/100000 [00:02<00:00, 34434.94it/s]


Reading Chunk 8 (800000-900000)


100%|██████████| 100000/100000 [00:02<00:00, 34986.57it/s]


Reading Chunk 9 (900000-1000000)


100%|██████████| 100000/100000 [00:02<00:00, 35654.90it/s]


Reading Chunk 10 (1000000-1100000)


100%|██████████| 100000/100000 [00:02<00:00, 37203.93it/s]


Reading Chunk 11 (1100000-1200000)


100%|██████████| 100000/100000 [00:02<00:00, 37068.15it/s]


Reading Chunk 12 (1200000-1300000)


100%|██████████| 100000/100000 [00:03<00:00, 33147.21it/s]


Reading Chunk 13 (1300000-1400000)


100%|██████████| 100000/100000 [00:02<00:00, 35158.42it/s]


Reading Chunk 14 (1400000-1500000)


100%|██████████| 100000/100000 [00:02<00:00, 39571.91it/s]


Reading Chunk 15 (1500000-1600000)


100%|██████████| 100000/100000 [00:02<00:00, 38317.34it/s]


Reading Chunk 16 (1600000-1700000)


100%|██████████| 100000/100000 [00:02<00:00, 40890.06it/s]


Reading Chunk 17 (1700000-1800000)


100%|██████████| 100000/100000 [00:02<00:00, 41862.00it/s]


Reading Chunk 18 (1800000-1900000)


100%|██████████| 100000/100000 [00:02<00:00, 42129.98it/s]


Reading Chunk 19 (1900000-2000000)


100%|██████████| 100000/100000 [00:02<00:00, 42452.07it/s]


Reading Chunk 20 (2000000-2100000)


100%|██████████| 100000/100000 [00:02<00:00, 45769.05it/s]


Reading Chunk 21 (2100000-2200000)


100%|██████████| 100000/100000 [00:02<00:00, 42710.04it/s]


Reading Chunk 22 (2200000-2300000)


100%|██████████| 100000/100000 [00:02<00:00, 41559.49it/s]


Reading Chunk 23 (2300000-2400000)


100%|██████████| 100000/100000 [00:02<00:00, 44758.28it/s]


Reading Chunk 24 (2400000-2500000)


100%|██████████| 100000/100000 [00:02<00:00, 47352.09it/s]


Reading Chunk 25 (2500000-2600000)


100%|██████████| 100000/100000 [00:02<00:00, 45815.50it/s]


Reading Chunk 26 (2600000-2700000)


100%|██████████| 100000/100000 [00:02<00:00, 47273.92it/s]


Reading Chunk 27 (2700000-2800000)


100%|██████████| 100000/100000 [00:02<00:00, 47400.04it/s]


Reading Chunk 28 (2800000-2900000)


100%|██████████| 100000/100000 [00:02<00:00, 49910.79it/s]


Reading Chunk 29 (2900000-3000000)


100%|██████████| 100000/100000 [00:02<00:00, 48143.28it/s]


Reading Chunk 30 (3000000-3100000)


100%|██████████| 100000/100000 [00:02<00:00, 41741.87it/s]


Reading Chunk 31 (3100000-3200000)


100%|██████████| 100000/100000 [00:02<00:00, 49690.26it/s]


Reading Chunk 32 (3200000-3300000)


100%|██████████| 100000/100000 [00:01<00:00, 51488.77it/s]


Reading Chunk 33 (3300000-3400000)


100%|██████████| 100000/100000 [00:01<00:00, 51895.92it/s]


Reading Chunk 34 (3400000-3500000)


100%|██████████| 100000/100000 [00:01<00:00, 53079.99it/s]


Reading Chunk 35 (3500000-3600000)


100%|██████████| 100000/100000 [00:01<00:00, 52231.07it/s]


Reading Chunk 36 (3600000-3700000)


100%|██████████| 100000/100000 [00:01<00:00, 54821.26it/s]


Reading Chunk 37 (3700000-3800000)


100%|██████████| 100000/100000 [00:01<00:00, 50205.16it/s]


Reading Chunk 38 (3800000-3900000)


100%|██████████| 100000/100000 [00:01<00:00, 52952.37it/s]


Reading Chunk 39 (3900000-4000000)


100%|██████████| 100000/100000 [00:01<00:00, 53036.96it/s]


Reading Chunk 40 (4000000-4100000)


100%|██████████| 100000/100000 [00:01<00:00, 53364.19it/s]


Reading Chunk 41 (4100000-4200000)


100%|██████████| 100000/100000 [00:01<00:00, 53655.40it/s]


Reading Chunk 42 (4200000-4300000)


100%|██████████| 100000/100000 [00:01<00:00, 53017.90it/s]


Reading Chunk 43 (4300000-4400000)


100%|██████████| 100000/100000 [00:01<00:00, 51617.57it/s]


Reading Chunk 44 (4400000-4500000)


100%|██████████| 100000/100000 [00:01<00:00, 55136.61it/s]


Reading Chunk 45 (4500000-4600000)


100%|██████████| 100000/100000 [00:02<00:00, 46243.05it/s]


Reading Chunk 46 (4600000-4700000)


100%|██████████| 100000/100000 [00:01<00:00, 54975.90it/s]


Reading Chunk 47 (4700000-4800000)


100%|██████████| 100000/100000 [00:01<00:00, 56643.77it/s]


Reading Chunk 48 (4800000-4900000)


100%|██████████| 100000/100000 [00:01<00:00, 52290.39it/s]


Reading Chunk 49 (4900000-5000000)


100%|██████████| 100000/100000 [00:01<00:00, 54597.99it/s]


Reading Chunk 50 (5000000-5100000)


100%|██████████| 100000/100000 [00:01<00:00, 50451.22it/s]


Reading Chunk 51 (5100000-5200000)


100%|██████████| 100000/100000 [00:01<00:00, 58017.89it/s]


Reading Chunk 52 (5200000-5300000)


100%|██████████| 100000/100000 [00:01<00:00, 59624.40it/s]


Reading Chunk 53 (5300000-5400000)


100%|██████████| 100000/100000 [00:01<00:00, 56966.20it/s]


Reading Chunk 54 (5400000-5500000)


100%|██████████| 100000/100000 [00:01<00:00, 57452.16it/s]


Reading Chunk 55 (5500000-5600000)


100%|██████████| 100000/100000 [00:01<00:00, 59180.35it/s]


Reading Chunk 56 (5600000-5700000)


100%|██████████| 100000/100000 [00:01<00:00, 59923.58it/s]


Reading Chunk 57 (5700000-5800000)


100%|██████████| 100000/100000 [00:01<00:00, 58634.30it/s]


Reading Chunk 58 (5800000-5900000)


100%|██████████| 100000/100000 [00:01<00:00, 53053.01it/s]


Reading Chunk 59 (5900000-6000000)


100%|██████████| 100000/100000 [00:01<00:00, 64975.20it/s]


Reading Chunk 60 (6000000-6100000)


100%|██████████| 100000/100000 [00:01<00:00, 65968.96it/s]


Reading Chunk 61 (6100000-6200000)


100%|██████████| 100000/100000 [00:01<00:00, 67231.71it/s]


Reading Chunk 62 (6200000-6300000)


100%|██████████| 100000/100000 [00:01<00:00, 64564.75it/s]


Reading Chunk 63 (6300000-6400000)


100%|██████████| 100000/100000 [00:01<00:00, 66123.29it/s]


Reading Chunk 64 (6400000-6500000)


100%|██████████| 100000/100000 [00:01<00:00, 66915.31it/s]


Reading Chunk 65 (6500000-6600000)


100%|██████████| 100000/100000 [00:01<00:00, 67982.31it/s]


Reading Chunk 66 (6600000-6700000)


100%|██████████| 100000/100000 [00:01<00:00, 60331.18it/s]


Reading Chunk 67 (6700000-6800000)


100%|██████████| 100000/100000 [00:01<00:00, 68487.81it/s]


Reading Chunk 68 (6800000-6900000)


100%|██████████| 100000/100000 [00:01<00:00, 67624.61it/s]


Reading Chunk 69 (6900000-7000000)


100%|██████████| 100000/100000 [00:01<00:00, 71694.59it/s]


Reading Chunk 70 (7000000-7100000)


100%|██████████| 100000/100000 [00:01<00:00, 72669.49it/s]


Reading Chunk 71 (7100000-7200000)


100%|██████████| 100000/100000 [00:01<00:00, 71577.43it/s]


Reading Chunk 72 (7200000-7300000)


100%|██████████| 100000/100000 [00:01<00:00, 76116.28it/s]


Reading Chunk 73 (7300000-7400000)


100%|██████████| 100000/100000 [00:01<00:00, 64150.17it/s]


Reading Chunk 74 (7400000-7500000)


100%|██████████| 100000/100000 [00:01<00:00, 76466.98it/s]


Reading Chunk 75 (7500000-7600000)


100%|██████████| 100000/100000 [00:01<00:00, 73443.76it/s]


Reading Chunk 76 (7600000-7700000)


100%|██████████| 100000/100000 [00:01<00:00, 66100.04it/s]


Reading Chunk 77 (7700000-7800000)


100%|██████████| 100000/100000 [00:01<00:00, 76970.13it/s]


Reading Chunk 78 (7800000-7900000)


100%|██████████| 100000/100000 [00:01<00:00, 73524.08it/s]


Reading Chunk 79 (7900000-8000000)


100%|██████████| 100000/100000 [00:01<00:00, 81446.61it/s]


Reading Chunk 80 (8000000-8100000)


100%|██████████| 100000/100000 [00:01<00:00, 78262.18it/s]


Reading Chunk 81 (8100000-8200000)


100%|██████████| 100000/100000 [00:01<00:00, 78634.85it/s]


Reading Chunk 82 (8200000-8300000)


100%|██████████| 100000/100000 [00:01<00:00, 77824.11it/s]


Reading Chunk 83 (8300000-8400000)


100%|██████████| 100000/100000 [00:01<00:00, 67467.52it/s]


Reading Chunk 84 (8400000-8500000)


100%|██████████| 100000/100000 [00:01<00:00, 84333.67it/s]


Reading Chunk 85 (8500000-8600000)


100%|██████████| 100000/100000 [00:01<00:00, 85310.04it/s]


Reading Chunk 86 (8600000-8700000)


100%|██████████| 100000/100000 [00:01<00:00, 82986.37it/s]


Reading Chunk 87 (8700000-8800000)


100%|██████████| 100000/100000 [00:01<00:00, 79846.84it/s]


Reading Chunk 88 (8800000-8900000)


100%|██████████| 100000/100000 [00:01<00:00, 84160.63it/s]


Reading Chunk 89 (8900000-9000000)


100%|██████████| 100000/100000 [00:01<00:00, 80658.09it/s]


Reading Chunk 90 (9000000-9100000)


100%|██████████| 100000/100000 [00:01<00:00, 90275.57it/s]


Reading Chunk 91 (9100000-9200000)


100%|██████████| 100000/100000 [00:01<00:00, 83351.58it/s]


Reading Chunk 92 (9200000-9300000)


100%|██████████| 100000/100000 [00:01<00:00, 71373.36it/s]


Reading Chunk 93 (9300000-9400000)


100%|██████████| 100000/100000 [00:01<00:00, 85861.27it/s]


Reading Chunk 94 (9400000-9500000)


100%|██████████| 100000/100000 [00:01<00:00, 84118.48it/s]


Reading Chunk 95 (9500000-9600000)


100%|██████████| 100000/100000 [00:01<00:00, 79913.99it/s]


Reading Chunk 96 (9600000-9700000)


100%|██████████| 100000/100000 [00:01<00:00, 86111.00it/s]


Reading Chunk 97 (9700000-9800000)


100%|██████████| 100000/100000 [00:01<00:00, 83540.62it/s]


Reading Chunk 98 (9800000-9900000)


100%|██████████| 100000/100000 [00:01<00:00, 83671.85it/s]


Reading Chunk 99 (9900000-10000000)


100%|██████████| 100000/100000 [00:01<00:00, 73638.75it/s]


Reading Chunk 100 (10000000-10100000)


100%|██████████| 100000/100000 [00:01<00:00, 91076.12it/s]


Reading Chunk 101 (10100000-10200000)


100%|██████████| 100000/100000 [00:01<00:00, 89157.01it/s]


Reading Chunk 102 (10200000-10300000)


100%|██████████| 100000/100000 [00:01<00:00, 89008.94it/s]


Reading Chunk 103 (10300000-10400000)


100%|██████████| 100000/100000 [00:01<00:00, 91740.44it/s]


Reading Chunk 104 (10400000-10500000)


100%|██████████| 100000/100000 [00:01<00:00, 93018.30it/s]


Reading Chunk 105 (10500000-10600000)


100%|██████████| 100000/100000 [00:01<00:00, 91259.20it/s]


Reading Chunk 106 (10600000-10700000)


100%|██████████| 100000/100000 [00:01<00:00, 86751.18it/s]


Reading Chunk 107 (10700000-10800000)


100%|██████████| 100000/100000 [00:01<00:00, 90488.72it/s]


Reading Chunk 108 (10800000-10900000)


100%|██████████| 100000/100000 [00:01<00:00, 90194.54it/s]


Reading Chunk 109 (10900000-11000000)


100%|██████████| 100000/100000 [00:01<00:00, 90161.21it/s]


Reading Chunk 110 (11000000-11100000)


100%|██████████| 100000/100000 [00:01<00:00, 99052.62it/s]


Reading Chunk 111 (11100000-11200000)


100%|██████████| 100000/100000 [00:01<00:00, 93462.43it/s]


Reading Chunk 112 (11200000-11300000)


100%|██████████| 100000/100000 [00:01<00:00, 80211.14it/s]


Reading Chunk 113 (11300000-11400000)


100%|██████████| 100000/100000 [00:01<00:00, 96897.18it/s]


Reading Chunk 114 (11400000-11500000)


100%|██████████| 100000/100000 [00:01<00:00, 80098.75it/s]


Reading Chunk 115 (11500000-11600000)


100%|██████████| 100000/100000 [00:01<00:00, 92272.34it/s]


Reading Chunk 116 (11600000-11700000)


100%|██████████| 100000/100000 [00:01<00:00, 96273.41it/s]


Reading Chunk 117 (11700000-11800000)


100%|██████████| 100000/100000 [00:00<00:00, 100120.24it/s]


Reading Chunk 118 (11800000-11900000)


100%|██████████| 100000/100000 [00:00<00:00, 104106.50it/s]


Reading Chunk 119 (11900000-12000000)


100%|██████████| 100000/100000 [00:01<00:00, 95260.08it/s]


Reading Chunk 120 (12000000-12100000)


100%|██████████| 100000/100000 [00:00<00:00, 105039.66it/s]


Reading Chunk 121 (12100000-12200000)


100%|██████████| 100000/100000 [00:00<00:00, 102653.53it/s]


Reading Chunk 122 (12200000-12300000)


100%|██████████| 100000/100000 [00:00<00:00, 104955.55it/s]


Reading Chunk 123 (12300000-12400000)


100%|██████████| 100000/100000 [00:00<00:00, 103201.65it/s]


Reading Chunk 124 (12400000-12500000)


100%|██████████| 100000/100000 [00:00<00:00, 101923.16it/s]


Reading Chunk 125 (12500000-12600000)


100%|██████████| 100000/100000 [00:00<00:00, 105320.80it/s]


Reading Chunk 126 (12600000-12700000)


100%|██████████| 100000/100000 [00:01<00:00, 99675.59it/s]


Reading Chunk 127 (12700000-12800000)


100%|██████████| 100000/100000 [00:00<00:00, 106897.42it/s]


Reading Chunk 128 (12800000-12900000)


100%|██████████| 99779/99779 [00:00<00:00, 116810.58it/s]


Unnamed: 0,session,aid,ts,type
0,0,1517085,2022-07-31 22:00:00.025,0
1,0,1563459,2022-07-31 22:01:44.511,0
2,0,1309446,2022-08-01 15:23:59.426,0
3,0,16246,2022-08-01 15:28:39.997,0
4,0,1781822,2022-08-01 15:31:11.344,0
...,...,...,...,...
216716091,12899776,1737908,2022-08-28 21:59:47.073,0
216716092,12899777,384045,2022-08-28 21:59:36.974,0
216716093,12899777,384045,2022-08-28 21:59:46.800,0
216716094,12899778,561560,2022-08-28 21:59:43.611,0


In [5]:
# Same flattening for the test sessions, again reading 100000 lines per chunk;
# the bare name on the last line displays the frame.
df_test = create_dataframe(json_file_path=str(dataset_directory / 'test.jsonl'), chunk_size=100000)
df_test

Reading Chunk 0 (0-100000)


100%|██████████| 100000/100000 [00:00<00:00, 256490.16it/s]


Reading Chunk 1 (100000-200000)


100%|██████████| 100000/100000 [00:00<00:00, 242153.61it/s]


Reading Chunk 2 (200000-300000)


100%|██████████| 100000/100000 [00:00<00:00, 234618.03it/s]


Reading Chunk 3 (300000-400000)


100%|██████████| 100000/100000 [00:00<00:00, 219280.20it/s]


Reading Chunk 4 (400000-500000)


100%|██████████| 100000/100000 [00:00<00:00, 230473.87it/s]


Reading Chunk 5 (500000-600000)


100%|██████████| 100000/100000 [00:00<00:00, 216094.64it/s]


Reading Chunk 6 (600000-700000)


100%|██████████| 100000/100000 [00:00<00:00, 212529.84it/s]


Reading Chunk 7 (700000-800000)


100%|██████████| 100000/100000 [00:00<00:00, 206513.62it/s]


Reading Chunk 8 (800000-900000)


100%|██████████| 100000/100000 [00:00<00:00, 198533.40it/s]


Reading Chunk 9 (900000-1000000)


100%|██████████| 100000/100000 [00:00<00:00, 208276.12it/s]


Reading Chunk 10 (1000000-1100000)


100%|██████████| 100000/100000 [00:00<00:00, 202678.90it/s]


Reading Chunk 11 (1100000-1200000)


100%|██████████| 100000/100000 [00:00<00:00, 206633.57it/s]


Reading Chunk 12 (1200000-1300000)


100%|██████████| 100000/100000 [00:00<00:00, 193415.14it/s]


Reading Chunk 13 (1300000-1400000)


100%|██████████| 100000/100000 [00:00<00:00, 203100.91it/s]


Reading Chunk 14 (1400000-1500000)


100%|██████████| 100000/100000 [00:00<00:00, 201796.79it/s]


Reading Chunk 15 (1500000-1600000)


100%|██████████| 100000/100000 [00:00<00:00, 209623.59it/s]


Reading Chunk 16 (1600000-1700000)


100%|██████████| 71803/71803 [00:00<00:00, 223093.91it/s]


Unnamed: 0,session,aid,ts,type
0,12899779,59625,2022-08-28 22:00:00.278,0
1,12899780,1142000,2022-08-28 22:00:00.378,0
2,12899780,582732,2022-08-28 22:00:58.352,0
3,12899780,973453,2022-08-28 22:01:49.199,0
4,12899780,736515,2022-08-28 22:02:16.868,0
...,...,...,...,...
6928118,14571577,1141710,2022-09-04 21:59:34.770,0
6928119,14571578,519105,2022-09-04 21:59:35.009,0
6928120,14571579,739876,2022-09-04 21:59:35.605,0
6928121,14571580,202353,2022-09-04 21:59:41.067,0


In [6]:
# Report the shape and in-memory size of both frames; memory_usage() returns
# bytes per column (plus the index), converted here to MB.
print(f'Training Shape: {df_train.shape} - Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Test Shape: {df_test.shape} - Memory Usage: {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')

Training Shape: (216716096, 4) - Memory Usage: 3513.50 MB
Test Shape: (6928123, 4) - Memory Usage: 112.32 MB


In [7]:
%%time

# Write both frames as pickle files into the current working directory.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
df_train.to_pickle('train.pkl')
df_test.to_pickle('test.pkl')

CPU times: user 1.49 s, sys: 9.15 s, total: 10.6 s
Wall time: 10.7 s
