In [None]:
import numpy as np
import pandas as pd

In [None]:
# Event types we can expect in the data (established by analyzing it beforehand).
# A type's position in this list is its integer id.
id2type = ['clicks', 'carts', 'orders']

# Reverse lookup: event-type name -> integer id.
type2id = dict(zip(id2type, range(len(id2type))))

id2type, type2id

In [None]:
# Save both lookup tables to disk as pickle files in the working directory.
# name -> id mapping
pd.to_pickle(type2id, 'type2id.pkl')
# id -> name list
pd.to_pickle(id2type, 'id2type.pkl')

In [None]:
def jsonl_to_df(fn, chunksize=100_000, type_map=None):
    """Flatten a JSON-lines session file into a one-row-per-event DataFrame.

    Every line of the input is read as a JSON object with a ``session`` value
    and an ``events`` list whose items carry ``aid``, ``ts`` and ``type`` keys.

    Parameters
    ----------
    fn : str, path or file-like
        Source handed straight to ``pd.read_json(..., lines=True)``.
    chunksize : int, default 100_000
        Number of input lines parsed per chunk.
    type_map : mapping, optional
        Event-type name -> integer id. When omitted, the notebook-level
        ``type2id`` dict is used.

    Returns
    -------
    pd.DataFrame
        Columns ``session``, ``aid``, ``ts`` and ``type`` with one row per
        event, in file order. A session with an empty event list contributes
        no rows.

    Raises
    ------
    KeyError
        If an event lacks ``aid``/``ts``/``type`` or its type is not present
        in the mapping.
    """
    # Resolve the mapping lazily so the global is only needed when no
    # explicit mapping was supplied.
    mapping = type2id if type_map is None else type_map

    sessions, aids, tss, types = [], [], [], []

    # The chunked reader owns an open file handle; the ``with`` block makes
    # sure it is closed even if an event raises part-way through.
    with pd.read_json(fn, lines=True, chunksize=chunksize) as chunks:
        for chunk in chunks:
            # Walk the two columns directly: ``iterrows`` would build a
            # throwaway Series for every session, which is much slower.
            for session, events in zip(chunk['session'], chunk['events']):
                # Repeat the session id once per event so all four lists
                # stay aligned.
                sessions.extend([session] * len(events))
                for event in events:
                    aids.append(event['aid'])
                    tss.append(event['ts'])
                    types.append(mapping[event['type']])

    return pd.DataFrame(data={'session': sessions, 'aid': aids, 'ts': tss, 'type': types})

In [None]:
%%time

# Flatten the raw training sessions into one row per event.
train_df = jsonl_to_df('data/train.jsonl')

# A tiny bit of further memory footprint optimization: store the type ids as uint8.
train_df['type'] = train_df['type'].astype(np.uint8)

# Write the same frame in both formats, leaving the index out.
train_df.to_parquet('train.parquet', index=False)
train_df.to_csv('train.csv', index=False)

# Drop the reference so the frame's memory can be reclaimed.
del train_df

In [None]:
%%time

# Flatten the raw test sessions into one row per event.
test_df = jsonl_to_df('data/test.jsonl')

# Store the type ids as uint8 to trim the memory footprint a little.
test_df['type'] = test_df['type'].astype(np.uint8)

# Write the same frame in both formats, leaving the index out.
test_df.to_parquet('test.parquet', index=False)
test_df.to_csv('test.csv', index=False)