In [1]:
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Input JSONL file and output parquet file for each dataset split, keyed by split name.
path_input = {
    i: f"/kaggle/input/otto-make-data-train-test-split/{i}.jsonl"
    for i in ("train1", "valid")
}
path_output = {i: f"label_{i}.parquet" for i in ("train1", "valid")}

# Number of JSONL lines (sessions) read per chunk.
sample_size = 150000

In [3]:
list_cols = ["session", "label_clicks", "label_carts", "label_orders", "label_any", "cutoff"]

# Maximum number of distinct clicked aids kept per session, by dataset split:
# "train1" keeps up to 10, "valid" keeps only the first click after the cutoff.
click_label_limit = {"train1": 10, "valid": 1}


def _extract_labels(events, cutoff, max_clicks):
    """Build the ground-truth label sets for one session.

    Parameters
    ----------
    events : list of dict
        The session's events; each event is read through its "type" and "aid" keys.
    cutoff : int
        Index of the first event treated as ground truth (``events[cutoff:]``).
    max_clicks : int
        Once this many distinct clicked aids are collected, later clicks are ignored.

    Returns
    -------
    dict
        Keys "clicks", "carts", "orders" and "any"; each value is a set of aids.
        "any" is the union of the other three.
    """
    labels = {"clicks": set(), "carts": set(), "orders": set()}
    for event in events[cutoff:]:
        kind = event["type"]
        if kind == "clicks":
            # Clicks are capped; carts and orders are collected without a limit.
            if len(labels["clicks"]) < max_clicks:
                labels["clicks"].add(event["aid"])
        elif kind in ("carts", "orders"):
            labels[kind].add(event["aid"])
    labels["any"] = labels["clicks"] | labels["carts"] | labels["orders"]
    return labels


for i in ["train1", "valid"]:
    # Collect the output columns as a dict of lists (one list per column).
    dict_cols = {col: [] for col in list_cols}
    # The chunked reader is a context manager; `with` closes the file even if a chunk fails.
    with pd.read_json(path_input[i], lines=True, chunksize=sample_size) as chunks:
        for df_chunk in tqdm(chunks):
            # Iterate only the three columns that are used, instead of building
            # a Series per row with iterrows().
            rows = zip(df_chunk["session"], df_chunk["events"], df_chunk["cutoff"])
            for session_id, events, cutoff in rows:
                labels = _extract_labels(events, cutoff, click_label_limit[i])
                dict_cols["session"].append(session_id)
                dict_cols["label_clicks"].append(labels["clicks"])
                dict_cols["label_carts"].append(labels["carts"])
                dict_cols["label_orders"].append(labels["orders"])
                dict_cols["label_any"].append(labels["any"])
                dict_cols["cutoff"].append(cutoff)

    df_truth = pd.DataFrame(dict_cols, columns=list_cols)
    df_truth.to_parquet(path_output[i])
    display(df_truth)

0it [00:00, ?it/s]

Unnamed: 0,session,label_clicks,label_carts,label_orders,label_any,cutoff
0,8643220,{1189975},{},{},{1189975},1
1,8643221,{111691},{},{},{111691},1
2,8643222,"{1104129, 1772730}",{},{},"{1104129, 1772730}",9
3,8643223,{1742315},{},{},{1742315},3
4,8643224,{471589},{},"{1502227, 631899, 1063371, 945574}","{1502227, 471589, 1063371, 945574, 631899}",30
...,...,...,...,...,...,...
2237920,11098472,{},{},{},{},8
2237921,11098478,{261405},{},{},{261405},1
2237922,11098483,{46115},{},{},{46115},1
2237923,11098494,{1733630},{},{},{1733630},1


0it [00:00, ?it/s]

Unnamed: 0,session,label_clicks,label_carts,label_orders,label_any,cutoff
0,11098528,{796572},{},"{92401, 1462506, 1561739, 950341}","{92401, 950341, 1462506, 1561739, 796572}",20
1,11098529,{1298277},{},{},{1298277},2
2,11098530,{409236},{409236},{409236},{409236},2
3,11098531,{1271998},{},"{1365569, 396199, 1728212, 452188, 1271998}","{1365569, 1728212, 396199, 452188, 1271998}",16
4,11098532,{463529},{},{},{463529},12
...,...,...,...,...,...,...
1778712,12899774,{1399483},{},{},{1399483},1
1778713,12899775,{1760714},{},{},{1760714},1
1778714,12899776,{1737908},{},{},{1737908},1
1778715,12899777,{384045},{},{},{384045},1


In [4]:
# df_truth is the frame left over from the last loop iteration ("valid").
# Count how many distinct carted aids each session has in its ground truth.
df_truth["num_cart"] = df_truth["label_carts"].map(len)

In [5]:
# Show only the sessions whose ground truth contains at least one carted aid.
df_truth.query("num_cart > 0")

Unnamed: 0,session,label_clicks,label_carts,label_orders,label_any,cutoff,num_cart
2,11098530,{409236},{409236},{409236},{409236},2,1
5,11098533,{234138},"{129259, 1328174, 1536846, 702134, 1233050}","{1233050, 652916, 1189919}","{129259, 1536846, 1328174, 652916, 702134, 234...",176,5
6,11098534,{1342293},{223062},{223062},"{1342293, 223062}",8,1
8,11098536,{1539309},{649909},{},"{1539309, 649909}",1,1
9,11098537,{1189975},"{1409748, 358965}",{1409748},"{1409748, 358965, 1189975}",5,2
...,...,...,...,...,...,...,...
1778615,12899676,{},{35328},{},{35328},2,1
1778625,12899686,{1627951},{1627951},{},{1627951},1,1
1778652,12899713,{1097818},{1097818},{},{1097818},1,1
1778670,12899732,{1126169},{1126169},{},{1126169},1,1
