In [1]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
import random
random.seed(42)

from scipy.sparse import lil_matrix

import gc

import implicit

In [2]:
# Per-split file locations: raw session data, label tables, and candidate output files.
path_input, path_input_label, path_output = {}, {}, {}
for i in ("train3", "train2", "train1", "valid"):
    path_input[i] = f"/kaggle/input/otto-make-data-train-test-split/{i}.jsonl"
    path_input_label[i] = f"/kaggle/input/otto-make-data-label/label_{i}.parquet"
    path_output[i] = f"candidate_{i}.jsonl"

# Number of rows handed to pd.read_json per chunk.
sample_size = 150_000
# Event types that are counted and fed into the interaction matrix.
event_type = ["clicks", "carts"]

# Set to True for a quick check run (original note: "verify with a single chunk").
# In this cell its only effect is to shrink num_top_item to 100.
debug = False
# Number of most frequent aids that are kept.
num_top_item = 100 if debug else 500000

In [3]:
# Build BPR candidates from click/cart history: count events per aid, keep the
# most frequent aids, build an (aid x session) interaction matrix, fit the
# model, store the artefacts, and report how many labels the candidates cover.
def _iter_sessions(path):
    """Yield ``(session_id, usable_events)`` for every session stored in ``path``.

    The JSON-lines file is streamed in chunks of ``sample_size`` rows and a
    progress bar is shown per pass. Only the events before each row's
    ``cutoff`` position are yielded, because later events must not be used.
    """
    chunks = pd.read_json(path, lines = True, chunksize = sample_size)
    for df_chunk in tqdm(chunks):
        # Zipping the columns avoids building one Series per row like iterrows() does.
        for session_id, events, cutoff in zip(df_chunk["session"], df_chunk["events"], df_chunk["cutoff"]):
            yield session_id, events[:cutoff]


for i in ["train1"]:
    # Pass 1: count, per aid, the events whose type is listed in event_type.
    dict_count = {}
    for _, list_event in _iter_sessions(path_input[i]):
        for event in list_event:
            if event["type"] in event_type:
                dict_count[event["aid"]] = dict_count.get(event["aid"], 0) + 1
    # aids sorted by descending count (pooled over all configured event types);
    # the stable sort keeps first-seen order among ties.
    list_rank = sorted(dict_count, key = dict_count.get, reverse = True)
    # The num_top_item most frequent aids.
    list_aid_rank = list_rank[:num_top_item]
    set_aid_rank = set(list_aid_rank)

    # Pass 2: record the sessions that interacted with at least one top aid,
    # together with the aids that actually occur in those sessions.
    list_session = []
    set_aid = set()
    for session_id, list_event in _iter_sessions(path_input[i]):
        list_aid_ses = [event["aid"] for event in list_event if event["aid"] in set_aid_rank and event["type"] in event_type]
        if list_aid_ses: # keep only sessions with at least one qualifying event
            list_session.append(session_id)
            set_aid |= set(list_aid_ses)
    list_aid = list(set_aid)
    set_session = set(list_session)
    print(i, len(list_aid))
    print(i, len(list_session))

    # Lookup tables from raw ids to matrix indices.
    session2idx = {session_id: idx for idx, session_id in enumerate(list_session)}
    aid2idx = {aid: idx for idx, aid in enumerate(list_aid)}
    # Empty interaction matrix: one row per aid, one column per session.
    mat = lil_matrix((len(list_aid), len(list_session)))
    # Pass 3: mark every (aid, session) pair that has a qualifying event.
    for session_id, list_event in _iter_sessions(path_input[i]):
        session_idx = session2idx.get(session_id)
        if session_idx is None: # session was not kept in pass 2
            continue
        for event in list_event:
            if event["type"] not in event_type:
                continue
            aid_idx = aid2idx.get(event["aid"])
            if aid_idx is not None:
                mat[aid_idx, session_idx] = 1

    # NOTE(review): fit() receives the (aid x session) matrix and recommend_all()
    # its transpose; this looks like the item-user convention of older `implicit`
    # releases -- confirm against the installed version before upgrading.
    model = implicit.bpr.BayesianPersonalizedRanking(
        factors = 200,
        iterations = 7000,
        learning_rate = 0.01,
    )
    model.fit(mat.tocsr())
    recom = model.recommend_all(mat.T, N = 20)

    # Translate the recommended matrix indices back to aids, per session.
    pred = dict()
    for session, session_idx in tqdm(session2idx.items()):
        pred[session] = [list_aid[aid_idx] for aid_idx in recom[session_idx, :]]
    # Show the first three sessions as a sanity check.
    for k, j in enumerate(pred.keys()):
        print(j, pred[j])
        if k == 2: break

    # Persist predictions, lookup tables, matrix and model for later notebooks.
    suffix = f"{'x'.join(event_type)}_{i}"
    pd.to_pickle(pred, f"bpr_rules_{suffix}.pickle")
    pd.to_pickle(session2idx, f"session2idx_{suffix}.pickle")
    pd.to_pickle(mat.T.tocsr(), f"user_items_{suffix}.pickle")
    pd.to_pickle(model, f"model_{suffix}.pickle")
    pd.to_pickle(list_aid, f"list_aid_{suffix}.pickle")
    pd.to_pickle(list_session, f"list_session_{suffix}.pickle")

    if i != "test":
        # Coverage: fraction of label aids that appear among a session's candidates.
        # The label table does not depend on the event type, so read it only once.
        df_label = pd.read_parquet(path_input_label[i])
        for etype in event_type:
            n_all, n_hit = 0, 0
            for ses, label in tqdm(zip(df_label["session"], df_label[f"label_{etype}"]), total = len(df_label)):
                n_all += len(label)
                pred_ses = pred.get(ses, [])
                if pred_ses:
                    # Set membership instead of a linear scan per candidate.
                    set_label = set(label)
                    n_hit += sum(1 for x in pred_ses if x in set_label)

            # Avoid ZeroDivisionError when there are no labels at all.
            coverage = n_hit / n_all if n_all else float("nan")
            print(f"{i}, {etype}: {coverage}")
    # In debug mode stop after the first split (every chunk of that split is still read).
    if debug: break

0it [00:00, ?it/s]

0it [00:00, ?it/s]

train1 500000
train1 2146834


0it [00:00, ?it/s]

  0%|          | 0/7000 [00:00<?, ?it/s]

  0%|          | 0/2146834 [00:00<?, ?it/s]

  0%|          | 0/2146834 [00:00<?, ?it/s]

8643220 [399315, 1308823, 1768884, 1283505, 1337750, 476200, 934527, 935968, 1171383, 1468416, 1405280, 406885, 923865, 1409748, 199994, 994478, 1710336, 1744125, 1579489, 663472]
8643221 [541674, 585708, 1190772, 532791, 599132, 1021020, 201963, 135162, 1543291, 380483, 217517, 1734557, 1202093, 1563400, 1786521, 1093709, 536431, 1228552, 157926, 68470]
8643222 [1414418, 542022, 955008, 1809002, 944186, 1397070, 1799121, 299338, 868805, 996165, 1809189, 1581056, 451807, 455725, 139945, 1070151, 278320, 1660609, 1476510, 979441]


0it [00:00, ?it/s]

train1, clicks: 0.11438932790120641


0it [00:00, ?it/s]

train1, carts: 0.07217388663660244
