In [1]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
import random
random.seed(42)

from scipy.sparse import lil_matrix

import gc

import implicit

In [2]:
# Input / output locations, keyed by dataset split name.
path_input, path_input_label, path_output = {}, {}, {}
for i in ("train1", "valid"):
    path_output[i] = f"candidate_{i}.jsonl"
    path_input_label[i] = f"/kaggle/input/otto-make-data-label/label_{i}.parquet"
    path_input[i] = f"/kaggle/input/otto-make-data-train-test-split/{i}.jsonl"

# Tunables read by the cells below.
event_type = "clicks"   # which event type the matrix is built from
alpha = 500             # value stored for every observed (aid, session) pair
sample_size = 150_000   # rows per chunk when streaming the JSONL input

# Set to True for a quick smoke run; the only effect visible here is that
# num_top_item shrinks from 500000 to 100.
debug = False
num_top_item = 100 if debug else 500_000

In [3]:
# Train an implicit-feedback ALS model on the pre-cutoff `event_type` events,
# produce 20 candidate aids per session, persist the artifacts, and report how
# many labelled aids the candidates cover.


def _iter_sessions(path, chunksize):
    """Yield ``(session_id, events_before_cutoff)`` for every row of a JSONL file.

    The file is streamed in chunks of ``chunksize`` rows (one tqdm tick per
    chunk). Only ``events[:cutoff]`` is yielded because events at or after the
    cutoff must not be used.
    """
    chunks = pd.read_json(path, lines=True, chunksize=chunksize)
    for df_chunk in tqdm(chunks):
        # Iterate the three columns directly instead of DataFrame.iterrows(),
        # which builds a full Series object for every row.
        for session_id, events, cutoff in zip(
            df_chunk["session"], df_chunk["events"], df_chunk["cutoff"]
        ):
            yield session_id, events[:cutoff]


for i in ["train1"]:
    # Pass 1: count pre-cutoff events per aid, separately for each event type.
    dict_count = {"clicks": {}, "carts": {}, "orders": {}}
    for _, list_event in _iter_sessions(path_input[i], sample_size):
        for event in list_event:
            counts = dict_count[event["type"]]
            counts[event["aid"]] = counts.get(event["aid"], 0) + 1
    # For each event type, sort aids by descending count and keep only the aid.
    list_rank = {}
    for types in ["clicks", "carts", "orders"]:
        list_rank[types] = [j[0] for j in sorted(dict_count[types].items(), key=lambda x: x[1], reverse=True)]
    # The `num_top_item` most frequent aids for the configured event type.
    list_aid_rank = list_rank[event_type][:num_top_item]
    set_aid_rank = set(list_aid_rank)

    # Pass 2: record the sessions that have at least one `event_type` event on
    # a top-ranked aid, together with the aids that actually occur in them.
    list_session = []
    set_aid = set()
    for session_id, list_event in _iter_sessions(path_input[i], sample_size):
        list_aid_ses = [
            event["aid"]
            for event in list_event
            if event["aid"] in set_aid_rank and event["type"] == event_type
        ]
        if len(list_aid_ses) > 0:  # keep only sessions with at least one matching event
            list_session.append(session_id)
            set_aid |= set(list_aid_ses)
    list_aid = list(set_aid)
    set_session = set(list_session)
    print(i, len(list_aid))
    print(i, len(list_session))

    # Lookup tables from raw ids to matrix indices.
    session2idx = dict(zip(list_session, range(len(list_session))))
    aid2idx = dict(zip(list_aid, range(len(list_aid))))
    # Empty interaction matrix: one row per aid, one column per session.
    mat = lil_matrix((len(list_aid), len(list_session)))
    # Pass 3: fill the matrix. Repeated events on the same (aid, session) pair
    # overwrite the cell, so every observed pair ends up with the value `alpha`.
    for session_id, list_event in _iter_sessions(path_input[i], sample_size):
        # Resolve the column once per session rather than once per event;
        # sessions that were not retained in pass 2 have no column.
        session_idx = session2idx.get(session_id)
        if session_idx is None:
            continue
        for event in list_event:
            if event["type"] == event_type and event["aid"] in set_aid:
                mat[
                    aid2idx[event["aid"]],  # row for this aid
                    session_idx,            # column for this session
                ] = alpha
    model = implicit.als.AlternatingLeastSquares(
        factors = 600,
        iterations = 50,
        calculate_training_loss = True,
        random_state = 46,
    )
    # The model is fitted on the (aid x session) matrix ...
    model.fit(mat.tocsr())
    # ... and queried with its transpose. Convert to CSR once and reuse the
    # same object for recommending and for the pickle written below.
    # NOTE(review): implicit documents `user_items` as a CSR matrix - confirm
    # against the installed implicit version.
    user_items = mat.T.tocsr()
    recom = model.recommend_all(user_items, N = 20)
    # Map recommended matrix indices back to aids, per session.
    pred = dict()
    for session, session_idx in tqdm(session2idx.items()):
        pred[session] = [list_aid[aid_idx] for aid_idx in recom[session_idx, :]]
    # Print the first three sessions as a sanity check.
    for k, j in enumerate(pred.keys()):
        print(j, pred[j])
        if k == 2: break
    pd.to_pickle(pred, f"imf_rules_{event_type}_{i}.pickle")
    pd.to_pickle(session2idx, f"session2idx_{event_type}_{i}.pickle")
    pd.to_pickle(user_items, f"user_items_{event_type}_{i}.pickle")
    pd.to_pickle(model, f"model_{event_type}_{i}.pickle")
    pd.to_pickle(list_aid, f"list_aid_{event_type}_{i}.pickle")
    pd.to_pickle(list_session, f"list_session_{event_type}_{i}.pickle")

    # Coverage: fraction of labelled aids that appear among a session's candidates.
    df_label = pd.read_parquet(path_input_label[i])
    n_all, n_hit = 0, 0
    for ses, label in tqdm(
        zip(df_label["session"], df_label[f"label_{event_type}"]),
        total = len(df_label),
    ):
        n_all += len(label)
        set_label = set(label)
        n_hit += sum(1 for x in pred.get(ses, []) if x in set_label)

    # Guard against an empty label set instead of raising ZeroDivisionError.
    coverage = n_hit / n_all if n_all else float("nan")
    print(f"{i}: {coverage}")


0it [00:00, ?it/s]

0it [00:00, ?it/s]

train1 500000
train1 2147495


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/2147495 [00:00<?, ?it/s]

  0%|          | 0/2147495 [00:00<?, ?it/s]

8643220 [399315, 1768884, 1308823, 1405280, 1189975, 406885, 1710336, 1337750, 956562, 1148482, 226746, 1229555, 1590709, 1327520, 749023, 1797010, 1368814, 1744125, 408083, 476200]
8643221 [1202093, 585708, 867423, 6803, 1797158, 1033849, 578092, 1229501, 1528935, 1543291, 476042, 408243, 1190772, 1004996, 380483, 1680498, 178786, 35774, 1680116, 460906]
8643222 [1414418, 1809002, 542022, 1581056, 299338, 1070151, 1397070, 1476510, 121184, 1309666, 317032, 321920, 1251424, 816301, 1464271, 1601939, 1473902, 1208169, 208430, 1495832]


0it [00:00, ?it/s]

train1: 0.12695911219212871
