# Settings

In [1]:
import os
from collections import defaultdict, Counter
from typing import List, Dict, Union
import pickle

from tqdm import tqdm
import numpy as np
import polars as pl
from gensim.models import Word2Vec

from scripts.metrics import map_at_k



In [2]:
# Directory holding the raw competition CSVs (train_log / train_label / test_log)
INPUT_DIR = "../../input/raw/"
# Directory the nearest-neighbour candidate parquet files are written to
OUTPUT_DIR = "./candidates/"

# write_parquet() does not create missing directories, so make sure the output
# location exists before any later cell tries to write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Number of nearest neighbours kept per item (passed as `k` to make_nns_matrix)
TOP_N = 5
# Seed handed to Word2Vec in train_word2vec
SEED = 42

In [4]:
def explode_and_add_seq_no(df:pl.DataFrame) -> pl.DataFrame:
    """Turn one-row-per-session item lists into one-row-per-item rows with a position column.

    Args:
        df: Frame with a ``session_id`` column and a list column ``prev_items``.

    Returns:
        The exploded frame (one row per element of ``prev_items``) with an extra
        Int64 column ``seq_no`` holding the running count of rows within each
        ``session_id``.
    """
    exploded = df.explode(["prev_items"])

    # Running row counter restarted for every session.
    # NOTE(review): newer polars releases rename `cumcount` to `cum_count`;
    # re-check the starting index if the dependency is upgraded.
    seq_no = (
        pl.col("session_id")
        .cumcount()
        .over("session_id")
        .alias("seq_no")
        .cast(pl.Int64)
    )
    return exploded.with_columns(seq_no)

In [5]:
def train_word2vec(df:pl.DataFrame) -> Word2Vec:
    """Train an item2vec model, treating every session as a sentence of ``yad_no`` tokens.

    Args:
        df: Log frame with ``session_id`` and ``yad_no`` columns. No sorting is
            done here, so the items of a session are used in the row order of
            ``df``.

    Returns:
        The fitted gensim ``Word2Vec`` model.
    """
    # Build one item sequence per session (groups kept in input order)
    per_session = df.group_by("session_id", maintain_order=True).agg(pl.col("yad_no"))
    aid_sequences = per_session["yad_no"].to_list()

    # Train word2vec
    hyperparams = dict(
        epochs=50,
        vector_size=100,
        window=10,
        # Negative exponent: negative sampling favours low-frequency items
        ns_exponent=-0.5,
        # NOTE: gensim only guarantees bit-for-bit reproducible runs with a
        # single worker; with 8 workers the seed fixes initialisation only.
        workers=8,
        # Keep every item, even ones that appear a single time
        min_count=1,
        seed=SEED,
    )
    return Word2Vec(sentences=aid_sequences, **hyperparams)

In [17]:
def make_nns_matrix(w2v, k):
    """Build a long-format table of each item's nearest neighbours in the embedding space.

    For every key in ``w2v.wv.index_to_key`` the most similar keys are looked up
    with ``w2v.wv.most_similar(..., topn=k)`` and ranked by similarity
    (rank 1 = most similar).

    Args:
        w2v: Trained model exposing ``wv.index_to_key`` and ``wv.most_similar``
            (a gensim ``Word2Vec`` in this notebook).
        k: Maximum number of neighbours to keep per item.

    Returns:
        pl.DataFrame with columns ``yad_no`` (query item), ``candidate_yad_no``
        (neighbour) and ``i2v_rank``. The raw similarity score is dropped.
    """
    aid_xs = []
    aid_ys = []
    sims = []
    for aid in tqdm(w2v.wv.index_to_key):
        nns = w2v.wv.most_similar(aid, topn=k)
        # most_similar returns fewer than k rows when the vocabulary has fewer
        # than k + 1 items, so repeat the query id once per neighbour actually
        # returned; otherwise the three columns end up with different lengths.
        aid_xs.extend([aid] * len(nns))
        aid_ys.extend(neighbour for neighbour, _ in nns)
        sims.extend(similarity for _, similarity in nns)
    df = pl.DataFrame({"yad_no": aid_xs, "candidate_yad_no": aid_ys, 'i2v_similality': sims})

    # Compute the rank of each neighbour within its query item. The default
    # rank method is "average", which is why i2v_rank is a float column.
    df = df.with_columns(
        pl.col("i2v_similality").rank(descending=True).over("yad_no").alias("i2v_rank")
    ).drop("i2v_similality")

    return df

# For local train/eval

In [18]:
# Load the raw train and test session logs from INPUT_DIR
train_log, test_log = (
    pl.read_csv(os.path.join(INPUT_DIR, file_name))
    for file_name in ("train_log.csv", "test_log.csv")
)

In [19]:
# Stack the train and test logs row-wise so item2vec is trained on the sessions of both files
log = pl.concat([train_log, test_log], how="vertical")

In [20]:
# Train the word2vec model on the combined logs (labels are not included here)
model = train_word2vec(log)

# Persist the model; the target directory must exist before save() is called,
# so create ./features/ when running from a fresh checkout.
model_path = "./features/item2vec_for_train_or_eval.model"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [21]:
# Look up the TOP_N nearest neighbours of every item in the trained vocabulary
nns_matrix = make_nns_matrix(model, TOP_N)

100% 13562/13562 [00:10<00:00, 1305.64it/s]


In [22]:
# Save the neighbour table built from the label-free logs (used for local train/eval)
nns_matrix.write_parquet(os.path.join(OUTPUT_DIR, "item2vec_for_train_or_eval.parquet"))

# MAP@k=10

In [23]:
# Re-read the training log so this evaluation section starts from the raw file
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))

# Rename the label column on load so it stays distinct from the log's own
# `yad_no` when both frames are joined on session_id further down
label_path = os.path.join(INPUT_DIR, "train_label.csv")
train_label = pl.read_csv(label_path).rename({"yad_no":"label_yad_no"})

In [24]:
# Last viewed item of every training session.
# Sort by seq_no explicitly (as the label-append cell does) so that "last" means
# the highest seq_no rather than whatever row happens to come last in the CSV,
# and keep the groups in a deterministic order.
last_items = (
    train_log
    .sort(["session_id", "seq_no"])
    .group_by("session_id", maintain_order=True)
    .last()
)

In [25]:
# Reload the neighbour table written for local train/eval
nns_matrix = pl.read_parquet(os.path.join(OUTPUT_DIR, "item2vec_for_train_or_eval.parquet"))

In [26]:
# For every training session, use the nearest neighbours of its last viewed item
# as ranked candidates and flag the candidate (if any) that equals the label.
prediction = (
    last_items
    .join(nns_matrix, on="yad_no", how="left")
    .join(train_label, on="session_id", how="left")
    .sort(["session_id", "i2v_rank"], descending=[False, False])
    .with_columns(
        (pl.col("candidate_yad_no") == pl.col("label_yad_no"))
        # A missing candidate or label makes the comparison null, which simply
        # means "not relevant". Only this flag is filled: a frame-wide
        # fill_null(0) would also turn missing ids/ranks into a fake 0.
        .fill_null(False)
        .cast(pl.Int8)
        .alias("user_relevance")
    )
)

In [36]:
# Inspect the ranked candidates and their relevance flags
prediction

session_id,seq_no,yad_no,candidate_yad_no,i2v_rank,label_yad_no,user_relevance
str,i64,i64,i64,f64,i64,i8
"""000007603d533d…",0,2395,11882,1.0,4101,0
"""000007603d533d…",0,2395,2808,2.0,4101,0
"""000007603d533d…",0,2395,4101,3.0,4101,1
"""000007603d533d…",0,2395,3324,4.0,4101,0
"""000007603d533d…",0,2395,5289,5.0,4101,0
"""0000ca043ed437…",0,13535,8253,1.0,8253,1
"""0000ca043ed437…",0,13535,8747,2.0,8253,0
"""0000ca043ed437…",0,13535,6516,3.0,8253,0
"""0000ca043ed437…",0,13535,2259,4.0,8253,0
"""0000ca043ed437…",0,13535,4488,5.0,8253,0


In [27]:
# Collect, per session, the 0/1 relevance flags in the rank order established above
relevance_per_session = (
    prediction
    .group_by("session_id", maintain_order=True)
    .agg(pl.col("user_relevance"))
)
user_relevances = relevance_per_session["user_relevance"].to_list()

In [28]:
# Score the per-session relevance lists with map_at_k from scripts.metrics.
# NOTE(review): presumably the second argument is the cutoff k — confirm. The
# neighbour table was built with k=TOP_N (5), so no session has more than 5 candidates.
map_at_k(user_relevances, 10)

0.12351378718707202

# For test

In [29]:
# Reload all three raw files; this also restores train_label's original
# `yad_no` column name, which the evaluation section above had renamed
train_log, train_label, test_log = (
    pl.read_csv(os.path.join(INPUT_DIR, file_name))
    for file_name in ("train_log.csv", "train_label.csv", "test_log.csv")
)

In [30]:
# Append each training session's label to the end of its log

# Sort the labels once and reuse this frame everywhere below. The sequences are
# built in sorted session order, so attaching them to the unsorted label frame
# would pair sequences with the wrong session_id.
train_label_sorted = train_label.sort("session_id")

session_sequences = (
    train_log
    .sort(["session_id", "seq_no"])
    .group_by("session_id", maintain_order=True)
    .agg(pl.col("yad_no"))
)

# zip() would silently truncate or mis-pair rows if the two files did not cover
# exactly the same sessions, so fail loudly instead.
assert (
    session_sequences["session_id"].to_list()
    == train_label_sorted["session_id"].to_list()
), "train_log and train_label must contain the same session_ids"

prev_items_list = session_sequences["yad_no"].to_list()
next_item_list = train_label_sorted["yad_no"].to_list()

# Viewed items followed by the label, one list per session
prev_items_list_updated = [
    prev_items + [next_item]
    for prev_items, next_item in zip(prev_items_list, next_item_list)
]

train_log = train_label_sorted.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

train_log = explode_and_add_seq_no(train_log) \
    .drop("yad_no") \
    .rename({"prev_items" : "yad_no"}) \
    [["session_id", "seq_no", "yad_no"]] # reorder columns to match the raw log layout

In [31]:
# Stack the label-extended train log on top of the test log for the final model
log = pl.concat([train_log, test_log], how="vertical")

In [32]:
# Train the word2vec model on the logs that now include the training labels
model = train_word2vec(log)

# Persist the model; the target directory must exist before save() is called,
# so create ./features/ when running from a fresh checkout.
model_path = "./features/item2vec_for_test.model"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [33]:
# Look up the TOP_N nearest neighbours of every item in the label-aware model
nns_matrix = make_nns_matrix(model, TOP_N)

100% 13806/13806 [00:08<00:00, 1694.83it/s]


In [34]:
# Save the neighbour table used to generate candidates for the test sessions
nns_matrix.write_parquet(os.path.join(OUTPUT_DIR, "item2vec_for_test.parquet"))

In [35]:
# Quick look at the first rows of the saved neighbour table
nns_matrix.head()

yad_no,candidate_yad_no,i2v_rank
i64,i64,f64
12350,915,1.0
12350,3338,2.0
12350,3137,3.0
12350,5157,4.0
12350,4169,5.0
