In [1]:
# Mount Google Drive at /gdrive so the /gdrive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
# Install polars into the runtime; it is imported below as `pl`.
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import concurrent.futures
import pickle

from tqdm import tqdm
import numpy as np
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine

In [4]:
# Locales to process; candidates are generated and written separately for each one.
LOCALES = ["FR", "IT", "ES"]
# Number of most-similar products kept per product.
TOP_N = 300
# Version tag embedded in the candidate parquet file names.
VER = "17"

# Generate candidates

In [None]:
# Load the product table. The columns read later in this notebook are:
# id, locale, title, color, size, model, material, author, brand, desc.
product = pd.read_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/common/product.parquet")

In [None]:
def calculate_similar_items(product_id):
    """Return the TOP_N products most similar to ``product_id`` by TF-IDF cosine similarity.

    Reads the module-level ``df`` (products of the current locale) and
    ``tfidf_matrix`` (one TF-IDF row per row of ``df``), both of which are
    set by the per-locale loop before this function is called.

    Parameters
    ----------
    product_id
        Value from ``df["id"]`` identifying the query product.

    Returns
    -------
    tuple
        ``(product_id, similar_items_ids, similar_items_scores)`` where the
        two arrays are aligned, ordered by descending similarity, and hold at
        most ``TOP_N`` entries (fewer when the locale has fewer other products).

    Raises
    ------
    IndexError
        If ``product_id`` does not occur in ``df["id"]``.
    """
    # Row label(s) of the query product. df is built with reset_index(), so
    # its labels coincide with row positions in tfidf_matrix.
    query_rows = df.index[df["id"] == product_id]
    query_vec = tfidf_matrix[query_rows][0]

    # Similarity of every product in the locale to the query product.
    similarity = cosine_similarity(tfidf_matrix, query_vec).flatten()
    similarity_scores = pd.Series(similarity, index=df.index)

    # Remove the query product explicitly instead of assuming it sorts first:
    # with tied scores (duplicate text, all-zero vectors) or float round-off it
    # is not guaranteed to land at position 0, which would leave the product in
    # its own candidate list and silently drop a real neighbour.
    similarity_scores = similarity_scores.drop(index=query_rows)

    # nlargest returns the values sorted in descending order and resolves ties
    # by first occurrence, so the output is deterministic.
    similar_items = similarity_scores.nlargest(TOP_N)
    similar_items_ids = df.loc[similar_items.index, "id"].values
    similar_items_scores = similar_items.values

    return (product_id, similar_items_ids, similar_items_scores)

# For every locale: vectorise the product text with TF-IDF, look up the most
# similar products for each product in parallel, rank them, and write the
# result to one parquet file per locale.
for locale in LOCALES:
    print(f"start {locale}...")
    locales, items, candidate_items, similarity_scores = [], [], [], []

    # Restrict to the current locale. The index is reset so that row labels
    # match row positions in the TF-IDF matrix built below.
    df = product[product["locale"] == locale].reset_index(drop=True)
    product_ids = df["id"].to_list()

    # Build the TF-IDF vectoriser.
    tfidf = TfidfVectorizer()

    # Concatenate the textual product attributes into one document per product;
    # missing values become empty strings.
    text_columns = ["title", "color", "size", "model", "material", "author", "brand", "desc"]
    text_data = df[text_columns[0]].fillna("")
    for column in text_columns[1:]:
        text_data = text_data + " " + df[column].fillna("")

    # Build the TF-IDF matrix (one row per product in df).
    tfidf_matrix = tfidf.fit_transform(text_data)

    # calculate_similar_items reads the module-level df / tfidf_matrix.
    # NOTE(review): this relies on worker processes inheriting those globals
    # (fork start method) - confirm on the target platform.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = list(tqdm(executor.map(calculate_similar_items, product_ids), total=len(product_ids)))

    for product_id, similar_items_ids, similar_items_scores in results:
        # A product can have fewer than TOP_N neighbours (small locale), so
        # repeat the key columns by the actual number of results; repeating
        # them TOP_N times would produce columns of unequal length.
        n_found = len(similar_items_ids)
        locales.extend([locale] * n_found)
        items.extend([product_id] * n_found)
        candidate_items.extend(list(similar_items_ids))
        similarity_scores.extend(list(similar_items_scores))

    # Collect the results into a data frame.
    similar_products = pd.DataFrame({
        "locale": locales,
        "item": items,
        "candidate_item": candidate_items,
        "similarity_score": similarity_scores,
    })

    # Add the per-item similarity rank (1 = most similar; ties share the lowest rank).
    similar_products = pl.from_pandas(similar_products)
    similar_products = similar_products \
    .sort(["item", "similarity_score"], descending=[False, True]) \
    .with_columns(pl.col("similarity_score").rank(descending=True, method="min").over("item").alias("similarity_rank"))

    # Write the candidates for this locale.
    file_name = f"similar_products_{VER}_{locale}.parquet"
    similar_products.write_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/interim/candidates/task2/" + file_name)

start FR...


100%|██████████| 44577/44577 [05:16<00:00, 140.65it/s]


start IT...


100%|██████████| 50461/50461 [06:23<00:00, 131.45it/s]


start ES...


100%|██████████| 42503/42503 [04:47<00:00, 147.90it/s]


# MRR@100

In [5]:
# Load the task 2 training sessions. The cells below read its
# prev_items, next_item, locale and session_id columns.
train = pl.read_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task2/train_task2.parquet")

In [6]:
# Stack the per-locale candidate files into a single frame. Iterating LOCALES
# keeps this in sync with the generation loop, which writes one file per
# entry of the same constant.
candidates = pl.concat([
    pl.read_parquet(f"/gdrive/MyDrive/amazon_kdd_2023/data/interim/candidates/task2/similar_products_{VER}_{locale}.parquet")
    for locale in LOCALES
])

In [7]:
# Keep only candidates whose similarity score is non-zero, then persist the
# combined (all-locale) candidate table.
has_nonzero_similarity = pl.col("similarity_score") != 0
candidates = candidates.filter(has_nonzero_similarity)
candidates.write_parquet(
    f"/gdrive/MyDrive/amazon_kdd_2023/data/interim/candidates/task2/similar_products_{VER}.parquet"
)

In [8]:
# Extract the last element of each session's prev_items as a new "last_item" column.
last_item = pl.Series(
    name="last_item",
    values=[session_items[-1] for session_items in train["prev_items"].to_list()],
)
train = train.with_columns(last_item)

In [9]:
# Keep only the columns needed for the candidate join and evaluation.
train = train.select(["locale", "session_id", "last_item", "next_item"])

In [10]:
# Attach the candidates of each session's last item (matched on item + locale)
# and flag the candidate that equals the session's actual next item.
is_next_item = (pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8)
df = (
    train
    .join(candidates, left_on=["last_item", "locale"], right_on=["item", "locale"], how="left")
    .with_columns(is_next_item.alias("label"))
)

# One list of labels per session, sessions kept in first-appearance order.
# NOTE(review): the MRR cell treats the position inside each list as the
# candidate's rank, so it presumably relies on the join keeping candidates in
# the order they were stored (by descending similarity) - confirm.
label_lists = (
    df.groupby("session_id", maintain_order=True)
    .all()["label"]
    .to_list()
)

In [11]:
# MRR computation
def mean_reciprocal_rank(label_lists, k=100):
    """Compute MRR@k over per-session label lists.

    Parameters
    ----------
    label_lists
        Sequence with one list per session; an entry equal to 1 marks the
        correct candidate, and the position in the list is taken as its rank.
    k
        Only the first ``k`` entries of each list are considered.

    Returns
    -------
    float
        Mean over all sessions of ``1 / rank`` of the first hit within the
        top ``k`` (sessions without a hit contribute 0). Returns 0.0 when
        ``label_lists`` is empty instead of dividing by zero.
    """
    if not label_lists:
        return 0.0

    rr = 0
    for labels in label_lists:
        for position, label in enumerate(labels[:k], start=1):
            if label == 1:
                # Only the first hit counts towards the reciprocal rank.
                rr += 1 / position
                break
    return rr / len(label_lists)


mrr = mean_reciprocal_rank(label_lists, k=100)
print("MRR:", round(mrr, 5))

MRR: 0.25888
