In [1]:
# Mount Google Drive at /gdrive; the project data is read from and written to /gdrive/MyDrive below.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import polars as pl

In [4]:
VER = 27  # version tag embedded in the names of the candidate files written below
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"  # project root on the mounted Drive (trailing slash required: paths are built by concatenation)
LOCALES = ["IT", "FR", "ES"]  # locales kept when generating candidates
TOP_N = 50  # maximum number of same-brand candidates kept per (item, locale)

# Generate candidates

In [5]:
def generate_same_brand_popular_items(product, product_feat, locales=None, top_n=None):
    """Build same-brand popularity candidates for every product.

    For each product in the selected locales, the candidates are the other
    products that share its (locale, brand), ordered by purchase count
    (most purchased first) and truncated to ``top_n`` rows per product.

    Args:
        product: polars DataFrame containing at least the columns ``id``,
            ``locale`` and ``brand``.
        product_feat: polars DataFrame containing at least the columns
            ``id``, ``locale`` and ``P_purchase_count``.
        locales: list of locales to keep. Defaults to the module-level
            ``LOCALES`` when omitted.
        top_n: maximum number of candidates kept per (item, locale).
            Defaults to the module-level ``TOP_N`` when omitted.

    Returns:
        polars DataFrame with the columns ``item``, ``locale``,
        ``candidate_item`` and ``same_brand_rank``. The rank is computed
        with ``method="min"`` on the candidate's purchase count, so
        candidates with equal counts share the same rank.
    """
    if locales is None:
        locales = LOCALES
    if top_n is None:
        top_n = TOP_N

    # Restrict to the target locales
    product = product[["id", "locale", "brand"]]
    product = product.filter(pl.col("locale").is_in(locales))

    # Attach the purchase counts; products without a feature row count as 0 purchases.
    # Only the count column is filled so that a missing brand is never replaced by a
    # filler value that would then act as a join key.
    product = product.join(product_feat[["id", "locale", "P_purchase_count"]], on=["id", "locale"], how="left")
    product = product.with_columns(pl.col("P_purchase_count").fill_null(0))

    # Pair every product with the other products of the same (locale, brand)
    # NOTE(review): rows with a null brand are expected not to match in this join - confirm
    df = product.join(product, on=["locale", "brand"])
    df = df.filter(
        pl.col("id") != pl.col("id_right")
    )

    # Most purchased candidates first. The candidate id is the final sort key so that
    # ties in purchase count are cut at top_n the same way on every run.
    df = df.sort(
        by=["id", "locale", "P_purchase_count_right", "id_right"],
        descending=[False, False, True, False],
    )
    df = df.rename({
        "id": "item",
        "id_right": "candidate_item"
    })

    # Keep the top_n most popular same-brand products per item, then rank them
    df = df.groupby(["item", "locale"], maintain_order=True).head(top_n)
    df = df.with_columns(
        pl.col("P_purchase_count_right").rank(descending=True, method="min").over(["item", "locale"]).alias("same_brand_rank")
    )
    df = df[["item", "locale", "candidate_item", "same_brand_rank"]]

    return df

## for train

In [6]:
# Load the shared product table and the product features for the train split (per the file name)
product = pl.read_parquet(DIR + "data/preprocessed/common/product.parquet")
product_feat_train = pl.read_parquet(DIR + "data/interim/features/task2/product_feature_train_12.parquet")

In [7]:
# Generate the same-brand popularity candidates from the train-split purchase counts
df = generate_same_brand_popular_items(product, product_feat_train)

In [8]:
# Save the candidates; the path is built from DIR so the project root is defined in one place only.
# file_name is reused further down when the candidates are read back for the MRR evaluation.
file_name = f"similar_products_{VER}_for_train_or_eval.parquet"
df.write_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [9]:
# Preview the first rows of the generated candidates
df.head()

item,locale,candidate_item,same_brand_rank
str,str,str,u32
"""0007477155""","""IT""","""0008307733""",1
"""0008307733""","""IT""","""0007477155""",1
"""0008402787""","""ES""","""8491392289""",1
"""0008402787""","""IT""","""B07CKHS8J1""",1
"""0194031373""","""IT""","""0194031381""",1


## MRR@100

In [10]:
# Load the task2 train sessions; the path is built from DIR so the project root is defined in one place only.
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")

In [11]:
# Extract last_item: the final element of each session's prev_items
session_histories = train["prev_items"].to_list()
final_items = [history[-1] for history in session_histories]
train = train.with_columns(pl.Series(name="last_item", values=final_items))

In [12]:
# Keep only the columns needed for the evaluation
train = train.select(["session_id", "last_item", "next_item", "locale"])

In [13]:
# Read back the candidate file named by file_name; the path is built from DIR so the
# project root is defined in one place only.
candidates = pl.read_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [14]:
# Join the candidates onto each session (keyed by its last item and locale) and
# label a candidate 1 when it equals the session's next_item
df = (
    train
    .join(candidates, left_on=["last_item", "locale"], right_on=["item", "locale"], how="left")
    .with_columns(
        (pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label")
    )
)

# Collect the labels of each session into one list, keeping the joined row order
labels_per_session = df.groupby("session_id", maintain_order=True).all()
label_lists = labels_per_session["label"].to_list()

In [15]:
# Compute MRR: only the first 100 candidates of each session are considered, and a
# session without a hit contributes nothing to the sum
reciprocal_ranks = []
for session_labels in label_lists:
    hit_position = next(
        (position for position, flag in enumerate(session_labels[:100], start=1) if flag == 1),
        None,
    )
    if hit_position is not None:
        reciprocal_ranks.append(1 / hit_position)
mrr = sum(reciprocal_ranks) / len(label_lists)
print("MRR:", round(mrr, 5))

MRR: 0.13164


## for inference

In [16]:
# Load the shared product table and the product features for the test split (per the file name)
product = pl.read_parquet(DIR + "data/preprocessed/common/product.parquet")
product_feat_test = pl.read_parquet(DIR + "data/interim/features/task2/product_feature_test_12.parquet")

In [17]:
# Generate the same-brand popularity candidates from the test-split purchase counts
df = generate_same_brand_popular_items(product, product_feat_test)

In [18]:
# Save the inference candidates; the path is built from DIR so the project root is defined in one place only.
file_name = f"similar_products_{VER}_for_inference.parquet"
df.write_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [19]:
# Preview the first rows of the inference candidates
df.head()

item,locale,candidate_item,same_brand_rank
str,str,str,u32
"""0007477155""","""IT""","""0008307733""",1
"""0008307733""","""IT""","""0007477155""",1
"""0008402787""","""ES""","""8491392289""",1
"""0008402787""","""IT""","""B07CKHS8J1""",1
"""0194031373""","""IT""","""0194031381""",1
