In [1]:
# Mount Google Drive at /gdrive; every parquet path used in this notebook
# lives under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import gc
import random
from collections import defaultdict, Counter
from typing import List, Dict
import joblib
import pickle

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import polars as pl
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

In [4]:
# Version tag embedded in the names of the feature files written below.
VER = "11"
# Locale codes; not referenced by any cell visible in this notebook
# (presumably the locales of the task — TODO confirm or remove).
LOCALES = ["IT", "ES", "FR"]

In [5]:
# Read the three input tables (train sessions, test sessions, product master)
# in that order and bind them to `train`, `test` and `product`.
train, test, product = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        "/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task2/train_task2_augmented.parquet",
        "/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task2/test_task2_leftover.parquet",
        "/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/common/product.parquet",
    )
)

In [6]:
def preprocess(session_df: pl.DataFrame, product_df: pl.DataFrame) -> pl.DataFrame:
    """Flatten sessions to one row per session item and attach product columns.

    ``prev_items`` is exploded so that every list element becomes its own row,
    then ``product_df`` is left-joined on (item id, locale). Items without a
    matching product row are kept, with nulls in the product columns.
    """
    exploded = session_df.explode("prev_items")
    return exploded.join(
        product_df,
        how="left",
        left_on=["prev_items", "locale"],
        right_on=["id", "locale"],
    )

In [7]:
def generate_product_features(session_df: pl.DataFrame, product_df: pl.DataFrame) -> pl.DataFrame:
    """Build ``P_``-prefixed popularity and price features for every product row.

    Args:
        session_df: Session items already exploded and joined with the product
            table (the output of ``preprocess``). The columns read here are
            ``prev_items``, ``locale``, ``brand`` and ``price``.
        product_df: Product master table. The columns read here are ``id``,
            ``locale``, ``brand``, ``price``, ``color``, ``size``, ``model``,
            ``material`` and ``author``.

    Returns:
        A frame that starts from the ``id``/``brand``/``locale``/``price``
        columns of ``product_df`` and adds the features. Every aggregate is
        attached with a left join, so products, brands or locales that never
        occur in ``session_df`` get null counts and amounts (and therefore
        null ratios). Assumes (``id``, ``locale``) is unique in ``product_df``
        — TODO confirm; duplicates would multiply rows in the joins.
    """
    product_features_df = product_df[["id", "brand", "locale", "price"]]

    # Price of the item itself.
    product_features_df = product_features_df.rename({"price": "P_price"})

    # Number of distinct locales the item is listed in. Counting distinct
    # locales (rather than rows per id) keeps the value right even if the
    # product table contains repeated (id, locale) rows.
    product_features_df = product_features_df.join(
        product_df.groupby("id").agg(pl.col("locale").n_unique().alias("P_n_unique_locale")),
        on="id",
        how="left",
    )

    # Number of times the item appears in sessions of the same locale.
    product_features_df = product_features_df.join(
        session_df.groupby(["prev_items", "locale"]).count().rename({"count": "P_purchase_count"}),
        left_on=["id", "locale"],
        right_on=["prev_items", "locale"],
        how="left",
    )

    # Number of times the item appears in sessions across all locales.
    product_features_df = product_features_df.join(
        session_df.groupby(["prev_items"]).count().rename({"count": "P_purchase_count_global"}),
        left_on=["id"],
        right_on=["prev_items"],
        how="left",
    )

    # Per-locale occurrence count x item price.
    product_features_df = product_features_df.with_columns(
        (pl.col("P_price") * pl.col("P_purchase_count")).alias("P_total_amount")
    )

    # 0/1 flags marking which optional attributes are missing.
    is_null_df = product_df.with_columns(
        pl.col("color").is_null().cast(pl.UInt8).alias("P_is_color_null"),
        pl.col("size").is_null().cast(pl.UInt8).alias("P_is_size_null"),
        pl.col("model").is_null().cast(pl.UInt8).alias("P_is_model_null"),
        pl.col("material").is_null().cast(pl.UInt8).alias("P_is_material_null"),
        pl.col("author").is_null().cast(pl.UInt8).alias("P_is_author_null"),
    )[["id", "locale", "P_is_color_null", "P_is_size_null", "P_is_model_null", "P_is_material_null", "P_is_author_null"]]
    product_features_df = product_features_df.join(is_null_df, on=["id", "locale"], how="left")

    # Occurrence count of the item's brand within the same locale.
    product_features_df = product_features_df.join(
        session_df.groupby(["brand", "locale"]).count().rename({"count": "P_brand_purchase_count"}),
        on=["brand", "locale"],
        how="left",
    )

    # Occurrence count of the item's brand across all locales.
    product_features_df = product_features_df.join(
        session_df.groupby(["brand"]).count().rename({"count": "P_brand_purchase_count_global"}),
        on=["brand"],
        how="left",
    )

    # Total amount of the item's brand within the same locale. It is summed
    # per (brand, locale) so that it has the same scope as P_total_amount and
    # P_brand_purchase_count, which it is compared against in the ratios below.
    product_features_df = product_features_df.join(
        session_df.groupby(["brand", "locale"]).agg(pl.col("price").sum().alias("P_total_brand_amount")),
        on=["brand", "locale"],
        how="left",
    )

    # Mean / max / min / standard deviation of the catalogue price over the
    # item's brand within the same locale.
    product_features_df = product_features_df.join(
        product_df.groupby(["locale", "brand"]).agg(
            pl.col("price").mean().alias("P_brand_mean_price"),
            pl.col("price").max().alias("P_brand_max_price"),
            pl.col("price").min().alias("P_brand_min_price"),
            pl.col("price").std().alias("P_brand_std_price"),
        ),
        on=["locale", "brand"],
        how="left",
    )

    # Difference between the item's price and its brand's mean price.
    product_features_df = product_features_df.with_columns(
        (pl.col("P_price") - pl.col("P_brand_mean_price")).alias("P_price_diff_to_avg_brand_price")
    )

    # Occurrence count of all items in the item's locale.
    product_features_df = product_features_df.join(
        session_df.groupby(["locale"]).count().rename({"count": "P_locale_purchase_count"}),
        on=["locale"],
        how="left",
    )

    # Total amount of all items in the item's locale.
    product_features_df = product_features_df.join(
        session_df.groupby(["locale"]).agg(pl.col("price").sum().alias("P_total_locale_amount")),
        on=["locale"],
        how="left",
    )

    # Item volume / amount relative to its locale and its brand. The +1 in each
    # denominator presumably guards against division by zero.
    product_features_df = product_features_df.with_columns([
        (pl.col("P_purchase_count") / (pl.col("P_locale_purchase_count") + 1)).alias("P_purchase_count_ratio_to_locale"),
        (pl.col("P_total_amount") / (pl.col("P_total_locale_amount") + 1)).alias("P_purchase_amount_ratio_to_locale"),
        (pl.col("P_purchase_count") / (pl.col("P_brand_purchase_count") + 1)).alias("P_purchase_count_ratio_to_brand"),
        (pl.col("P_total_amount") / (pl.col("P_total_brand_amount") + 1)).alias("P_purchase_amount_ratio_to_brand"),
    ])

    return product_features_df

# train/eval: product features computed from prev_items only (next_item not included)

In [8]:
# Stack the session columns of train and test (in that order) into one frame.
session_df = pl.concat(
    [frame[["prev_items", "locale"]] for frame in (train, test)]
)

In [None]:
# Explode the sessions and attach product columns. `session_df` is overwritten
# with the result, so re-run the concat cell before re-running this one.
session_df = preprocess(session_df, product)

In [None]:
# Compute the P_* product features from the exploded sessions and the product table.
product_features = generate_product_features(session_df, product)

In [None]:
# Persist the features under a file name tagged "train" and with VER.
product_features.write_parquet(f"/gdrive/MyDrive/amazon_kdd_2023/data/interim/features/task2/product_feature_train_{VER}.parquet", use_pyarrow=True)

# test: product features computed after appending each train next_item to its prev_items

In [None]:
# Append each train session's next_item to the end of its prev_items.
# NOTE: `train` is overwritten, so running this cell twice appends next_item twice.
history_lists = train["prev_items"].to_list()
label_items = train["next_item"].to_list()
extended_histories = [
    history + [label] for history, label in zip(history_lists, label_items)
]
train = train.with_columns(
    pl.Series(name="prev_items", values=extended_histories)
)

In [None]:
# Stack the session columns of train (now including next_item) and test into one frame.
session_df = pl.concat(
    [frame[["prev_items", "locale"]] for frame in (train, test)]
)

In [None]:
# Explode the sessions and attach product columns. `session_df` is overwritten
# with the result, so re-run the concat cell before re-running this one.
session_df = preprocess(session_df, product)

In [None]:
# Recompute the P_* product features from the sessions rebuilt in the cells above.
product_features = generate_product_features(session_df, product)

In [None]:
# Persist the features under a file name tagged "test" and with VER.
product_features.write_parquet(f"/gdrive/MyDrive/amazon_kdd_2023/data/interim/features/task2/product_feature_test_{VER}.parquet", use_pyarrow=True)