In [1]:
import os

import polars as pl

In [2]:
# Directory the raw CSV inputs are read from.
INPUT_DIR = "../../input/raw/"
# Directory the generated feature parquet files are written to.
OUTPUT_DIR = "./features/"

# Create the output directory up front so the parquet writes further down
# do not fail on a fresh checkout where it does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# アイテム特徴量

In [3]:
def explode_and_add_seq_no(df:pl.DataFrame) -> pl.DataFrame:
    """Explode ``prev_items`` to one row per element and number rows per session.

    Args:
        df: Frame with a ``session_id`` column and a list-typed ``prev_items`` column.

    Returns:
        The exploded frame with an extra Int64 ``seq_no`` column holding the
        running ``cumcount`` of each row within its ``session_id``.
    """
    # NOTE(review): presumably 0-based, matching seq_no in the raw logs —
    # confirm against the installed polars version, since the counting
    # semantics of cumcount/cum_count changed between releases.
    seq_no_expr = (
        pl.col("session_id")
        .cumcount()
        .over("session_id")
        .alias("seq_no")
        .cast(pl.Int64)
    )
    return df.explode(["prev_items"]).with_columns(seq_no_expr)

In [4]:
def generate_product_features(item_df: pl.DataFrame, log_df: pl.DataFrame, full_log_df: pl.DataFrame) -> pl.DataFrame:
    """Build one row of popularity features per item.

    Args:
        item_df: Item table. Must contain ``yad_no`` and every attribute /
            location-code column that is renamed with a ``P_`` prefix below.
        log_df: Log of the phase of interest (train or test); needs ``yad_no``.
        full_log_df: Log covering the whole period; needs ``yad_no``.

    Returns:
        ``item_df`` with its attribute columns prefixed by ``P_`` plus:

        * ``P_order_count_ratio_in_phase``: rows for the item in ``log_df``
          divided by rows for the item in ``full_log_df``. Null when the item
          does not appear in ``full_log_df`` at all.
        * ``P_order_count_ratio_to_{cd}``: rows for the item in ``log_df``
          divided by (rows for the item's location code in ``log_df`` + 1).
        * ``P_full_order_count_ratio_to_{cd}``: same, computed on ``full_log_df``.

        ``cd`` ranges over ``wid_cd``, ``ken_cd``, ``lrg_cd`` and ``sml_cd``.
        The intermediate count columns are dropped before returning.
    """
    location_cds = ["wid_cd", "ken_cd", "lrg_cd", "sml_cd"]

    # Attach the item attributes to every log row so the logs can be grouped
    # by location code as well as by item.
    log_df = log_df.join(item_df, on="yad_no", how="left")
    full_log_df = full_log_df.join(item_df, on="yad_no", how="left")

    # (count column name, log the count is taken from). Order matters: it
    # fixes the order of the output columns.
    count_sources = [
        ("P_order_count", log_df),
        ("P_full_order_count", full_log_df),
    ]

    # Basic item features: the item attributes under a "P_" prefix.
    item_features_df = item_df.rename({
        "yad_type":"P_yad_type",
        "total_room_cnt":"P_total_room_cnt",
        "wireless_lan_flg":"P_wireless_lan_flg",
        "onsen_flg":"P_onsen_flg",
        "kd_stn_5min":"P_kd_stn_5min",
        "kd_bch_5min":"P_kd_bch_5min",
        "kd_slp_5min":"P_kd_slp_5min",
        "kd_conv_walk_5min":"P_kd_conv_walk_5min",
        "wid_cd":"P_wid_cd",
        "ken_cd":"P_ken_cd",
        "lrg_cd":"P_lrg_cd",
        "sml_cd":"P_sml_cd",
    })

    # Number of log rows per item, first within the phase and then over the
    # whole period. fill_null(0) gives items missing from a log a count of 0.
    # NOTE: it is applied to the whole frame (unchanged behaviour), so nulls
    # in the item attribute columns are replaced as well.
    for count_col, source_df in count_sources:
        item_features_df = item_features_df.join(
            source_df.group_by("yad_no").count().rename({"count": count_col}),
            on="yad_no",
            how="left",
        ).fill_null(0)

    # Share of the item's whole-period rows that fall inside the phase.
    # Items absent from every log have 0 / 0 here; emit an explicit null
    # instead of a float NaN, which polars does not treat as missing.
    full_count = pl.col("P_full_order_count")
    item_features_df = item_features_df.with_columns(
        pl.when(full_count > 0)
        .then(pl.col("P_order_count") / full_count)
        .otherwise(None)
        .alias("P_order_count_ratio_in_phase")
    )

    # For each log and each location level: count the log rows of the item's
    # location code, then express the item's own count as a share of it.
    # The +1 keeps the denominator non-zero. Location codes that never occur
    # in the log have a null count, so their ratio stays null.
    for count_col, source_df in count_sources:
        for cd in location_cds:
            region_count_col = f"{count_col}_{cd}"
            item_features_df = item_features_df.join(
                source_df.group_by(cd).count().rename({"count": region_count_col}),
                left_on=f"P_{cd}",
                right_on=cd,
                how="left",
            ).with_columns(
                (pl.col(count_col) / (pl.col(region_count_col) + 1)).alias(f"{count_col}_ratio_to_{cd}")
            )

    # Drop the raw counts; only the ratios are kept as features.
    helper_cols = [count_col for count_col, _ in count_sources]
    helper_cols += [
        f"{count_col}_{cd}"
        for count_col, _ in count_sources
        for cd in location_cds
    ]
    return item_features_df.drop(helper_cols)

# train/eval

In [5]:
# Read the raw train and test logs from INPUT_DIR.
train_log, test_log = [
    pl.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ("train_log.csv", "test_log.csv")
]

In [6]:
# Stack the train and test logs into a single frame covering both.
log = pl.concat(
    (train_log, test_log),
    how="vertical",
)

In [7]:
# Read the item table (yado.csv) from INPUT_DIR.
item_csv_path = os.path.join(INPUT_DIR, "yado.csv")
item = pl.read_csv(item_csv_path)

In [8]:
# Item features for train/eval: the phase log is the train log, the
# full-period log is train + test.
train_feat_path = os.path.join(OUTPUT_DIR, "product_feat_train.parquet")
item_feat = generate_product_features(item_df=item, log_df=train_log, full_log_df=log)
item_feat.write_parquet(train_feat_path)

# test

In [9]:
# Re-read the raw train log, train labels and test log from INPUT_DIR.
train_log, train_label, test_log = [
    pl.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ("train_log.csv", "train_label.csv", "test_log.csv")
]

In [10]:
# Append each train session's label to the end of that session's log.

# Sort the labels once and reuse this frame everywhere below. The per-session
# item lists are built in sorted session_id order, so the frame they are
# attached to must be in that same order or labels and logs get misaligned.
train_label_sorted = train_label.sort("session_id")

# One row per session, holding its items in seq_no order.
session_items = (
    train_log
    .sort(["session_id", "seq_no"])
    .group_by("session_id", maintain_order=True)
    .agg(pl.col("yad_no"))
)

# Fail loudly instead of silently pairing a label with another session's log.
assert session_items["session_id"].to_list() == train_label_sorted["session_id"].to_list(), (
    "train_log and train_label must cover exactly the same sessions"
)

prev_items_list = session_items["yad_no"].to_list()
next_item_list = train_label_sorted["yad_no"].to_list()

# Build new lists rather than appending in place to the aggregated ones.
prev_items_list_updated = [
    prev_items + [next_item]
    for prev_items, next_item in zip(prev_items_list, next_item_list)
]

train_log = (
    explode_and_add_seq_no(
        train_label_sorted.with_columns(
            pl.Series(name="prev_items", values=prev_items_list_updated)
        )
    )
    .drop("yad_no")  # the label now lives at the end of prev_items
    .rename({"prev_items" : "yad_no"})
    [["session_id", "seq_no", "yad_no"]]  # same column order as the raw log
)

In [11]:
# Stack the label-extended train log and the test log into a single frame.
log = pl.concat(
    (train_log, test_log),
    how="vertical",
)

In [12]:
# Read the item table (yado.csv) from INPUT_DIR.
item_csv_path = os.path.join(INPUT_DIR, "yado.csv")
item = pl.read_csv(item_csv_path)

In [13]:
# Item features for test: the phase log is the test log, the full-period
# log is the label-extended train log + test log.
test_feat_path = os.path.join(OUTPUT_DIR, "product_feat_test.parquet")
item_feat = generate_product_features(item_df=item, log_df=test_log, full_log_df=log)
item_feat.write_parquet(test_feat_path)