In [1]:
import os

import polars as pl

In [2]:
# Directory the raw input CSVs are read from.
INPUT_DIR = "../../input/raw/"
# Directory the generated feature parquet files are written to.
OUTPUT_DIR = "./features/"

# The cells below write straight into OUTPUT_DIR; create it up front so a
# fresh checkout survives "Restart Kernel -> Run All" instead of failing on
# the first write because the directory is missing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# セッション特徴量

In [3]:
def generate_session_features(log: pl.DataFrame, item: pl.DataFrame) -> pl.DataFrame:
    """Build one row of session-level features per ``session_id``.

    The log is left-joined with the item master on ``yad_no`` and then
    aggregated per session.

    Args:
        log: Session log. Must contain ``session_id``, ``seq_no`` and ``yad_no``.
        item: Item master keyed by ``yad_no``. Must contain ``yad_type``,
            ``total_room_cnt``, ``wireless_lan_flg``, ``onsen_flg``,
            ``kd_stn_5min``, ``kd_bch_5min``, ``kd_slp_5min``,
            ``kd_conv_walk_5min``, ``wid_cd``, ``ken_cd``, ``lrg_cd`` and
            ``sml_cd``.

    Returns:
        A frame with ``session_id`` followed by, in this order:

        * ``S_session_length`` - number of log rows in the session.
        * ``S_nunique_*`` - distinct counts of the item id, ``yad_type`` and
          the four area codes.
        * ``S_{mean,max,min,std}_room_cnt`` and ``S_total_room_cnt`` -
          statistics of ``total_room_cnt`` over the session's rows.
        * ``S_ratio_repurchase`` - share of rows that revisit an item already
          seen in the session: ``(length - nunique_item) / length``.
        * ``S_<attr>_last{1,2,3}`` - item attributes of the most recent,
          second most recent and third most recent row. Sessions shorter
          than ``n`` rows get nulls for the ``last{n}`` columns.
    """
    df = log.join(item, on="yad_no", how="left")

    # Source column -> feature name for the per-session distinct counts.
    nunique_cols = {
        "yad_no": "S_nunique_item",
        "yad_type": "S_nunique_yad_type",
        "wid_cd": "S_nunique_wid_cd",
        "ken_cd": "S_nunique_ken_cd",
        "lrg_cd": "S_nunique_lrg_cd",
        "sml_cd": "S_nunique_sml_cd",
    }

    room_cnt = pl.col("total_room_cnt")
    aggregations = [
        # Session length. Aliased directly rather than renaming a column
        # called "count", so this does not depend on the default output name.
        pl.count().alias("S_session_length"),
        # Distinct counts per session.
        *[pl.col(src).n_unique().alias(dst) for src, dst in nunique_cols.items()],
        # Room-count statistics of the items in the session.
        room_cnt.mean().alias("S_mean_room_cnt"),
        room_cnt.max().alias("S_max_room_cnt"),
        room_cnt.min().alias("S_min_room_cnt"),
        room_cnt.std().alias("S_std_room_cnt"),
        room_cnt.sum().alias("S_total_room_cnt"),
    ]

    # One aggregation pass instead of several group_by + join round trips.
    # maintain_order=True keeps the groups in first-appearance order so the
    # aggregated frame does not change from run to run.
    session_feat_df = df.group_by("session_id", maintain_order=True).agg(aggregations)

    # Revisit ratio within each session.
    session_feat_df = session_feat_df.with_columns(
        ((pl.col("S_session_length") - pl.col("S_nunique_item")) / pl.col("S_session_length")).alias("S_ratio_repurchase")
    )

    # Item attributes of the three most recent rows of each session.
    last_item_cols = [
        "wireless_lan_flg",
        "onsen_flg",
        "kd_stn_5min",
        "kd_bch_5min",
        "kd_slp_5min",
        "kd_conv_walk_5min",
        "wid_cd",
        "ken_cd",
        "lrg_cd",
        "sml_cd",
    ]

    # Distance of each row from the end of its session (0 = most recent).
    # Filtering on `seq_no == n - 1` would select the *first* rows of the
    # session, which contradicts the "last{n}" feature names.
    # NOTE(review): assumes seq_no increases by 1 from the start of the
    # session with no gaps or duplicates - confirm against the data.
    df = df.with_columns(
        (pl.col("seq_no").max().over("session_id") - pl.col("seq_no")).alias("_pos_from_last")
    )

    for last_n in (1, 2, 3):
        last_n_df = (
            df.filter(pl.col("_pos_from_last") == last_n - 1)
            .select(["session_id", *last_item_cols])
            .rename({col: f"S_{col}_last{last_n}" for col in last_item_cols})
        )
        session_feat_df = session_feat_df.join(last_n_df, on="session_id", how="left")

    return session_feat_df

In [4]:
# Read the four raw tables from INPUT_DIR, in the same order as the names on
# the left-hand side: item master, train log, train labels, test log.
item, train_log, train_label, test_log = (
    pl.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ("yado.csv", "train_log.csv", "train_label.csv", "test_log.csv")
)

In [5]:
# Session features computed from the training log, saved as parquet.
train_session_feat = generate_session_features(log=train_log, item=item)
train_session_feat.write_parquet(
    os.path.join(OUTPUT_DIR, "session_feat_for_train_or_eval.parquet")
)

In [6]:
# Session features computed from the test log, saved as parquet.
test_session_feat = generate_session_features(log=test_log, item=item)
test_session_feat.write_parquet(
    os.path.join(OUTPUT_DIR, "session_feat_for_test.parquet")
)