In [1]:
import os

import polars as pl

In [2]:
INPUT_DIR = "../../input/raw/"
OUTPUT_DIR = "./features/"

# セッション * アイテム特徴量

In [3]:
def generate_session_item_features(log: pl.DataFrame, item: pl.DataFrame) -> pl.DataFrame:
    df = log.join(item, on="yad_no", how="left")
    session_item_feat_df = df.group_by(["session_id", "yad_no"]).first()[["session_id", "yad_no"]]

    # セッション内でそのアイテムが何回インタラクトされたか
    session_item_feat_df = session_item_feat_df.join(
        df.group_by(["session_id", "yad_no"]).agg(pl.count()).rename({"count":"SP_interact_count"}),
        on=["session_id", "yad_no"],
        how="left"
    )

    return session_item_feat_df

In [4]:
item = pl.read_csv(os.path.join(INPUT_DIR, "yado.csv"))
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_label = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv"))
test_log = pl.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))

In [5]:
train_session_feat = generate_session_item_features(train_log, item)
train_session_feat.write_parquet(os.path.join(OUTPUT_DIR, "session_product_feat_for_train_or_eval.parquet"))

In [6]:
test_session_feat = generate_session_item_features(test_log, item)
test_session_feat.write_parquet(os.path.join(OUTPUT_DIR, "session_product_feat_for_test.parquet"))

In [7]:
train_session_feat.describe()

describe,session_id,yad_no,SP_interact_count
str,str,f64,f64
"""count""","""398151""",398151.0,398151.0
"""null_count""","""0""",0.0,0.0
"""mean""",,6882.219786,1.053043
"""std""",,4017.078827,0.240864
"""min""","""000007603d533d…",2.0,1.0
"""25%""",,3338.0,1.0
"""50%""",,6874.0,1.0
"""75%""",,10350.0,1.0
"""max""","""fffffa7baf3700…",13806.0,5.0


In [8]:
train_session_feat["SP_interact_count"].value_counts().with_columns((pl.col("counts") / len(train_session_feat)).alias("proportion"))

SP_interact_count,counts,proportion
u32,u32,f64
3,1199,0.003011
5,6,1.5e-05
1,378459,0.950541
2,18382,0.046168
4,105,0.000264


In [9]:
test_session_feat.describe()

describe,session_id,yad_no,SP_interact_count
str,str,f64,f64
"""count""","""238096""",238096.0,238096.0
"""null_count""","""0""",0.0,0.0
"""mean""",,6860.822231,1.051278
"""std""",,3983.6658,0.235144
"""min""","""00001149e9c739…",1.0,1.0
"""25%""",,3362.0,1.0
"""50%""",,6844.0,1.0
"""75%""",,10312.0,1.0
"""max""","""ffffe984aafd61…",13806.0,4.0


In [10]:
test_session_feat["SP_interact_count"].value_counts().with_columns((pl.col("counts") / len(test_session_feat)).alias("proportion"))

SP_interact_count,counts,proportion
u32,u32,f64
2,10735,0.045087
1,226642,0.951893
3,683,0.002869
4,36,0.000151
