In [1]:
# Mount Google Drive at /gdrive so the parquet files read and written by the
# later cells are reachable from this Colab runtime.
from google.colab import drive

drive.mount("/gdrive")

Mounted at /gdrive


In [1]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import gc
import random
from collections import defaultdict, Counter
from typing import List, Dict
import joblib
import pickle

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import polars as pl
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [4]:
# Version tag for this feature set; it is embedded in the name of the parquet
# file written at the end of the notebook.
VER = "10"

In [5]:
# Load the preprocessed task-2 sessions (train and phase-2 test) and the shared
# product catalogue from Google Drive.
preprocessed_dir = "/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed"

train = pl.read_parquet(f"{preprocessed_dir}/task2/train_task2.parquet")
test = pl.read_parquet(f"{preprocessed_dir}/task2/test_task2_phase2.parquet")
product = pl.read_parquet(f"{preprocessed_dir}/common/product.parquet")

In [6]:
def preprocess(session_df: pl.DataFrame, product_df: pl.DataFrame) -> pl.DataFrame:
    """Flatten sessions to one row per item and attach product attributes.

    Args:
        session_df: One row per session with ``session_id``, ``locale`` and a
            ``prev_items`` column that ``explode`` can expand (presumably a
            list of item ids -- confirm against the upstream preprocessing).
        product_df: Product table joined on its ``id`` and ``locale`` columns.

    Returns:
        The exploded frame with an added ``sequence_num`` column and the
        columns of ``product_df`` left-joined on ``(prev_items, locale)``.
    """
    # One row per (session, item).
    items_df = session_df.explode("prev_items")

    # Reverse cumulative count within each session, so the position is counted
    # from the end of the session (the last item gets the smallest value).
    # The expression is passed straight to with_columns instead of first being
    # materialised into a one-column DataFrame via select().
    items_df = items_df.with_columns(
        pl.col("session_id").cumcount(reverse=True).over("session_id").alias("sequence_num")
    )

    # Left join: items without a matching product row are kept with null attributes.
    return items_df.join(
        product_df,
        left_on=["prev_items", "locale"],
        right_on=["id", "locale"],
        how="left",
    )

In [11]:
def generate_session_features(df: pl.DataFrame) -> pl.DataFrame:
    """Build one row of session-level features per ``session_id``.

    Args:
        df: One row per (session, item). Must contain ``session_id``,
            ``locale``, ``prev_items``, ``sequence_num`` and the item columns
            ``price``, ``brand``, ``color``, ``size``, ``model``, ``material``
            and ``author``.

    Returns:
        A frame keyed by ``session_id`` whose remaining columns are all
        prefixed with ``S_``: encoded locale, item/brand counts and ratios,
        the attributes of the last three items, price statistics, and the
        per-attribute non-null counts.
    """
    # Session locale, label-encoded to an integer code. Only `locale` is
    # aggregated, rather than taking the first value of every column.
    session_feat_df = df.groupby("session_id").agg(pl.col("locale").first())
    enc = LabelEncoder()
    enc.fit(session_feat_df["locale"])
    session_feat_df = session_feat_df.with_columns(
        pl.Series(name="locale", values=enc.transform(session_feat_df["locale"]))
    ).rename({"locale": "S_locale"})

    # Number of items, unique items and unique brands per session, computed in
    # one group-by over just the columns involved (previously three group-bys,
    # two of which computed n_unique for every column of the frame).
    session_feat_df = session_feat_df.join(
        df.groupby("session_id").agg(
            pl.count().alias("S_session_length"),
            pl.col("prev_items").n_unique().alias("S_nunique_item"),
            pl.col("brand").n_unique().alias("S_nunique_brand"),
        ),
        on="session_id",
        how="left",
    )

    # Repeat ratio (share of rows that are repeats of an item already in the
    # session) and unique brands per item.
    session_feat_df = session_feat_df.with_columns(
        ((pl.col("S_session_length") - pl.col("S_nunique_item")) / pl.col("S_session_length")).alias("S_ratio_repurchase"),
        (pl.col("S_nunique_brand") / pl.col("S_session_length")).alias("S_ratio_unique_brand"),
    )

    # Attributes of the last three items. `sequence_num` is 0-based, so the
    # n-th item from the end is selected with sequence_num == n - 1. Sessions
    # shorter than n items get nulls from the left join.
    item_attrs = ["price", "brand", "color", "size", "model", "material", "author"]
    for last_n in (1, 2, 3):
        last_n_df = (
            df.filter(pl.col("sequence_num") == last_n - 1)
            .select(["session_id"] + item_attrs)
            .rename({attr: f"S_{attr}_last{last_n}" for attr in item_attrs})
        )
        session_feat_df = session_feat_df.join(last_n_df, on="session_id", how="left")

    # Mean price of the last three items. The plain sum propagates nulls, so
    # this is null whenever any of the three prices is missing (including
    # sessions with fewer than three items).
    session_feat_df = session_feat_df.with_columns(
        ((pl.col("S_price_last1") + pl.col("S_price_last2") + pl.col("S_price_last3")) / 3).alias("S_last_items_mean_price")
    )

    # Price statistics (mean, max, min, std, sum) and the number of non-null
    # values of each item attribute per session, in a single group-by.
    session_feat_df = session_feat_df.join(
        df.groupby("session_id").agg(
            pl.col("price").mean().alias("S_mean_price"),
            pl.col("price").max().alias("S_max_price"),
            pl.col("price").min().alias("S_min_price"),
            pl.col("price").std().alias("S_std_price"),
            pl.col("price").sum().alias("S_total_amount"),
            pl.col("color").is_not_null().sum().alias("S_color_not_null_count"),
            pl.col("size").is_not_null().sum().alias("S_size_not_null_count"),
            pl.col("model").is_not_null().sum().alias("S_model_not_null_count"),
            pl.col("material").is_not_null().sum().alias("S_material_not_null_count"),
            pl.col("author").is_not_null().sum().alias("S_author_not_null_count"),
        ),
        on="session_id",
        how="left",
    )

    return session_feat_df

In [12]:
# Stack the train and test sessions, keeping only the columns needed for the
# session features, so both splits are processed in one pass.
session_cols = ["prev_items", "locale", "session_id"]
session_df = pl.concat([frame.select(session_cols) for frame in (train, test)])

In [13]:
# Explode sessions to one row per item and join the product attributes.
# NOTE: this rebinds `session_df` to the exploded frame, so re-run the cell that
# builds `session_df` before re-running this one.
session_df = preprocess(session_df, product)

In [14]:
# Aggregate the per-item rows into one feature row per session.
session_features = generate_session_features(session_df)

In [15]:
# Preview the first rows of the session feature table.
session_features.head()

session_id,S_locale,S_session_length,S_nunique_item,S_nunique_brand,S_ratio_repurchase,S_ratio_unique_brand,S_price_last1,S_brand_last1,S_color_last1,S_size_last1,S_model_last1,S_material_last1,S_author_last1,S_price_last2,S_brand_last2,S_color_last2,S_size_last2,S_model_last2,S_material_last2,S_author_last2,S_price_last3,S_brand_last3,S_color_last3,S_size_last3,S_model_last3,S_material_last3,S_author_last3,S_last_items_mean_price,S_mean_price,S_max_price,S_min_price,S_std_price,S_total_amount,S_color_not_null_count,S_size_not_null_count,S_model_not_null_count,S_material_not_null_count,S_author_not_null_count
str,i64,u32,u32,u32,f64,f64,f64,str,str,str,str,str,str,f64,str,str,str,str,str,str,f64,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32
"""train_91612""",1,11,8,4,0.272727,0.363636,9.99,"""Saniterpen""","""Non""","""One Size""","""5077""",,,9.99,"""Saniterpen""","""Non""","""One Size""","""5077""",,,23.95,"""Elanco""","""14.2000""","""1 Collier""","""1407034""",,,14.643333,16.966364,52.22,5.16,12.64084,186.63,7,10,11,1,0
"""train_44413""",0,2,1,1,0.5,0.5,8.99,"""URAQT""",,,"""C-QX-LQJLCP210…",,,8.99,"""URAQT""",,,"""C-QX-LQJLCP210…",,,,,,,,,,,8.99,8.99,8.99,0.0,17.98,0,0,2,0,0
"""train_78292""",0,2,2,2,0.0,1.0,9.99,"""HOWAF""",,,,,,,"""Leikedun""",,,,,,,,,,,,,,9.99,9.99,9.99,,9.99,0,0,0,0,0
"""train_250071""",2,3,2,1,0.333333,0.333333,4.98,"""Bijoux""","""Grigio""",,"""BIJ-BOTTALUM50…","""Alluminio""",,4.98,"""Bijoux""","""Nero""",,""""" BIJ-BOTTALUM…","""Alluminio""",,4.98,"""Bijoux""","""Nero""",,""""" BIJ-BOTTALUM…","""Alluminio""",,4.98,4.98,4.98,4.98,0.0,14.94,3,0,3,3,0
"""train_266511""",2,2,1,1,0.5,0.5,13.99,"""Clementoni""","""Colori Assorti…",,"""11984""",,,13.99,"""Clementoni""","""Colori Assorti…",,"""11984""",,,,,,,,,,,13.99,13.99,13.99,0.0,27.98,2,0,2,0,0


In [16]:
# Persist the session features to Drive; VER tags the output file name.
session_feature_path = f"/gdrive/MyDrive/amazon_kdd_2023/data/interim/features/task2/session_feature_{VER}.parquet"
session_features.write_parquet(session_feature_path, use_pyarrow=True)