In [None]:
!mkdir /content/drive/MyDrive/kddcup2023-origin

In [None]:
!cp -r /content/drive/MyDrive/kddcup2023-pgy/kddcup2023 /content/drive/MyDrive/kddcup2023-origin
# !/content/drive/MyDrive/kddcup2023-ly/kddcup2023

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install polars



In [3]:
import os
import gc
import random
from collections import defaultdict, Counter
from typing import List, Dict
import joblib
import pickle

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import polars as pl
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [4]:
VER = "06"

In [6]:
train = pl.read_parquet("/content/drive/MyDrive/kddcup2023-ly/kddcup2023/data/preprocessed/task1/train_task1.parquet")
test = pl.read_parquet("/content/drive/MyDrive/kddcup2023-ly/kddcup2023/data/preprocessed/task1/test_task1_phase2.parquet")
product = pl.read_parquet("/content/drive/MyDrive/kddcup2023-ly/kddcup2023/data/preprocessed/common/product_04.parquet")

In [12]:
print(train.columns)
print(test.columns)

['prev_items', 'next_item', 'locale', 'session_id']
['prev_items', 'locale', 'session_id']


In [13]:
print("prev_items" in train.columns)
print("locale" in train.columns)
print("session_id" in train.columns)

print("prev_items" in test.columns)
print("locale" in test.columns)
print("session_id" in test.columns)

True
True
True
True
True
True


In [9]:
def preprocess(session_df: pl.DataFrame, product_df: pl.DataFrame) -> pl.DataFrame:
    session_df = session_df.explode("prev_items")
    session_df = session_df.with_columns(
        session_df.select(pl.col("session_id").cumcount(reverse=True).over("session_id").alias("sequence_num"))
    )
    session_df = session_df.join(product_df, left_on=["prev_items", "locale"], right_on=["id", "locale"], how="left")
    return session_df

In [10]:
def generate_session_features(df: pl.DataFrame) -> pl.DataFrame:
    session_feat_df = df.groupby("session_id").first()[["session_id", "locale"]]

    # session locale
    enc = LabelEncoder()
    enc.fit(session_feat_df["locale"])
    session_feat_df = session_feat_df.with_columns(
        pl.Series(name="locale", values=enc.transform(session_feat_df["locale"]))
    ).rename({"locale":"S_locale"})

    # Number of items in each session
    session_feat_df = session_feat_df.join(
        df.groupby("session_id").agg(pl.count()).rename({"count":"S_session_length"}),
        on="session_id",
        how="left"
    )

    # Number of unique items in each
    session_feat_df = session_feat_df.join(
        df.groupby("session_id").n_unique()[["session_id", "prev_items"]].rename({"prev_items":"S_nunique_item"}),
        on="session_id",
        how="left"
    )

    # Number of unique brands in each sessio
    session_feat_df = session_feat_df.join(
        df.groupby("session_id").n_unique()[["session_id", "brand"]].rename({"brand":"S_nunique_brand"}),
        on="session_id",
        how="left"
    )

    # Repurchase rate for each session, ratio of number of items to number of unique brands
    session_feat_df = session_feat_df.with_columns(
            ((pl.col("S_session_length") - pl.col("S_nunique_item")) / pl.col("S_session_length")).alias("S_ratio_repurchase"),
            (pl.col("S_nunique_brand") / pl.col("S_session_length")).alias("S_ratio_unique_brand")
    )

    # Mean, maximum, minimum, and standard deviation of PRICE of items for each session
    session_feat_df = session_feat_df.join(
        df.groupby("session_id").agg(
            pl.col("price").mean().alias("S_mean_price"),
            pl.col("price").max().alias("S_max_price"),
            pl.col("price").min().alias("S_min_price"),
            pl.col("price").std().alias("S_std_price"),
            pl.col("price").sum().alias("S_total_amount"),
        ),
        on="session_id",
        how="left"
    )
    # Price of items purchased at the end of each session
    session_feat_df = session_feat_df.join(
        df.groupby("session_id", maintain_order=True).tail(1)[["session_id", "price"]].rename({"price":"S_last_item_price"}),
        on="session_id",
        how="left"
    )

    # Non-null count of each category of purchased products
    session_feat_df = session_feat_df.join(
        df.groupby("session_id").agg(
            pl.col("color").is_not_null().sum().alias("S_color_not_null_count"),
            pl.col("size").is_not_null().sum().alias("S_size_not_null_count"),
            pl.col("model").is_not_null().sum().alias("S_model_not_null_count"),
            pl.col("material").is_not_null().sum().alias("S_material_not_null_count"),
            pl.col("author").is_not_null().sum().alias("S_author_not_null_count")
        ),
        on="session_id",
        how="left"
    )

    # Product information for the last 3 times
    # model是型号 三级类目
    last_ns = [1, 2, 3]
    for last_n in last_ns:
        last_n_df = df.filter(pl.col("sequence_num") == last_n-1)
        last_n_df = last_n_df[["session_id", "brand", "color", "size", "model", "material", "author"]]
        last_n_df = last_n_df.rename({
            "brand":f"S_brand_last{last_n}",
            "color":f"S_color_last{last_n}",
            "size":f"S_size_last{last_n}",
            "model":f"S_model_last{last_n}",
            "material":f"S_material_last{last_n}",
            "author":f"S_author_last{last_n}",
        })
        session_feat_df = session_feat_df.join(last_n_df, on="session_id", how="left")

    return session_feat_df

In [16]:
session_df = pl.concat([
    train[["prev_items", "locale", "session_id"]],
    test[["prev_items", "locale", "session_id"]],
])

In [17]:
session_df.head()

prev_items,locale,session_id
list[str],str,str
"[""B09W9FND7K"", ""B09JSPLN1M""]","""DE""","""train_0"""
"[""B076THCGSG"", ""B007MO8IME"", … ""B001B4TKA0""]","""DE""","""train_1"""
"[""B0B1LGXWDS"", ""B00AZYORS2"", … ""B00AZYORS2""]","""DE""","""train_2"""
"[""B09XMTWDVT"", ""B0B4MZZ8MB"", … ""B0B71CHT1L""]","""DE""","""train_3"""
"[""B09Y5CSL3T"", ""B09Y5DPTXN"", ""B09FKD61R8""]","""DE""","""train_4"""


In [18]:
session_df = preprocess(session_df, product)

  session_df.select(pl.col("session_id").cumcount(reverse=True).over("session_id").alias("sequence_num"))


In [19]:
session_features = generate_session_features(session_df)

  session_feat_df = df.groupby("session_id").first()[["session_id", "locale"]]
  df.groupby("session_id").agg(pl.count()).rename({"count":"S_session_length"}),
  df.groupby("session_id").n_unique()[["session_id", "prev_items"]].rename({"prev_items":"S_nunique_item"}),
  df.groupby("session_id").n_unique()[["session_id", "brand"]].rename({"brand":"S_nunique_brand"}),
  df.groupby("session_id").agg(
  df.groupby("session_id", maintain_order=True).tail(1)[["session_id", "price"]].rename({"price":"S_last_item_price"}),
  df.groupby("session_id").agg(


In [None]:
session_features.write_parquet(f"/content/drive/MyDrive/kddcup2023/data/interim/features/task1/session_feature_{VER}.parquet", use_pyarrow=True)

In [None]:
session_df.head()


prev_items,locale,session_id,sequence_num,title,price,brand,color,size,model,material,author,desc,available_locales
str,str,str,u32,str,f64,str,str,str,str,str,str,str,list[str]
"""B09W9FND7K""","""DE""","""train_0""",1,"""OREiN LED Einb…",39.99,"""OREiN""","""6*ip65 (3000k,…","""Lochgröße Φ68-…",,"""塑料""",,"""【Angenehmes Li…","[""DE"", ""UK"", ""FR""]"
"""B09JSPLN1M""","""DE""","""train_0""",0,"""OREiN LED Spot…",36.99,"""OREiN""","""6*ip65(3000k)""","""Lochgröße Φ68-…",,"""Kunststoff""",,"""【Angenehmes Li…","[""DE"", ""UK"", ""FR""]"
"""B076THCGSG""","""DE""","""train_1""",3,"""VIETSCHI Profi…",39.8,"""Vietschi""",,"""12.5 l (1er Pa…",,,,"""✅ Einfache Ver…","[""DE""]"
"""B007MO8IME""","""DE""","""train_1""",2,"""Caparol Capa D…",40.27,"""Caparol""","""Weiss""","""12.5 l (1er Pa…","""11283000""","""Dispersionsfar…",,"""Caparol""","[""DE""]"
"""B08MF65MLV""","""DE""","""train_1""",1,"""Metzler Edelst…",29.99,"""Metzler""","""Anthrazit""","""8 x 6 cm""",,"""Edelstahl V2A""",,"""Der Anschluss …","[""DE""]"


In [None]:
session_features.head()


session_id,S_locale,S_session_length,S_nunique_item,S_nunique_brand,S_ratio_repurchase,S_ratio_unique_brand,S_mean_price,S_max_price,S_min_price,S_std_price,S_total_amount,S_last_item_price,S_color_not_null_count,S_size_not_null_count,S_model_not_null_count,S_material_not_null_count,S_author_not_null_count,S_brand_last1,S_color_last1,S_size_last1,S_model_last1,S_material_last1,S_author_last1,S_brand_last2,S_color_last2,S_size_last2,S_model_last2,S_material_last2,S_author_last2,S_brand_last3,S_color_last3,S_size_last3,S_model_last3,S_material_last3,S_author_last3
str,i64,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""train_1182994""",1,3,3,2,0.0,0.666667,10.742857,13.507143,9.235714,2.397203,32.228571,13.507143,0,3,0,0,0,"""エッセンシャル""",,"""2.0リットル (x 1)""",,,,"""DX(デラックス)""",,"""1リットル (x 1)""",,,,"""DX(デラックス)""",,"""1リットル (x 1)""",,,
"""train_490007""",0,3,2,2,0.333333,0.666667,22.663333,26.5,14.99,6.645302,67.99,14.99,1,1,1,2,0,"""Bonlux""","""Warmweiß""","""5 Stück (1er P…","""UK-INS-0596-WW…",,,"""Marhynchus""",,,,"""Material""",,"""Marhynchus""",,,,"""Material""",
"""train_1821541""",1,2,2,1,0.0,0.5,73.485714,92.128571,54.842857,26.364981,146.971429,54.842857,0,0,1,0,0,"""任天堂""",,,,,,"""任天堂""",,,"""2200630055090""",,,,,,,,
"""train_2472402""",2,2,1,1,0.5,0.5,4.99,4.99,4.99,0.0,9.98,4.99,2,0,0,0,0,"""YT Direct""","""Natural Bamboo…",,,,,"""YT Direct""","""Natural Bamboo…",,,,,,,,,,
"""train_2590079""",2,6,6,5,0.0,0.833333,47.878,50.71,43.99,3.13425,239.39,50.71,6,4,5,1,0,"""Russell Hobbs""","""Black""","""Black 600ml""","""RHDH1061B""",,,"""Russell Hobbs""","""Grey""","""Grey 600ml""","""RHDH1061G""",,,"""CONOPU""","""Black""",,"""OZC20S03""",,
