In [None]:
#!/bin/bash
!mkdir files && curl -L -o files/userbehavior.zip\
  https://www.kaggle.com/api/v1/datasets/download/marwa80/userbehavior && unzip files/userbehavior.zip -d files


In [None]:
!pip install polars

Collecting polars
  Downloading polars-1.23.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (14 kB)
Downloading polars-1.23.0-cp39-abi3-macosx_11_0_arm64.whl (30.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.4/30.4 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: polars
Successfully installed polars-1.23.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import polars as pl

# The file is read with has_header=False, so the column names are supplied here.
raw_column_names = ["user_id", "item_id", "category_id", "behavior", "ts"]

dataset = pl.read_csv(
    source="files/userbehavior.csv",
    has_header=False,
    new_columns=raw_column_names,
)

In [4]:
# 0.9-quantile of the "ts" column, kept as a one-row DataFrame because later
# cells read the scalar back out with ts_cutoff['ts'][0].
# NOTE(review): the Int32 cast is strict by default and would raise on a
# timestamp that does not fit in 32 bits; confirm the narrowing is needed.
ts_cutoff = (
    dataset
    .select(pl.col("ts").cast(pl.Int32))
    .quantile(0.9)
)

# Display the scalar cutoff value.
ts_cutoff.get_column("ts")[0]

1512271106.0

In [5]:
# Temporal train/test split around the global timestamp cutoff.
#
#   train_dataset                 every event strictly before the cutoff
#   test_dataset_user_ids         users with at least one event at/after the cutoff
#   test_dataset_per_user_cutoff  per test user, the timestamp of their first
#                                 event at/after the cutoff
#   test_dataset                  per test user, all events up to and including
#                                 that first post-cutoff timestamp
#
# Pull the scalar out once instead of re-indexing the one-row frame in every filter.
split_ts = ts_cutoff["ts"][0]

train_dataset = dataset.filter(pl.col("ts") < split_ts)

# The "latest_appearence" spelling is kept as-is: the column is carried into
# test_dataset by the join below, so renaming it would change that frame's schema.
test_dataset_user_ids = (
    dataset
    .group_by("user_id")
    .agg(pl.max("ts").alias("latest_appearence"))
    .filter(pl.col("latest_appearence") >= split_ts)
)

# All events (before and after the cutoff) of the users selected above.
test_user_events = dataset.join(test_dataset_user_ids, on="user_id")

# The earliest post-cutoff timestamp per user is a plain grouped minimum;
# no sort, window rank or integer narrowing is needed to obtain it.
test_dataset_per_user_cutoff = (
    test_user_events
    .filter(pl.col("ts") >= split_ts)
    .group_by("user_id")
    .agg(pl.min("ts").alias("ts_cutoff"))
)

# "<=" keeps every event that shares the per-user cutoff timestamp.
test_dataset = (
    test_user_events
    .join(test_dataset_per_user_cutoff, on="user_id")
    .filter(pl.col("ts") <= pl.col("ts_cutoff"))
)

In [9]:
# Item vocabulary built from the training split: items seen more than
# ITEM_MIN_OCCURRENCES times get a dense 1-based id ("rank_item"), most
# frequent first.
#
# Ties in num_occurrences are broken by item_id. group_by does not guarantee
# an output order, so without the secondary sort key the ids of equally
# frequent items could differ from one kernel run to the next.
ITEM_MIN_OCCURRENCES = 50  # exclusive: an item needs strictly more occurrences

item_mapping = (
    train_dataset
    .group_by("item_id")
    .agg(pl.count("item_id").alias("num_occurrences"))
    .filter(pl.col("num_occurrences") > ITEM_MIN_OCCURRENCES)
    .sort(["num_occurrences", "item_id"], descending=[True, False])
    # The frame is now in its final order, so the 1-based row index is the rank.
    .with_row_index("rank_item", offset=1)
    # with_row_index puts the new column first; restore the original column order.
    .select(["item_id", "num_occurrences", "rank_item"])
)

In [None]:
# Sanity check: smallest and largest id assigned in the item mapping.
item_ranks = item_mapping.get_column("rank_item")
(item_ranks.min(), item_ranks.max())

(1, 315689)

In [11]:
# Category vocabulary built from the training split: categories seen more than
# CATEGORY_MIN_OCCURRENCES times get a dense 1-based id ("rank_category"),
# most frequent first.
#
# Ties in num_occurrences are broken by category_id. group_by does not
# guarantee an output order, so without the secondary sort key the ids of
# equally frequent categories could differ from one kernel run to the next.
CATEGORY_MIN_OCCURRENCES = 10  # exclusive: a category needs strictly more occurrences

category_mapping = (
    train_dataset
    .group_by("category_id")
    .agg(pl.count("category_id").alias("num_occurrences"))
    .filter(pl.col("num_occurrences") > CATEGORY_MIN_OCCURRENCES)
    .sort(["num_occurrences", "category_id"], descending=[True, False])
    # The frame is now in its final order, so the 1-based row index is the rank.
    .with_row_index("rank_category", offset=1)
    # with_row_index puts the new column first; restore the original column order.
    .select(["category_id", "num_occurrences", "rank_category"])
)

In [12]:
# Sanity check: largest id assigned in the category mapping.
category_mapping.get_column("rank_category").max()

7905

In [None]:
# Export the training split as one row per user with parallel lists
# (items, categories, behaviors, timestamps).
#
# - behavior strings are encoded as integers: pv=0, fav=1, cart=2, buy=3
# - the joins are inner joins, so events whose item or category is missing
#   from the mappings are dropped
# - rows are sorted by (user_id, ts) before grouping; the intent is that each
#   user's lists come out in timestamp order
# - maintain_order=True fixes the group order and the seed fixes the shuffle,
#   so the order of users in the written file is the same on every run
SHUFFLE_SEED = 42

(
    train_dataset
    .with_columns(behavior=pl.col("behavior").replace_strict(["pv", "fav", "cart", "buy"], [0, 1, 2, 3]))
    .join(item_mapping, on="item_id")
    .join(category_mapping, on="category_id")
    .sort(["user_id", "ts"])
    .group_by("user_id", maintain_order=True)
    .agg(
        pl.col("rank_item").alias("items"),
        pl.col("rank_category").alias("categories"),
        pl.col("behavior").alias("behaviors"),
        pl.col("ts").alias("timestamps"),
    )
    .sample(fraction=1.0, shuffle=True, seed=SHUFFLE_SEED)
    .write_parquet("files/train_taobao_preprocessed.parquet")
)

In [None]:
# Export the test split as one row per user with parallel lists
# (items, categories, behaviors, timestamps), using the same encoding and
# mappings as the training export.
#
# maintain_order=True keeps the groups in the order produced by the sort, so
# the written file lists users in user_id order on every run instead of in
# group_by's unspecified order.
#
# NOTE(review): the joins are inner joins, so an event whose item or category
# is not in the mappings is dropped. If that happens to a user's post-cutoff
# event(s), the user's lists contain only pre-cutoff events. Confirm that
# downstream evaluation accounts for this.
(
    test_dataset
    .with_columns(behavior=pl.col("behavior").replace_strict(["pv", "fav", "cart", "buy"], [0, 1, 2, 3]))
    .join(item_mapping, on="item_id")
    .join(category_mapping, on="category_id")
    .sort(["user_id", "ts"])
    .group_by("user_id", maintain_order=True)
    .agg(
        pl.col("rank_item").alias("items"),
        pl.col("rank_category").alias("categories"),
        pl.col("behavior").alias("behaviors"),
        pl.col("ts").alias("timestamps"),
    )
    .write_parquet("files/test_taobao_preprocessed.parquet")
)

In [2]:
import polars as pl
# Reload the preprocessed per-user training sequences from disk.
# NOTE(review): this rebinds train_dataset, which earlier cells use for the raw
# event-level training split; re-running those cells after this one would
# operate on the wrong frame.
preprocessed_train_path = "files/train_taobao_preprocessed.parquet"
train_dataset = pl.read_parquet(preprocessed_train_path)

In [13]:
# Summary of per-user sequence lengths: the median number of items per user
# and the share of users whose sequence holds exactly one item.
sequence_length = pl.col("items").list.len()
# 1 when the sequence has exactly one item, 0 otherwise; its mean is the share.
single_item_flag = (
    pl.when(sequence_length == 1)
    .then(pl.lit(1))
    .otherwise(pl.lit(0))
)

train_dataset.with_columns(
    sequence_length.alias("num_items"),
    single_item_flag.alias("is_single_item"),
).select(
    pl.col("num_items").median(),
    pl.col("is_single_item").mean(),
)

num_items,is_single_item
f64,f64
56.0,0.00164
