In [1]:
# Mount Google Drive at /gdrive; the input CSV and output parquet paths used
# by this notebook live under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from collections import defaultdict, Counter
from typing import List, Dict

from tqdm import tqdm
import polars as pl

In [4]:
# Locales whose rows are randomly downsampled before being kept in the output.
MAJOR_LOCALES = ["DE", "JP", "UK"]
# Locales whose rows are all kept.
MINOR_LOCALES = ["FR", "ES", "IT"]
# Fraction of the major-locale rows retained by the random sample.
SAMPLE_FRAC = 0.1

In [5]:
# Load the raw training sessions CSV from the mounted Drive into a polars DataFrame.
train = pl.read_csv("/gdrive/MyDrive/amazon_kdd_2023/data/raw/sessions_train.csv")

In [6]:
# Row count per locale in the full training set, before any sampling.
train["locale"].value_counts()

locale,counts
str,u32
"""FR""",117561
"""ES""",89047
"""DE""",1111416
"""UK""",1182181
"""JP""",979119
"""IT""",126925


In [7]:
# Build the working training set: a seeded random SAMPLE_FRAC sample of the
# major-locale rows, followed by every minor-locale row.
train_major = (
    train
    .filter(pl.col("locale").is_in(MAJOR_LOCALES))
    .sample(fraction=SAMPLE_FRAC, seed=42)  # fixed seed keeps the sample reproducible
)
train_minor = train.filter(pl.col("locale").is_in(MINOR_LOCALES))
train = pl.concat([train_major, train_minor])

In [8]:
# Row count per locale after the major-locale downsampling.
train["locale"].value_counts()

locale,counts
str,u32
"""FR""",117561
"""DE""",110820
"""UK""",118259
"""IT""",126925
"""ES""",89047
"""JP""",98192


In [9]:
# Parsing of the prev_items column
def str2list(s):
    """Convert a stringified item array such as "['A' 'B']" into ["A", "B"].

    Square brackets and single quotes are removed, then the remaining text is
    split on runs of whitespace, so embedded newlines and carriage returns act
    as separators just like spaces. An input with no items yields [].
    """
    # One deletion pass drops every '[', ']' and "'" character.
    without_delimiters = s.translate(str.maketrans("", "", "[]'"))
    # split() with no argument splits on any whitespace, including \n and \r.
    return without_delimiters.split()

# Overwrite the prev_items string column with the list produced by str2list
# for each row.
# NOTE(review): newer polars releases renamed `Expr.apply` to `map_elements`;
# confirm against the installed polars version before re-running.
train = train.with_columns(pl.col("prev_items").apply(str2list).alias("prev_items"))

In [10]:
# Assign a session_id to every row: "train_<row position>", counted from 0
# in the current row order of the frame.
train = train.with_columns(
    pl.Series(
        name="session_id",
        values=[f"train_{row_idx}" for row_idx in range(train.height)],
    )
)

In [11]:
# Preview the first rows to check the parsed prev_items lists and the new
# session_id column.
train.head()

prev_items,next_item,locale,session_id
list[str],str,str,str
"[""B09W9FND7K"", ""B09JSPLN1M""]","""B09M7GY217""","""DE""","""train_0"""
"[""B08V13WR14"", ""B08V1NGKGL"", … ""B07RMW64HG""]","""B07RK85Y1Q""","""JP""","""train_1"""
"[""B07DLXYM38"", ""B0764HS4SL""]","""B09V889MMP""","""UK""","""train_2"""
"[""B07PB8X3R4"", ""B09RD8WFXL""]","""B08Q9KZLY4""","""JP""","""train_3"""
"[""B09G3H5SH5"", ""B07PY4GGXH""]","""B08R4VPNMD""","""UK""","""train_4"""


In [12]:
# Write the preprocessed training frame to Drive as a parquet file.
train.write_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task2/train_task2_02.parquet")