In [1]:
# Mount Google Drive at /gdrive; every data path read or written below lives under /gdrive/MyDrive.
# This import only exists on Google Colab, so the notebook is Colab-specific as written.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from collections import defaultdict, Counter
from typing import List, Dict

from tqdm import tqdm
import pandas as pd
import polars as pl

In [5]:
# Read each raw sessions CSV with pandas and convert it straight into a polars DataFrame,
# so the intermediate pandas frame is never bound to a notebook-level name.
train = pl.from_pandas(pd.read_csv("/gdrive/MyDrive/amazon_kdd_2023/data/raw/sessions_train.csv"))
test1 = pl.from_pandas(pd.read_csv("/gdrive/MyDrive/amazon_kdd_2023/data/raw/sessions_test_task2.csv"))
test2 = pl.from_pandas(pd.read_csv("/gdrive/MyDrive/amazon_kdd_2023/data/raw/sessions_test_task2_phase2.csv"))

In [6]:
# Locales to keep; rows with any other `locale` value are dropped from all three frames.
LOCALES = ["FR", "ES", "IT"]
train, test1, test2 = (
    frame.filter(pl.col("locale").is_in(LOCALES))
    for frame in (train, test1, test2)
)

In [7]:
# Parse prev_items
def str2list(s):
    """Turn the raw `prev_items` string into a list of item-id strings.

    Square brackets and single quotes are removed, line breaks (LF and CR)
    become spaces, and the remainder is split on runs of whitespace.
    Presumably the raw value looks like "['A' 'B'\\n 'C']" (inferred from the
    characters being stripped; confirm against the raw CSV).

    Args:
        s: Raw string value of one `prev_items` cell.

    Returns:
        List of whitespace-separated tokens; empty list if nothing remains.
    """
    # One translate pass is equivalent to the chained replaces because none of
    # the replacement outputs ("" and " ") is itself a character being replaced.
    cleanup_table = str.maketrans({"[": "", "]": "", "'": "", "\n": " ", "\r": " "})
    cleaned = s.translate(cleanup_table)
    # Split on whitespace to build the list.
    return cleaned.split()

# Replace the raw `prev_items` string column with its parsed list form in every frame.
train, test1, test2 = (
    frame.with_columns(pl.col("prev_items").apply(str2list).alias("prev_items"))
    for frame in (train, test1, test2)
)

In [9]:
# Assign session_id: "<prefix><row position>" within each (already locale-filtered) frame.
train = train.with_columns(
    pl.Series(name="session_id", values=["train_" + pos for pos in map(str, range(train.height))])
)
test1 = test1.with_columns(
    pl.Series(name="session_id", values=["test_phase1_" + pos for pos in map(str, range(test1.height))])
)
test2 = test2.with_columns(
    pl.Series(name="session_id", values=["test_phase2_" + pos for pos in map(str, range(test2.height))])
)

In [17]:
# Preview the first rows of the preprocessed training frame (parsed prev_items, session_id added).
train.head()

prev_items,next_item,locale,session_id
list[str],str,str,str
"[""B08MV5B53K"", ""B08MV4RCQR"", ""B08MV5B53K""]","""B012408XPC""","""ES""","""train_0"""
"[""B07JGW4QWX"", ""B085VCXHXL""]","""B07JFPYN5P""","""ES""","""train_1"""
"[""B08BFQ52PR"", ""B08LVSTZVF"", ""B08BFQ52PR""]","""B08NJP3KT6""","""ES""","""train_2"""
"[""B08PPBF9C6"", ""B08PPBF9C6"", … ""B08PPBF9C6""]","""B08PP6BLLK""","""ES""","""train_3"""
"[""B0B6W67XCR"", ""B0B712FY2M"", ""B0B6ZYJ3S2""]","""B09SL4MBM2""","""ES""","""train_4"""


In [18]:
# Sanity check: row count, null counts and min/max per column of the training frame.
train.describe()

describe,prev_items,next_item,locale,session_id
str,str,str,str,str
"""count""","""333533""","""333533""","""333533""","""333533"""
"""null_count""","""0""","""0""","""0""","""0"""
"""mean""",,,,
"""std""",,,,
"""min""",,"""0007477155""","""ES""","""train_0"""
"""max""",,"""B0BL7HFKVT""","""IT""","""train_99999"""
"""median""",,,,
"""25%""",,,,
"""75%""",,,,


In [19]:
# Preview the first rows of the preprocessed phase-1 test frame (it has no next_item column).
test1.head()

prev_items,locale,session_id
list[str],str,str
"[""B08GYKNCCP"", ""B08HCPTMJG"", ""B08HCHS64Y""]","""ES""","""test_phase1_0"""
"[""B08NYF9MBQ"", ""B085NGXGWM""]","""ES""","""test_phase1_1"""
"[""B091FL1QFK"", ""B0B1DG29F4""]","""ES""","""test_phase1_2"""
"[""B004APAHCW"", ""B07JMF49HN"", … ""B07JMF49HN""]","""ES""","""test_phase1_3"""
"[""B09YM11D4T"", ""B0B12QWP5G"", … ""B07N8N6C85""]","""ES""","""test_phase1_4"""


In [21]:
# Sanity check: row count, null counts and min/max per column of the phase-1 test frame.
test1.describe()

describe,prev_items,locale,session_id
str,str,str,str
"""count""","""34688""","""34688""","""34688"""
"""null_count""","""0""","""0""","""0"""
"""mean""",,,
"""std""",,,
"""min""",,"""ES""","""test_phase1_0"""
"""max""",,"""IT""","""test_phase1_99…"
"""median""",,,
"""25%""",,,
"""75%""",,,


In [22]:
# Preview the first rows of the preprocessed phase-2 test frame (it has no next_item column).
test2.head()

prev_items,locale,session_id
list[str],str,str
"[""B07GTS7SWK"", ""B07GTS7SWK""]","""ES""","""test_phase2_0"""
"[""B0B33YWVHR"", ""849988993X"", … ""B09K7TDY1H""]","""ES""","""test_phase2_1"""
"[""B08FMPXDTJ"", ""B0B4612MTM"", … ""B0B45YR21M""]","""ES""","""test_phase2_2"""
"[""B07R3W4XQ7"", ""B07R3FB5B5""]","""ES""","""test_phase2_3"""
"[""B09V4KBWPL"", ""B09G9FTLPB""]","""ES""","""test_phase2_4"""


In [23]:
# Sanity check: row count, null counts and min/max per column of the phase-2 test frame.
test2.describe()

describe,prev_items,locale,session_id
str,str,str,str
"""count""","""34690""","""34690""","""34690"""
"""null_count""","""0""","""0""","""0"""
"""mean""",,,
"""std""",,,
"""min""",,"""ES""","""test_phase2_0"""
"""max""",,"""IT""","""test_phase2_99…"
"""median""",,,
"""25%""",,,
"""75%""",,,


In [24]:
# Persist the preprocessed training sessions as parquet on Drive.
# NOTE(review): presumably the target directory must already exist — confirm.
train.write_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task2/train_task2.parquet")

In [25]:
# Persist the preprocessed phase-1 test sessions as parquet on Drive.
# NOTE(review): presumably the target directory must already exist — confirm.
test1.write_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task2/test_task2_phase1.parquet")

In [26]:
# Persist the preprocessed phase-2 test sessions as parquet on Drive.
# NOTE(review): presumably the target directory must already exist — confirm.
test2.write_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task2/test_task2_phase2.parquet")