# Amazon Books – Data Preprocessing and Baseline

This notebook prepares the Amazon Books dataset for our project:

1. Normalize and filter the raw Amazon ratings.
2. Create a time-aware leave-one-out (LOO) split.
3. Build candidate pools and a 100-user subset shared with the LLMs.
4. Truncate training histories to a fixed length.
5. Export splits and candidates to CSV.
6. Build and clean item-level metadata (title + merged review description).


## 0. Reset local output folders

This cell removes any previous outputs (`splits/`, `candidates_subset100/`, `csv_export/`) so the notebook can be re-run from scratch without leftover files.


In [2]:
import os
import shutil
from pathlib import Path

BASE = Path(r"C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project")

folders_to_clean = [
    "splits",
    "candidates_subset100",
    "csv_export"
]

for d in folders_to_clean:
    p = BASE / d
    if p.exists():
        shutil.rmtree(p)
        print("[clean] removed", p)
    else:
        print("[clean] not found", p)

[clean] removed C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\splits
[clean] removed C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\candidates_subset100
[clean] removed C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\csv_export


## 1. Load and normalize raw Amazon Books ratings

I start from the `Books_rating.csv` file (Kaggle-style export), rename columns to `userId`, `itemId`, `rating`, `timestamp`, and keep only ratings in \[1, 5].


In [None]:
import pandas as pd
import numpy as np

CSV_PATH = BASE / "Books_rating.csv"
assert CSV_PATH.exists(), f"Missing file: {CSV_PATH}"

df_raw = pd.read_csv(CSV_PATH)
print("Raw columns:", list(df_raw.columns))

df = pd.DataFrame({
    "userId":    df_raw["User_id"],
    "itemId":    df_raw["Id"],
    "rating":    pd.to_numeric(df_raw["review/score"], errors="coerce"),
    "timestamp": pd.to_datetime(df_raw["review/time"], errors="coerce"),
})

df["timestamp"] = df["timestamp"].fillna(pd.Timestamp(2000,1,1))
df["timestamp"] = (df["timestamp"].astype("int64") // 10**9)

df = df[(df["rating"] >= 1) & (df["rating"] <= 5)]

print(df.head(), df.dtypes)

### 1.1 Apply k-core filter

I apply a user–item k-core filter so that every user and item has at least 5 positive interactions. This stabilizes the recommendation baseline and makes the split comparable to MovieLens.


In [3]:
def kcore_filter(df, u_col="userId", i_col="itemId", k_user=5, k_item=5, max_iters=20, verbose=True):
    for it in range(max_iters):
        n0, u0, i0 = len(df), df[u_col].nunique(), df[i_col].nunique()
        uf = df[u_col].value_counts()
        vf = df[i_col].value_counts()
        df = df[df[u_col].isin(uf[uf >= k_user].index)]
        df = df[df[i_col].isin(vf[vf >= k_item].index)]
        n1, u1, i1 = len(df), df[u_col].nunique(), df[i_col].nunique()
        if verbose:
            print(f"[k-core {it+1}] rows {n0:,}->{n1:,}, users {u0:,}->{u1:,}, items {i0:,}->{i1:,}")
        if n1 == n0:
            break
    return df

df = kcore_filter(df, k_user=5, k_item=5)
print(f"[after k-core] users={df['userId'].nunique():,}, items={df['itemId'].nunique():,}, rows={len(df):,}")

ratings_csv = BASE / "ratings.csv"
df.to_csv(ratings_csv, index=False)
print(f"[saved] {ratings_csv} ~ {len(df):,} rows")

Raw columns: ['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness', 'review/score', 'review/time', 'review/summary', 'review/text']
           userId      itemId  rating  timestamp
0   AVCGYZL8FQQTD  1882931173     4.0          0
1  A30TK6U7DNS82R  0826414346     5.0          1
2  A3UH4UZ4RSVO82  0826414346     5.0          1
3  A2MVUWT453QH61  0826414346     4.0          1
4  A22X4XUPKF66MR  0826414346     4.0          1 userId        object
itemId        object
rating       float64
timestamp      int64
dtype: object
[k-core 1] rows 3,000,000->1,077,091, users 1,008,972->82,519, items 221,998->69,986
[k-core 2] rows 1,077,091->977,301, users 82,519->77,225, items 69,986->29,668
[k-core 3] rows 977,301->951,522, users 77,225->70,381, items 29,668->28,742
[k-core 4] rows 951,522->944,212, users 70,381->70,121, items 28,742->27,027
[k-core 5] rows 944,212->941,724, users 70,121->69,534, items 27,027->26,952
[k-core 6] rows 941,724->940,863, users 69,534->69,496, items 2

In [4]:
def time_aware_loo_split(
    ratings_csv: str,
    out_dir: str,
    rating_threshold: float = 4.0,
    min_positives: int = 2,
    also_csv: bool = False,
):
    out = Path(out_dir); (out / "splits").mkdir(parents=True, exist_ok=True)
    ratings = pd.read_csv(ratings_csv)

    need = {"userId","itemId","rating","timestamp"}
    missing = need - set(ratings.columns)
    if missing:
        raise ValueError(f"ratings.csv missing columns: {missing}")

    unit = "ms" if float(ratings["timestamp"].max()) > 1e12 else "s"
    ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit=unit)

    pos = ratings[ratings["rating"] >= rating_threshold].copy()

    pos = pos.drop_duplicates(["userId","itemId"], keep="first")

    cnt = pos.groupby("userId")["itemId"].transform("size")
    pos = pos[cnt >= min_positives].copy()

    pos = pos.sort_values(["userId","ts"], kind="mergesort")
    pos["n"]   = pos.groupby("userId")["userId"].transform("size")
    pos["idx"] = pos.groupby("userId").cumcount()
    pos["split"] = "train"
    pos.loc[pos["idx"] == pos["n"]-1, "split"] = "test"
    pos.loc[(pos["n"] >= 3) & (pos["idx"] == pos["n"]-2), "split"] = "val"

    train = pos[pos["split"]=="train"][["userId","itemId","ts"]].reset_index(drop=True)
    val_targets  = (pos[pos["split"]=="val"][["userId","itemId","ts"]]
                    .rename(columns={"itemId":"val_item","ts":"ts_val"}).reset_index(drop=True))
    test_targets = (pos[pos["split"]=="test"][["userId","itemId","ts"]]
                    .rename(columns={"itemId":"test_item","ts":"ts_test"}).reset_index(drop=True))

    uids = pd.DataFrame(sorted(train["userId"].unique()), columns=["userId"]); uids["uid"] = range(len(uids))
    iids = pd.DataFrame(sorted(train["itemId"].unique()), columns=["itemId"]); iids["iid"] = range(len(iids))

    val_idx  = (val_targets.merge(uids, on="userId", how="inner")
                          .merge(iids, left_on="val_item", right_on="itemId", how="left")
                          .drop(columns=["itemId"]))
    test_idx = (test_targets.merge(uids, on="userId", how="inner")
                          .merge(iids, left_on="test_item", right_on="itemId", how="left")
                          .drop(columns=["itemId"]))

    sp = out / "splits"
    train.to_parquet(sp / "train.parquet", index=False)
    if len(val_targets):  val_targets.to_parquet(sp / "val_targets.parquet", index=False)
    test_targets.to_parquet(sp / "test_targets.parquet", index=False)
    uids.to_parquet(sp / "user_id_map.parquet", index=False)
    iids.to_parquet(sp / "item_id_map.parquet", index=False)
    (train.merge(uids, on="userId", how="inner")
          .merge(iids, on="itemId", how="inner")
          .drop(columns=["userId","itemId"])
          .to_parquet(sp / "train_indexed.parquet", index=False))
    if len(val_idx):   val_idx.to_parquet(sp / "val_targets_indexed.parquet", index=False)
    test_idx.to_parquet(sp / "test_targets_indexed.parquet", index=False)

    cold_val  = int(val_idx["iid"].isna().sum()) if len(val_idx) else 0
    cold_test = int(test_idx["iid"].isna().sum()) if "iid" in test_idx else 0
    stats = f"""Time-aware LOO split summary
Users (TRAIN map): {len(uids):,}
Items (TRAIN map): {len(iids):,}
TRAIN positives : {len(train):,}
VAL users       : {val_idx["uid"].nunique() if len(val_idx) else 0:,}
TEST users      : {test_idx["uid"].nunique() if len(test_idx) else 0:,}
Cold-start VAL items : {cold_val}
Cold-start TEST items: {cold_test}
"""
    (sp / "stats.txt").write_text(stats, encoding="utf-8")
    print(stats)

## 2. Create time-aware leave-one-out (LOO) split

I convert timestamps to a consistent unit and create a time-aware leave-one-out split:

1. Keep only ratings ≥ 3.0 as positives.
2. Require at least 5 positives per user.
3. For each user, use the most recent item as test, the second most recent as validation, and the rest as train.

The outputs are written to `splits/` as Parquet files (`train.parquet`, `val_targets.parquet`, `test_targets.parquet`, plus `user_id_map` and `item_id_map`).

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

def _detect_ts_unit(ts_series: pd.Series) -> str:
    vmax = float(ts_series.max())
    return "ms" if vmax > 1e12 else "s"

def time_aware_loo_split(
    ratings_csv: str,
    out_dir: str,
    rating_threshold: float = 3.0,
    min_positives: int = 3,
    also_csv: bool = False,
):
    out = Path(out_dir); (out / "splits").mkdir(parents=True, exist_ok=True)
    ratings = pd.read_csv(ratings_csv)

    need = {"userId","itemId","rating","timestamp"}
    missing = need - set(ratings.columns)
    if missing:
        raise ValueError(f"ratings.csv missing columns: {missing}")


    try:
        unit = _detect_ts_unit(ratings["timestamp"])
        ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit=unit)
    except Exception:

        ratings["ts"] = pd.to_datetime(ratings["timestamp"], unit="s", origin="unix", errors="ignore")

    pos = ratings[ratings["rating"] >= rating_threshold].copy()
    pos = pos.sort_values(["userId","itemId","ts"], kind="mergesort").drop_duplicates(["userId","itemId"], keep="first")
    pos = pos.groupby("userId").filter(lambda g: len(g) >= min_positives)

    pos = pos.sort_values(["userId","ts"], kind="mergesort")
    pos["n"]   = pos.groupby("userId")["itemId"].transform("size")
    pos["idx"] = pos.groupby("userId").cumcount()

    pos["split"] = "train"
    pos.loc[pos["idx"]==pos["n"]-1, "split"] = "test"
    pos.loc[(pos["n"]>=3) & (pos["idx"]==pos["n"]-2), "split"] = "val"

    train = pos[pos["split"]=="train"][["userId","itemId","ts"]].reset_index(drop=True)
    val_targets  = pos[pos["split"]=="val"][["userId","itemId","ts"]].rename(columns={"itemId":"val_item","ts":"ts_val"}).reset_index(drop=True)
    test_targets = pos[pos["split"]=="test"][["userId","itemId","ts"]].rename(columns={"itemId":"test_item","ts":"ts_test"}).reset_index(drop=True)

    uids = pd.DataFrame(sorted(train["userId"].unique()), columns=["userId"]); uids["uid"]=range(len(uids))
    iids = pd.DataFrame(sorted(train["itemId"].unique()), columns=["itemId"]); iids["iid"]=range(len(iids))

    train_idx = (train.merge(uids, on="userId").merge(iids, on="itemId"))
    val_idx   = (val_targets.merge(uids, on="userId", how="inner")
                           .merge(iids, left_on="val_item", right_on="itemId", how="left")
                           .drop(columns=["itemId"]))
    test_idx  = (test_targets.merge(uids, on="userId", how="inner")
                             .merge(iids, left_on="test_item", right_on="itemId", how="left")
                             .drop(columns=["itemId"]))

    sp = Path(out_dir) / "splits"
    train.to_parquet(sp/"train.parquet", index=False)
    val_targets.to_parquet(sp/"val_targets.parquet", index=False)
    test_targets.to_parquet(sp/"test_targets.parquet", index=False)
    uids.to_parquet(sp/"user_id_map.parquet", index=False)
    iids.to_parquet(sp/"item_id_map.parquet", index=False)
    train_idx.to_parquet(sp/"train_indexed.parquet", index=False)
    val_idx.to_parquet(sp/"val_targets_indexed.parquet", index=False)
    test_idx.to_parquet(sp/"test_targets_indexed.parquet", index=False)

    if also_csv:
        for p in ["train","val_targets","test_targets","user_id_map","item_id_map","train_indexed","val_targets_indexed","test_targets_indexed"]:
            pd.read_parquet(sp/f"{p}.parquet").to_csv(sp/f"{p}.csv", index=False)

    cold_val  = int(val_idx["iid"].isna().sum()) if "iid" in val_idx.columns else 0
    cold_test = int(test_idx["iid"].isna().sum()) if "iid" in test_idx.columns else 0
    stats = f"""Time-aware LOO split summary
Users (TRAIN map): {len(uids)}
Items (TRAIN map): {len(iids)}
TRAIN positives : {len(train)}
VAL users       : {len(val_targets['userId'].unique())}
TEST users      : {len(test_targets['userId'].unique())}
Cold-start VAL items : {cold_val}
Cold-start TEST items: {cold_test}
"""
    (sp/"stats.txt").write_text(stats, encoding="utf-8")
    print(stats)

# Run split on the normalized Kaggle CSV
time_aware_loo_split(
    ratings_csv=str(BASE/"ratings.csv"),
    out_dir=str(BASE),
    rating_threshold=3.0,
    min_positives=5,
    also_csv=False
)

Time-aware LOO split summary
Users (TRAIN map): 60597
Items (TRAIN map): 26522
TRAIN positives : 690814
VAL users       : 60597
TEST users      : 60597
Cold-start VAL items : 228
Cold-start TEST items: 994



In [6]:
import pandas as pd

SPLITS = BASE / "splits"
train_idx = pd.read_parquet(SPLITS/"train_indexed.parquet")
val_idx   = pd.read_parquet(SPLITS/"val_targets_indexed.parquet")
test_idx  = pd.read_parquet(SPLITS/"test_targets_indexed.parquet")

train_items = set(train_idx["iid"].unique())
val_keep  = val_idx[val_idx["iid"].isin(train_items)].copy()
test_keep = test_idx[test_idx["iid"].isin(train_items)].copy()

val_keep.to_parquet(SPLITS/"val_targets_indexed.parquet", index=False)
test_keep.to_parquet(SPLITS/"test_targets_indexed.parquet", index=False)
print("[covered] kept val:", len(val_keep), " / test:", len(test_keep))


[covered] kept val: 60369  / test: 59603


## 3. Build implicit feedback matrix and item–item baseline

Here I:

1. Load the indexed train / val / test splits from `splits/`.
2. Build an implicit user–item matrix `R` from the last 200 interactions per user.
3. Compute an item–item similarity map (`item_sim_map`) using cosine similarity.
4. Use this to construct a popularity + item-sim candidate pool for each user.

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.sparse import csr_matrix

SPLITS = BASE / "splits"
train_idx = pd.read_parquet(SPLITS/"train_indexed.parquet")   # [uid, iid, ts]
val_idx   = pd.read_parquet(SPLITS/"val_targets_indexed.parquet")   # [uid, val_item(iid), ts_val]
test_idx  = pd.read_parquet(SPLITS/"test_targets_indexed.parquet")  # [uid, test_item(iid), ts_test]

U = int(train_idx["uid"].max()) + 1
I = int(train_idx["iid"].max()) + 1

user_seen = train_idx.groupby("uid")["iid"].apply(set).to_dict()

def candidate_coverage(cand_df, targets_df, tgt_col="iid"):
    df = cand_df.merge(targets_df[["uid", tgt_col]], on="uid", how="inner")
    df = df[df[tgt_col].notna()]
    return np.mean([(int(t) in set(c)) for t, c in zip(df[tgt_col], df["candidates"])])

In [8]:
train_idx_sorted = train_idx.sort_values(["uid","ts"]).groupby("uid").tail(200)

R = csr_matrix(
    (np.ones(len(train_idx_sorted), dtype=np.float32),
     (train_idx_sorted["uid"].astype(int).values,
      train_idx_sorted["iid"].astype(int).values)),
    shape=(U, I),
    dtype=np.float32
)

C = (R.T @ R).tocsr()
C.setdiag(0); C.eliminate_zeros()

M_SIM = 50
item_sim_map = {}
for iid in range(I):
    a, b = C.indptr[iid], C.indptr[iid+1]
    if a == b:
        item_sim_map[iid] = []
        continue
    neigh = C.indices[a:b]
    vals  = C.data[a:b]
    if len(neigh) > M_SIM:
        top = np.argpartition(-vals, M_SIM)[:M_SIM]
        neigh, vals = neigh[top], vals[top]
    order = np.argsort(-vals)
    item_sim_map[iid] = neigh[order].tolist()

print(f"[item-sim] built for I={I:,}. Example of item 0:", item_sim_map.get(0, [])[:10])

[item-sim] built for I=26,522. Example of item 0: [22817, 8, 8128, 20791, 486, 14817, 22719, 22573, 13328, 4209]


### 3.1 Create 100-user candidate subset

To keep the LLM experiments lightweight, I select 100 users who have both validation and test items, and save their candidate pools plus val/test targets into `candidates_subset100/`.


In [9]:
import pandas as pd
import numpy as np

SPLITS = BASE / "splits"

train_idx = pd.read_parquet(SPLITS / "train_indexed.parquet")
val_idx   = pd.read_parquet(SPLITS / "val_targets_indexed.parquet")
test_idx  = pd.read_parquet(SPLITS / "test_targets_indexed.parquet")

users_with_val  = set(val_idx['uid'].unique())
users_with_test = set(test_idx['uid'].unique())
candidate_users = sorted(users_with_val & users_with_test)

subset_users = candidate_users[:100]

print("Users with val+test:", len(candidate_users))
print("Selected 100 users:", len(subset_users))

train_sub = train_idx[train_idx['uid'].isin(subset_users)].copy()
val_tgt   = val_idx[val_idx['uid'].isin(subset_users)].copy()
test_tgt  = test_idx[test_idx['uid'].isin(subset_users)].copy()

item_pop_series = train_sub['iid'].value_counts().astype(float)
item_pop_series /= item_pop_series.max()
item_pop = item_pop_series.to_dict()
item_popular = sorted(item_pop.items(), key=lambda x: -x[1])

def build_pool_for_user(uid, k=50):
    seen = set(train_sub.loc[train_sub['uid'] == uid, 'iid'].astype(int))
    pool = [int(i) for i, _ in item_popular if i not in seen][:k]
    return pool

rows = []
for u in subset_users:
    rows.append({
        "uid": int(u),
        "candidates": build_pool_for_user(int(u), k=50)
    })

cand = pd.DataFrame(rows)

OUT = BASE / "candidates_subset100"
OUT.mkdir(parents=True, exist_ok=True)
print("Saving 100-user candidates to:", OUT)

cand.to_parquet(OUT / "val.parquet",  index=False)
cand.to_parquet(OUT / "test.parquet", index=False)

val_tgt.to_parquet(OUT / "val_targets_indexed.parquet",  index=False)
test_tgt.to_parquet(OUT / "test_targets_indexed.parquet", index=False)

print("✓ Saved 100-user subset to", OUT)

Users with val+test: 59487
Selected 100 users: 100
Saving 100-user candidates to: C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\candidates_subset100
✓ Saved 100-user subset to C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\candidates_subset100


## 4. Force-add ground-truth items into candidate pools

For the 100-user subset, I guarantee that each user's validation and test items are always included in their candidate list. I load `val.parquet` / `test.parquet`, union candidates with the corresponding ground-truth items, and overwrite the candidate files.

In [None]:
import pandas as pd
from pathlib import Path

BASE = Path(r"C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project")
CANDS = BASE / "candidates_subset100"

cand_val  = pd.read_parquet(CANDS / "val.parquet")                   # [uid, candidates]
cand_test = pd.read_parquet(CANDS / "test.parquet")                  # [uid, candidates]

val_tgt   = pd.read_parquet(CANDS / "val_targets_indexed.parquet")   # [uid, iid, ...]
test_tgt  = pd.read_parquet(CANDS / "test_targets_indexed.parquet")  # [uid, iid, ...]

print("Loaded cand_val :", cand_val.shape)
print("Loaded cand_test:", cand_test.shape)
print("Loaded val_tgt  :", val_tgt.shape)
print("Loaded test_tgt :", test_tgt.shape)

cand_map = {int(row.uid): list(row.candidates) for _, row in cand_val.iterrows()}

for uid in sorted(cand_map.keys()):
    uid = int(uid)


    val_items  = set(val_tgt[val_tgt["uid"] == uid]["iid"].astype(int).tolist())
    test_items = set(test_tgt[test_tgt["uid"] == uid]["iid"].astype(int).tolist())
    gt_items   = val_items | test_items

    cur_list = cand_map[uid]
    cur_set  = set(int(i) for i in cur_list)

    added = 0
    for g in gt_items:
        g = int(g)
        if g not in cur_set:
            cur_list.append(g)
            cur_set.add(g)
            added += 1

    cand_map[uid] = cur_list

cand_fixed = pd.DataFrame(
    [{"uid": uid, "candidates": cand_map[uid]} for uid in sorted(cand_map.keys())]
)

cand_fixed.to_parquet(CANDS / "val.parquet", index=False)
cand_fixed.to_parquet(CANDS / "test.parquet", index=False)

print("✓ Ground-truth items have been forced into candidate pools.")
print("  New cand_val shape :", cand_fixed.shape)
print("  (val / test now share exactly the same candidate pools)")

Loaded cand_val : (100, 2)
Loaded cand_test: (100, 2)
Loaded val_tgt  : (100, 5)
Loaded test_tgt : (100, 5)
✓ Ground-truth items have been forced into candidate pools.
  New cand_val shape : (100, 2)
  (val / test now share exactly the same candidate pools)


## 5. Truncate training history to last 5 positives per user

To make the user histories more comparable to MovieLens and to control sequence length, I keep at most 5 most recent positives per user in `train_indexed.parquet`. The original file is backed up as `train_indexed_full_backup.parquet`.


In [11]:
import pandas as pd
from pathlib import Path

BASE = Path(r"C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project")
SPLITS = BASE / "splits"

train_path = SPLITS / "train_indexed.parquet"
train_idx = pd.read_parquet(train_path)

print("Original train_indexed shape:", train_idx.shape)

old_counts = train_idx.groupby("uid")["iid"].size()
print("\n[Old] train positives per user:")
print(old_counts.describe())

def truncate_history(df: pd.DataFrame, k: int = 5) -> pd.DataFrame:
    df_sorted = df.sort_values(["uid", "ts"])
    out = (
        df_sorted
        .groupby("uid", as_index=False)
        .tail(k)
        .sort_values(["uid", "ts"])
        .reset_index(drop=True)
    )
    return out

train_trunc = truncate_history(train_idx, k=5)

new_counts = train_trunc.groupby("uid")["iid"].size()
print("\n[New] train positives per user (after truncation to <=5):")
print(new_counts.describe())

print("\nCheck min/new max history length per user:")
print("  min =", int(new_counts.min()), "  max =", int(new_counts.max()))

backup_path = SPLITS / "train_indexed_full_backup.parquet"
if not backup_path.exists():
    train_idx.to_parquet(backup_path, index=False)
    print("\nBackup saved to:", backup_path)
else:
    print("\nBackup already exists at:", backup_path)

train_trunc.to_parquet(train_path, index=False)
print("\n✔ Overwritten train_indexed.parquet with truncated history.")
print("Final train_indexed shape:", train_trunc.shape)

Original train_indexed shape: (690814, 5)

[Old] train positives per user:
count    60597.000000
mean        11.400135
std         27.487630
min          3.000000
25%          4.000000
50%          6.000000
75%         11.000000
max       2325.000000
Name: iid, dtype: float64

[New] train positives per user (after truncation to <=5):
count    60597.000000
mean         4.460617
std          0.799503
min          3.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: iid, dtype: float64

Check min/new max history length per user:
  min = 3   max = 5

Backup saved to: C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\splits\train_indexed_full_backup.parquet

✔ Overwritten train_indexed.parquet with truncated history.
Final train_indexed shape: (270300, 5)


## 6. Sanity checks on 100-user subset and candidate pools

This cell verifies that:

1. The 100 selected users appear in train/val/test.
2. Per-user interaction counts are reasonable.
3. Validation and test candidate pools share the same set of candidates for each user.

In [12]:
import pandas as pd
from pathlib import Path

BASE = Path(r"C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project")
SPLITS = BASE / "splits"
CANDS  = BASE / "candidates_subset100"

train_idx = pd.read_parquet(SPLITS / "train_indexed.parquet")
val_idx_sub  = pd.read_parquet(CANDS / "val_targets_indexed.parquet")
test_idx_sub = pd.read_parquet(CANDS / "test_targets_indexed.parquet")

cand_val  = pd.read_parquet(CANDS / "val.parquet")
cand_test = pd.read_parquet(CANDS / "test.parquet")

print("Loaded train_idx:", train_idx.shape)
print("Loaded val_idx_sub:",  val_idx_sub.shape)
print("Loaded test_idx_sub:", test_idx_sub.shape)
print("Loaded cand_val:",  cand_val.shape)
print("Loaded cand_test:", cand_test.shape)

users_train = set(train_idx['uid'].unique())
users_val   = set(val_idx_sub['uid'].unique())
users_test  = set(test_idx_sub['uid'].unique())

subset_users = users_val
print("\n[1] User counts")
print("train users in subset:", len(users_train & subset_users))
print("val users:",  len(users_val))
print("test users:", len(users_test))

train_sub = train_idx[train_idx['uid'].isin(subset_users)]

train_counts = train_sub.groupby("uid")['iid'].nunique()
val_counts   = val_idx_sub.groupby("uid")['iid'].nunique()
test_counts  = test_idx_sub.groupby("uid")['iid'].nunique()

print("\n[2] Per-user counts (subset)")
print("Train counts stats:\n", train_counts.describe())
print("Val counts unique values:",  val_counts.unique())
print("Test counts unique values:", test_counts.unique())

print("\n[3] Candidate rows per user")
print("val candidate users:",  cand_val['uid'].nunique())
print("test candidate users:", cand_test['uid'].nunique())

cand_val_sorted  = cand_val.sort_values("uid").reset_index(drop=True)
cand_test_sorted = cand_test.sort_values("uid").reset_index(drop=True)

same_uid_order = (cand_val_sorted['uid'].tolist() ==
                  cand_test_sorted['uid'].tolist())
same_pools = all(
    list(v) == list(t)
    for v, t in zip(cand_val_sorted['candidates'],
                    cand_test_sorted['candidates'])
)

print("\n[4] Same candidate pool for val & test?")
print("same uid order :", same_uid_order)
print("same pools     :", same_pools)

Loaded train_idx: (270300, 5)
Loaded val_idx_sub: (100, 5)
Loaded test_idx_sub: (100, 5)
Loaded cand_val: (100, 2)
Loaded cand_test: (100, 2)

[1] User counts
train users in subset: 100
val users: 100
test users: 100

[2] Per-user counts (subset)
Train counts stats:
 count    100.000000
mean       4.600000
std        0.738549
min        3.000000
25%        4.750000
50%        5.000000
75%        5.000000
max        5.000000
Name: iid, dtype: float64
Val counts unique values: [1]
Test counts unique values: [1]

[3] Candidate rows per user
val candidate users: 100
test candidate users: 100

[4] Same candidate pool for val & test?
same uid order : True
same pools     : True


## 7. Export final splits and candidates to CSV

I export the latest (truncated) splits and the 100-user candidate pools to `csv_export/` as CSV files:

1.`train_indexed.csv`, `val_targets_indexed.csv`, `test_targets_indexed.csv`
2.`user_id_map.csv`, `item_id_map.csv`
3.`candidates_val.csv`, `candidates_test.csv`

These are the files that both the recommender baseline and the LLM prompts will use.


In [13]:
import pandas as pd
from pathlib import Path

BASE = Path("C:/Users/carlk/OneDrive/Documents/uoft/ECE1508H F/Project")
SPLITS = BASE / "splits"
CAND = BASE / "candidates_subset100"

OUT = BASE / "csv_export"
OUT.mkdir(parents=True, exist_ok=True)

print("Loading latest (TRUNCATED) splits...")

train_idx = pd.read_parquet(SPLITS / "train_indexed.parquet")
val_tgt   = pd.read_parquet(SPLITS / "val_targets_indexed.parquet")
test_tgt  = pd.read_parquet(SPLITS / "test_targets_indexed.parquet")

user_map  = pd.read_parquet(SPLITS / "user_id_map.parquet")
item_map  = pd.read_parquet(SPLITS / "item_id_map.parquet")

cand_val  = pd.read_parquet(CAND / "val.parquet")
cand_test = pd.read_parquet(CAND / "test.parquet")

train_idx.to_csv(OUT / "train_indexed.csv", index=False)
val_tgt.to_csv(OUT / "val_targets_indexed.csv", index=False)
test_tgt.to_csv(OUT / "test_targets_indexed.csv", index=False)

user_map.to_csv(OUT / "user_id_map.csv", index=False)
item_map.to_csv(OUT / "item_id_map.csv", index=False)

cand_val.to_csv(OUT / "candidates_val.csv", index=False)
cand_test.to_csv(OUT / "candidates_test.csv", index=False)

print("\n✓ All CSV exported to:", OUT)

Loading latest (TRUNCATED) splits...

✓ All CSV exported to: C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\csv_export


## 8. Build simple item-level metadata (title + one review field)

I join the `item_id_map` with the original Amazon CSV to get an initial metadata table:

1. `itemId` (string ID from Kaggle)
2. `iid` (indexed item ID)
3. `title`
4. one raw text field (either `review/summary` or `review/text`, depending on availability)

The result is saved as `csv_export/amazonbooks_metadata_with_title_desc.csv`.


In [1]:
import pandas as pd
from pathlib import Path

BASE = Path(r"C:/Users/carlk/OneDrive/Documents/uoft/ECE1508H F/Project")

item_map = pd.read_parquet(BASE / "splits" / "item_id_map.parquet")
print("item_map:", item_map.shape)

meta_path = BASE / "Books_rating.csv"
raw = pd.read_csv(meta_path, dtype=str)
print("raw metadata:", raw.shape)

description_col = "review/summary" if "review/summary" in raw.columns else "review/text"

metadata = raw[["Id", "Title", description_col]].copy()
metadata = metadata.rename(columns={
    "Id": "itemId",
    "Title": "title",
    description_col: "description"
})
print("metadata cleaned:", metadata.shape)

merged = item_map.merge(metadata, on="itemId", how="left")
print("Final merged shape:", merged.shape)

out_file = BASE / "csv_export" / "amazonbooks_metadata_with_title_desc.csv"
merged.to_csv(out_file, index=False)

print("\nSaved final metadata to:", out_file)

item_map: (26522, 2)
raw metadata: (3000000, 10)
metadata cleaned: (3000000, 3)
Final merged shape: (2126508, 4)

Saved final metadata to: C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\csv_export\amazonbooks_metadata_with_title_desc.csv


### 8.1 Quick quality checks on metadata

I load `amazonbooks_metadata_with_title_desc.csv` and check:

1. required columns (`itemId`, `iid`, `title`, `description`)
2. uniqueness of `itemId` / `iid`
3. missing titles or descriptions
4. a small preview of the first few rows.

In [None]:
import pandas as pd
from pathlib import Path

BASE = Path(r"C:/Users/carlk/OneDrive/Documents/uoft/ECE1508H F/Project")
meta_file = BASE / "csv_export" / "amazonbooks_metadata_with_title_desc.csv"

print("Loading:", meta_file)
df = pd.read_csv(meta_file, dtype=str)
print("Loaded shape:", df.shape)
print("Columns:", list(df.columns))

print("\n[1] Check required columns")
for col in ["itemId", "iid", "title", "description"]:
    print(f"  {col}: {'OK' if col in df.columns else 'MISSING'}")

print("\n[2] Uniqueness checks")
print("  unique itemId:", df["itemId"].nunique())
print("  rows (should match if no duplicated itemId):", len(df))
print("  unique iid    :", df["iid"].nunique())

print("\n[3] Missing values")
print("  missing title       :", df["title"].isna().sum())
print("  missing description :", df["description"].isna().sum())

print("\n[4] Sample rows")
print(df.head(10))

Loading: C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\csv_export\amazonbooks_metadata_with_title_desc.csv
Loaded shape: (2126508, 4)
Columns: ['itemId', 'iid', 'title', 'description']

[1] Check required columns
  itemId: OK
  iid: OK
  title: OK
  description: OK

[2] Uniqueness checks
  unique itemId: 26522
  rows (should match if no duplicated itemId): 2126508
  unique iid    : 26522

[3] Missing values
  missing title       : 185
  missing description : 330

[4] Sample rows
       itemId iid                  title  \
0  0001047655   0  The Prodigal Daughter   
1  0001047655   0  The Prodigal Daughter   
2  0001047655   0  The Prodigal Daughter   
3  0001047655   0  The Prodigal Daughter   
4  0001047655   0  The Prodigal Daughter   
5  0001047655   0  The Prodigal Daughter   
6  0001047655   0  The Prodigal Daughter   
7  0001047655   0  The Prodigal Daughter   
8  0001047655   0  The Prodigal Daughter   
9  0001047655   0  The Prodigal Daughter   

                   

### 8.2 Inspect raw Kaggle columns

I briefly inspect the original `Books_rating.csv` columns to decide which text field to use as review content.

In [2]:
import pandas as pd

raw = pd.read_csv("C:/Users/carlk/OneDrive/Documents/uoft/ECE1508H F/Project/Books_rating.csv")
print(raw.columns)

Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')


## 9. Merge multiple reviews into short per-item descriptions

To make the Amazon metadata more LLM-friendly, I build a cleaned, merged description per `iid`:

1. Read the original Kaggle CSV and select (`itemId`, `Title`, review text).
2. Join with `item_id_map` so every row has an `iid`.
3. For each title, we try to pick different reviews for different `iid`s with the same title, so the LLM can tell them apart.
4. For each `iid`, we:
   - select up to 5 informative reviews,
   - concatenate them with `" "` as separator,
   - decode HTML entities (e.g. `&quot;` → `"`)
   - truncate the final description to at most 400 characters without cutting words in half.

The final table (`itemId`, `iid`, `title`, `description`) is saved as:

`csv_export/amazonbooks_metadata_merged_per_iid_clean.csv`.

In [2]:
import pandas as pd
from pathlib import Path
import html

BASE = Path(r"C:/Users/carlk/OneDrive/Documents/uoft/ECE1508H F/Project")

ITEM_MAP_PATH = BASE / "splits" / "item_id_map.parquet"
RATING_CSV_PATH = BASE / "Books_rating.csv"

OUT_PATH = BASE / "csv_export" / "amazonbooks_metadata_merged_per_iid_clean.csv"

# hyper-parameters
REVIEWS_PER_IID = 5
MAX_CHARS = 400

item_map = pd.read_parquet(ITEM_MAP_PATH)   # columns: ['itemId', 'iid']
print("item_map:", item_map.shape)

raw = pd.read_csv(RATING_CSV_PATH, dtype=str)
print("raw rating csv:", raw.shape)

if "review/summary" in raw.columns:
    desc_col = "review/summary"
elif "review/text" in raw.columns:
    desc_col = "review/text"
else:
    raise ValueError("No suitable review column found in Books_rating.csv")

df = raw[["Id", "Title", desc_col]].rename(
    columns={
        "Id": "itemId",
        "Title": "title",
        desc_col: "review"
    }
)

# drop empty / NaN reviews
df["review"] = df["review"].fillna("").str.strip()
df = df[df["review"] != ""]
print("after dropping empty reviews:", df.shape)

df = df.merge(item_map, on="itemId", how="inner")
# columns: itemId, title, review, iid
print("after merging with item_map:", df.shape)

rows = []

for title, g_title in df.groupby("title", sort=False):
    g_title = g_title.sample(frac=1.0, random_state=42)
    used_idx = set()

    for iid, g_iid in g_title.groupby("iid", sort=False):
        own = g_iid[~g_iid.index.isin(used_idx)]
        selected = own.head(REVIEWS_PER_IID)

        used_idx.update(selected.index)

        if len(selected) < REVIEWS_PER_IID:
            need = REVIEWS_PER_IID - len(selected)
            others = g_title[~g_title.index.isin(used_idx)]
            extra = others.head(need)
            selected = pd.concat([selected, extra])
            used_idx.update(extra.index)

        if selected.empty:
            continue

        text = " ".join(selected["review"].tolist())

        text = html.unescape(text)

        if len(text) > MAX_CHARS:
            words = text.split()
            acc = []
            cur_len = 0
            for w in words:
                add_len = len(w) + (1 if acc else 0)  # plus space if not first word
                if cur_len + add_len > MAX_CHARS:
                    break
                acc.append(w)
                cur_len += add_len
            text = " ".join(acc)

        rows.append({
            "itemId": selected["itemId"].iloc[0],
            "iid": int(iid),
            "title": title,
            "description": text
        })

meta_final = pd.DataFrame(rows)
print("final metadata shape:", meta_final.shape)
print(meta_final.head(5))

OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
meta_final.to_csv(OUT_PATH, index=False, encoding="utf-8")

print("\nSaved merged iid-level metadata to:", OUT_PATH)

item_map: (26522, 2)
raw rating csv: (3000000, 10)
after dropping empty reviews: (2999593, 3)
after merging with item_map: (2126178, 4)
final metadata shape: (26521, 4)
       itemId    iid                               title  \
0  0826414346   8253            Dr. Seuss: American Icon   
1  0789480662   7476   Eyewitness Travel Guide to Europe   
2  006000486X    143             Tess and the Highlander   
3  0671551345   4970  Night World: Daughters Of Darkness   
4  B000N7612G  23576                    The Food Of Love   

                                         description  
0  And to think that I read it on the tram! Essen...  
1  Great travel guide to Europe! nice eye candy, ...  
2  My new favorite book from the Avon True Romanc...  
3  i love LJ Smith! best of the Night World books...  
4  Book of Chance The food is sexier than the sex...  

Saved merged iid-level metadata to: C:\Users\carlk\OneDrive\Documents\uoft\ECE1508H F\Project\csv_export\amazonbooks_metadata_merged_per_ii