### 1. 先处理meta_{data}.json.gz文件，读取并清洗后，得到df_meta 也就是item table

In [1]:
import json
import gzip

import numpy as np
import pandas as pd
from datetime import datetime

# Timestamp of this run, formatted for the start-up log line printed below.
start_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

class Args:
    """Static configuration for this preprocessing run (directories and dataset name)."""
    # Root directory of the raw dumps; inputs are read from {raw_data_dir}/{dataset}/.
    raw_data_dir = '../raw_data'
    # Root directory under which the processed files for each dataset are written.
    processed_data_dir = '../cold_start_dataset'
    # Dataset name; substituted into the input file names and the output sub-directory.
    dataset = 'Electronics'
# Dataset names listed by the author (presumably the supported values -- TODO confirm):
# ['Sports_and_Outdoors', 'Beauty', 'Toys_and_Games', 'Yelp', 'Electronics']

args = Args()

print(f'Start preprocessing {args.dataset} at {start_time}...')
# Step 1: process meta_{data}.json.gz. Example of one metadata record:
#   "asin": "0000031852",
#   "title": "Girls Ballet Tutu Zebra Hot Pink",
#   "price": 3.17,
#   "imUrl": "http://ecx.images-amazon.com/images/I/51fAmVkTbyL._SY300_.jpg",
#   "related":
#   {
#     "also_bought": ["B00D103F8U", "B007R2RM8W"],
#     "also_viewed": ["B00E79VW6Q", "B00D10CLVW", "B00B0AVO54", "B00E95LC8Q", "B00GOR92SO", "B007ZN5Y56", "B00AL2569W", "B00B608000", "B008F0SMUC", "B00BFXLZ8M"],
#     "bought_together": ["B002BZX8Z6"]
#   },
#   "salesRank": {"Toys & Games": 211836},
#   "brand": "Coxlures",
#   "categories": [["Sports & Outdoors", "Other Sports", "Dance"]]

# Path of the gzip-compressed metadata dump for the selected dataset.
filename = f"{args.raw_data_dir}/{args.dataset}/meta_{args.dataset}.json.gz"

def parse(path):
    """Yield one record per non-blank line of a gzip-compressed dump.

    Each line is expected to hold a single Python/JSON-style literal
    (typically a dict). Lines are parsed with ``ast.literal_eval`` rather
    than ``eval`` so that content of the data file can never execute code;
    only literals (dicts, lists, strings, numbers, ...) are accepted.

    Args:
        path: Path to the ``.gz`` file.

    Yields:
        The object described by each non-blank line.

    Raises:
        ValueError / SyntaxError: if a line is not a plain literal.
    """
    import ast  # local import keeps this definition self-contained

    # Text mode: literal_eval needs str, and eval() on bytes also assumed UTF-8.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            l = l.strip()
            if not l:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            yield ast.literal_eval(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are yielded and
    that number becomes the row label; the keys of each record become the
    columns.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

# Read the whole metadata dump into memory as the raw item frame.
df = getDF(filename)


Start preprocessing Electronics at 2026-01-25-18-55-56...


In [2]:
# Preview the first rows of the raw metadata frame.
df.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,132793040,http://ecx.images-amazon.com/images/I/31JIPhp%...,The Kelby Training DVD Mastering Blend Modes i...,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Mastering Blend Modes in A...,,,,
1,321732944,http://ecx.images-amazon.com/images/I/31uogm6Y...,,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,,
2,439886341,http://ecx.images-amazon.com/images/I/51k0qa8f...,Digital Organizer and Messenger,"[[Electronics, Computers & Accessories, PDAs, ...",Digital Organizer and Messenger,8.15,{'Electronics': 144944},"{'also_viewed': ['0545016266', 'B009ECM8QY', '...",
3,511189877,http://ecx.images-amazon.com/images/I/41HaAhbv...,The CLIKR-5 UR5U-8780L remote control is desig...,"[[Electronics, Accessories & Supplies, Audio &...",CLIKR-5 Time Warner Cable Remote Control UR5U-...,23.36,,"{'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...",
4,528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",


In [3]:
import numpy as np
import pandas as pd

# Keep only the required fields; .copy() gives an independent frame so the
# column assignments in the cleaning steps below do not touch `df`.
df_meta = df[['asin', 'title', 'description', 'categories']].copy()

# ============================================================
# 1. Text normalisation (drop blank / meaningless content)
# ============================================================
def norm_text(x):
    """Return ``x`` as a stripped string, or ``None`` when it carries no content.

    ``None`` is returned for: ``None``, a float NaN, a value whose stripped
    string form is one of the placeholder tokens below (exact, case-sensitive
    match), or a string without a single alphanumeric character. Any other
    value is converted with ``str`` and stripped of surrounding whitespace.
    """
    # Missing values.
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None

    text = str(x).strip()

    # Placeholder strings that stand in for "no value".
    placeholders = {
        "", ".", "-", "--", "...",
        "N/A", "n/a", "N.a.", "None", "none",
        "Unknown", "unknown",
        "See description", "see description",
        "No description"
    }
    if text in placeholders:
        return None

    # Pure punctuation / symbols carry no meaning.
    has_alnum = any(ch.isalnum() for ch in text)
    return text if has_alnum else None


# ============================================================
# 2. Clean title
# ============================================================
# Normalise every title; entries without content become None.
df_meta['title'] = df_meta['title'].apply(norm_text)
# Character length of the cleaned title (0 when it is not a string, i.e. missing).
df_meta['title_len'] = df_meta['title'].apply(lambda x: len(x) if isinstance(x, str) else 0)

# Titles shorter than 3 characters are treated as noise
df_meta.loc[df_meta['title_len'] < 3, 'title'] = None


# ============================================================
# 3. Clean description
# ============================================================
def clean_description(desc):
    """Flatten a raw description value into one cleaned string, or ``None``.

    * ``str``  -> normalised with ``norm_text``.
    * ``list`` -> every element is stringified and normalised; the surviving
      pieces are joined with single spaces (``None`` when nothing survives).
    * ``dict`` -> rendered as space-separated ``key:value`` pairs, then
      normalised.
    * anything else (including missing values) -> ``None``.
    """
    if isinstance(desc, str):
        return norm_text(desc)

    if isinstance(desc, list):
        parts = []
        for element in desc:
            cleaned = norm_text(str(element))
            if cleaned:
                parts.append(cleaned)
        if not parts:
            return None
        return " ".join(parts)

    if isinstance(desc, dict):
        pairs = (f"{key}:{value}" for key, value in desc.items())
        return norm_text(" ".join(pairs))

    return None


# Flatten and normalise every description; entries without content become None.
df_meta['description'] = df_meta['description'].apply(clean_description)
# Character length of the cleaned description (0 when it is not a string, i.e. missing).
df_meta['desc_len'] = df_meta['description'].apply(lambda x: len(x) if isinstance(x, str) else 0)

# Drop extremely short descriptions (fewer than 5 characters)
df_meta.loc[df_meta['desc_len'] < 5, 'description'] = None


# ============================================================
# 4. Clean category (must be present)
# ============================================================
def clean_category(cats):
    """Pick the most specific usable label from the last category path.

    ``cats`` is expected to be a list of paths, each path a list of labels.
    Only the last path is inspected; the result is its right-most label that
    survives ``norm_text``. ``None`` is returned when ``cats`` is not a
    non-empty list, when its last element is not a list, or when no label
    in that path survives.
    """
    if not isinstance(cats, list) or not cats:
        return None

    path = cats[-1]
    if not isinstance(path, list):
        return None

    # Walk from the leaf upwards and stop at the first usable label.
    for label in reversed(path):
        cleaned = norm_text(label)
        if cleaned:
            return cleaned
    return None


# One category label per item, taken from the raw `categories` field by clean_category.
df_meta['category'] = df_meta['categories'].apply(clean_category)

# Category is a hard constraint: rows without one are dropped
df_meta = df_meta.dropna(subset=['category'])

# ============================================================
# 5. At least one of title / description must be present
# ============================================================
df_meta = df_meta[(df_meta['title'].notna()) | (df_meta['description'].notna())]

# Final structured df_meta (keeps asin/title/description/category; the helper
# length columns and the raw `categories` column are dropped here)
df_meta = df_meta[['asin', 'title', 'description', 'category']].reset_index(drop=True)

print("Clean df_meta rows:", len(df_meta))
df_meta.head()

Clean df_meta rows: 496450


Unnamed: 0,asin,title,description,category
0,132793040,Kelby Training DVD: Mastering Blend Modes in A...,The Kelby Training DVD Mastering Blend Modes i...,Monitor Accessories
1,321732944,Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,Monitor Accessories
2,439886341,Digital Organizer and Messenger,Digital Organizer and Messenger,PDAs & Handhelds
3,511189877,CLIKR-5 Time Warner Cable Remote Control UR5U-...,The CLIKR-5 UR5U-8780L remote control is desig...,TV Remote Controls
4,528881469,Rand McNally 528881469 7-inch Intelliroute TND...,"Like its award-winning predecessor, the Intell...",Trucking GPS


### 2. 接下来处理review_{data}_5.json.gz
需要考虑的是，review_{data}_5.json.gz中可能包含已经不在item table中的item，所以要进行清洗

In [4]:
# 接下来处理review_{data}_5
#   "reviewerID": "A2SUAM1J3GNN3B",
#   "asin": "0000013714",
#   "reviewerName": "J. McDonald",
#   "helpful": [2, 3],
#   "reviewText": "I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!",
#   "overall": 5.0,
#   "summary": "Heavenly Highway Hymns",
#   "unixReviewTime": 1252800000,
#   "reviewTime": "09 13, 2009"

import gzip
import numpy as np
import pandas as pd
import json

def parse_review(path):
    """Yield one review record per non-blank line of a gzip-compressed dump.

    The file is opened in a ``with`` block so the handle is closed when the
    generator finishes (or is discarded). Lines are parsed with
    ``ast.literal_eval`` rather than ``eval`` so that content of the data
    file can never execute code; only literals are accepted.

    Args:
        path: Path to the ``.gz`` file.

    Yields:
        The object (typically a dict) described by each non-blank line.

    Raises:
        ValueError / SyntaxError: if a line is not a plain literal.
    """
    import ast  # local import keeps this definition self-contained

    # Text mode: literal_eval needs str, and eval() on bytes also assumed UTF-8.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            l = l.strip()
            if not l:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            yield ast.literal_eval(l)

def get_review_DF(path):
    """Load every record produced by ``parse_review(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in the order they are yielded and
    that number becomes the row label; the keys of each record become the
    columns.
    """
    numbered_reviews = dict(enumerate(parse_review(path)))
    return pd.DataFrame.from_dict(numbered_reviews, orient='index')

# Path of the gzip-compressed "_5" review dump for the selected dataset.
review_file_name = f"{args.raw_data_dir}/{args.dataset}/reviews_{args.dataset}_5.json.gz"

# Read the whole review dump into memory.
data_review = get_review_DF(review_file_name)

print("Review raw data length:", len(data_review))

Review raw data length: 1689188


In [5]:
# Preview the first rows of the raw review frame.
data_review.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


In [6]:

# asins that survived the metadata cleaning
valid_asin_id = set(df_meta['asin'].values)

# Keep only the interaction columns, with the timestamp as an integer
review_cols = ['reviewerID', 'asin', 'unixReviewTime']
data_review_copy = data_review[review_cols].astype({'unixReviewTime': int})

# A review is kept when its item exists in the metadata and its timestamp is positive
keep = data_review_copy['asin'].isin(valid_asin_id) & (data_review_copy['unixReviewTime'] > 0)
data_review_copy = data_review_copy[keep].reset_index(drop=True)

print("Review data after merging with metadata:", len(data_review_copy))


Review data after merging with metadata: 1667219


In [7]:
# Repeatedly filter the review table until every remaining user and item
# meets its minimum count; ids are remapped only after this is stable.
def iterative_filter(df, min_user=5, min_item=5):
    """Drop rare items and rare users until a full pass removes no rows.

    Each pass first keeps rows whose ``asin`` occurs at least ``min_item``
    times, then keeps rows whose ``reviewerID`` occurs at least ``min_user``
    times. Removing rows in one step can push other users/items below their
    threshold, so passes repeat until the row count stops changing.

    Args:
        df: Frame with ``asin`` and ``reviewerID`` columns.
        min_user: Minimum number of rows a user must have.
        min_item: Minimum number of rows an item must have.

    Returns:
        The filtered frame with a fresh 0..n-1 index.
    """
    def _keep_frequent(frame, column, threshold):
        # Rows whose value in `column` appears at least `threshold` times.
        counts = frame[column].value_counts()
        frequent = counts[counts >= threshold].index
        return frame[frame[column].isin(frequent)]

    while True:
        rows_before = len(df)
        df = _keep_frequent(df, 'asin', min_item)
        df = _keep_frequent(df, 'reviewerID', min_user)
        # Filters only ever remove rows, so an unchanged count means a fixed point.
        if len(df) == rows_before:
            break

    return df.reset_index(drop=True)

# Apply the filter with its default thresholds (min_user=5, min_item=5).
data_review_clean = iterative_filter(data_review_copy)
print("After iterative filtering:", len(data_review_clean))

After iterative filtering: 1649975


In [8]:
# Size of the cleaned item table before it is restricted to reviewed items.
len(df_meta)

496450

In [9]:
# After cleaning review_{data}_5.json.gz, restrict the item table to the items
# that still appear in the filtered reviews
valid_asins = set(data_review_clean['asin'].unique())
df_meta_clean = df_meta[df_meta['asin'].isin(valid_asins)]

print('After cleaning get df_meta_clean length:', len(df_meta_clean))


After cleaning get df_meta_clean length: 62237


### 3. user/item重映射，并按userID-unixReviewTime进行排序

In [10]:
import json
import os

# Output directory for this dataset (created if missing).
dataset_dir = f"{args.processed_data_dir}/{args.dataset}"
os.makedirs(dataset_dir, exist_ok=True)

# Work on a copy: the column assignments in Step 3 would otherwise add
# user_id / item_id to data_review_clean itself, silently mutating the
# result of an earlier cell.
df_ = data_review_clean.copy()

# -----------------------------
# Step 1: collect the unique users and items (sorted, so ids are reproducible)
# -----------------------------
unique_users = sorted(df_['reviewerID'].unique())
unique_items = sorted(df_['asin'].unique())

# -----------------------------
# Step 2: build the mapping dictionaries. Item ids must start at 0 to match
# the "padding_idx=item_num" logic of this project's backbone_SASRec code.
# -----------------------------
user2id = {u: i for i, u in enumerate(unique_users)}
id2user = {i: u for i, u in enumerate(unique_users)}

item2id = {a: i for i, a in enumerate(unique_items)}
id2item = {i: a for i, a in enumerate(unique_items)}

# -----------------------------
# Step 3: apply the mappings to the interaction table
# -----------------------------
df_['user_id'] = df_['reviewerID'].map(user2id)
df_['item_id'] = df_['asin'].map(item2id)

# Keep only the remapped columns, ordered by user and then by review time.
df_ = df_[['user_id', 'item_id', 'unixReviewTime']]
df_ = df_.sort_values(['user_id', 'unixReviewTime']).reset_index(drop=True)

# -----------------------------
# Step 4: save the mapping files
# Note: JSON object keys are strings, so the integer keys of id2user /
# id2item are written as "0", "1", ... and must be converted back on load.
# -----------------------------
with open(f"{dataset_dir}/user2id.json", "w") as f:
    json.dump(user2id, f)

with open(f"{dataset_dir}/id2user.json", "w") as f:
    json.dump(id2user, f)

with open(f"{dataset_dir}/item2id.json", "w") as f:
    json.dump(item2id, f)

with open(f"{dataset_dir}/id2item.json", "w") as f:
    json.dump(id2item, f)

# -----------------------------
# Step 5: save the remapped interaction table
# -----------------------------
df_.to_csv(f"{dataset_dir}/interactions.csv", index=False)

print("Remapping done!")
print("#Users =", len(unique_users))
print("#Items =", len(unique_items))
print("Mapped interactions saved to:", f"{dataset_dir}/interactions.csv")


Remapping done!
#Users = 188305
#Items = 62237
Mapped interactions saved to: ../cold_start_dataset/Electronics/interactions.csv


In [11]:
# ============================================================
# Filter the cleaned metadata by valid_asins once more
# (built from df_meta, so the cleaning result itself is not overwritten)
# ============================================================

# .copy() gives an independent frame for the column assignments below.
df_meta_clean = df_meta[df_meta['asin'].isin(valid_asins)].copy()

print("meta rows after valid_asins filter:", len(df_meta_clean))

# Attach the integer item_id
df_meta_clean['item_id'] = df_meta_clean['asin'].map(item2id)

# Drop rows whose asin has no mapping (map() leaves NaN there), then cast the ids to int
df_meta_clean = df_meta_clean.dropna(subset=['item_id'])
df_meta_clean['item_id'] = df_meta_clean['item_id'].astype(int)

# Final columns
# NOTE(review): rows keep df_meta's order and are not sorted by item_id, so the
# row position is not the item_id (the preview of this cell shows item_id 7 in
# row 4). If a consumer indexes item_text.csv by position, sort by item_id
# before saving -- confirm against the loader.
df_meta_clean = df_meta_clean[['item_id', 'title', 'description', 'category']]
df_meta_clean = df_meta_clean.reset_index(drop=True)

# Save item_text.csv
df_meta_clean.to_csv(f"{dataset_dir}/item_text.csv", index=False)

print("Saved structured item_text.csv. Final rows:", len(df_meta_clean))
df_meta_clean.head()


meta rows after valid_asins filter: 62237
Saved structured item_text.csv. Final rows: 62237


Unnamed: 0,item_id,title,description,category
0,0,Rand McNally 528881469 7-inch Intelliroute TND...,"Like its award-winning predecessor, the Intell...",Trucking GPS
1,1,Barnes &amp; Noble HDTV Adapter Kit for NOOK H...,HDTV Adapter Kit for NOOK HD and NOOK HD+\nThi...,Chargers & Adapters
2,2,Barnes &amp; Noble OV/HB-ADP Universal Power Kit,Power up your device with this Barnes &amp; No...,Power Adapters
3,3,VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,The VideoSecu TV mount is a mounting solution ...,TV Ceiling & Wall Mounts
4,7,Barnes &amp; Noble Nook eReader - no 3G,Barnes & Noble Nook eReader - no 3GMeet nook. ...,eBook Readers & Accessories


In [12]:
# Re-read the saved interaction table from disk and preview it.
dataset_dir = f"{args.processed_data_dir}/{args.dataset}"
inter_path = f"{dataset_dir}/interactions.csv"

inter_df = pd.read_csv(inter_path)
inter_df.head()

Unnamed: 0,user_id,item_id,unixReviewTime
0,0,13112,1400457600
1,0,17896,1400457600
2,0,28166,1400457600
3,0,29078,1400457600
4,0,61527,1400457600


### 4. 切分数据集并生成数据集信息

In [13]:
# ============================================================
# 4. Cold-start split by USER last interaction time (8:1:1)
# ============================================================

import pandas as pd
import numpy as np
import os, json

# Load the remapped interaction table written by the remapping step.
dataset_dir = f"{args.processed_data_dir}/{args.dataset}"
inter_path = f"{dataset_dir}/interactions.csv"

inter_df = pd.read_csv(inter_path)

# ------------------------------------------------
# Step 1: compute each user's last interaction time
# ------------------------------------------------
user_last_time = (
    inter_df.groupby("user_id")["unixReviewTime"]
    .max()
    .reset_index()
    .rename(columns={"unixReviewTime": "last_time"})
)

# Sort by last_time, newest first
# NOTE(review): users sharing the same last_time are ordered by the sort
# algorithm only, so which of them land on either side of a split boundary
# is not fixed by an explicit tie-break -- confirm whether that matters.
user_last_time = user_last_time.sort_values(
    "last_time", ascending=False
).reset_index(drop=True)

# 10% of the users (rounded down) go to test, 10% to valid, the rest to train.
num_users = len(user_last_time)
n_test  = int(num_users * 0.1)
n_valid = int(num_users * 0.1)

# Most recent users -> test, next most recent -> valid, all older users -> train.
test_users  = set(user_last_time.iloc[:n_test].user_id)
valid_users = set(user_last_time.iloc[n_test:n_test+n_valid].user_id)
train_users = set(user_last_time.iloc[n_test+n_valid:].user_id)

print("Cold-start split:")
print("train users:", len(train_users))
print("valid users:", len(valid_users))
print("test users :", len(test_users))

# ------------------------------------------------
# Step 2: build user -> item sequence (still ordered by time)
# ------------------------------------------------
inter_df = inter_df.sort_values(["user_id", "unixReviewTime"])

# Append items in row order, so each list follows the sort applied above.
user2seq = {}
for r in inter_df.itertuples():
    user2seq.setdefault(r.user_id, []).append(r.item_id)

# Row buffers that build_rows fills in place.
train_rows, valid_rows, test_rows = [], [], []

def build_rows(user_set, target_rows):
    """Append one ``[user, history, next_item, len(history)]`` row per user.

    For every user in ``user_set`` the sequence is looked up in the global
    ``user2seq``; the last item becomes the prediction target and everything
    before it the history. Users with fewer than two items are skipped.
    ``target_rows`` is extended in place; nothing is returned.
    """
    for uid in user_set:
        items = user2seq.get(uid, [])
        if len(items) >= 2:
            # Split off the final item; `history` is a new list.
            *history, target = items
            target_rows.append([uid, history, target, len(history)])

# One sample per user: history = all items but the last, next = the last item.
build_rows(train_users, train_rows)
build_rows(valid_users, valid_rows)
build_rows(test_users,  test_rows)

train_df = pd.DataFrame(train_rows, columns=["user_id", "seq", "next", "len_seq"])
valid_df = pd.DataFrame(valid_rows, columns=["user_id", "seq", "next", "len_seq"])
test_df  = pd.DataFrame(test_rows,  columns=["user_id", "seq", "next", "len_seq"])

# ------------------------------------------------
# Step 3: save the files (author's note: format identical to "Attachment 1")
# ------------------------------------------------
# The ".df" files are pandas pickles.
train_df.to_pickle(f"{dataset_dir}/train_data.df")
valid_df.to_pickle(f"{dataset_dir}/val_data.df")
test_df.to_pickle(f"{dataset_dir}/test_data.df")

print("Samples:")
print("train:", len(train_df))
print("valid:", len(valid_df))
print("test :", len(test_df))

# ------------------------------------------------
# Step 4: dataset_info.csv & data_statis.df
# ------------------------------------------------
num_items = inter_df.item_id.nunique()
total_interactions = len(inter_df)

# Full sequence length per user (history plus the held-out last item).
seq_lens = [len(v) for v in user2seq.values()]
# Fraction of the user x item matrix that has no interaction.
sparsity = 1 - (total_interactions / (num_users * num_items))

dataset_info = {
    "num_users": num_users,
    "num_items": num_items,
    "total_interactions": total_interactions,
    "avg_seq_len": round(np.mean(seq_lens), 3),
    "median_seq_len": np.median(seq_lens),
    "min_seq_len": min(seq_lens),
    "max_seq_len": max(seq_lens),
    "sparsity": round(sparsity, 6),
    "train_samples": len(train_df),
    "valid_samples": len(valid_df),
    "test_samples": len(test_df),
}

# Single-row CSV holding the summary statistics.
pd.DataFrame([dataset_info]).to_csv(
    f"{dataset_dir}/{args.dataset}_info.csv", index=False
)

# Single-row pickle with the item and user counts
# (presumably read by the training code to size its tables -- TODO confirm).
stat = {
    "item_num": num_items,
    "user_num": num_users
}
pd.DataFrame([stat]).to_pickle(f"{dataset_dir}/data_statis.df")

print("Cold-start dataset processing finished.")

Cold-start split:
train users: 150645
valid users: 18830
test users : 18830
Samples:
train: 150645
valid: 18830
test : 18830
Cold-start dataset processing finished.


In [14]:
# Show the summary statistics that were written to the info CSV.
print(dataset_info)

{'num_users': 188305, 'num_items': 62237, 'total_interactions': 1649975, 'avg_seq_len': 8.762, 'median_seq_len': 7.0, 'min_seq_len': 5, 'max_seq_len': 405, 'sparsity': 0.999859, 'train_samples': 150645, 'valid_samples': 18830, 'test_samples': 18830}
