In [3]:
import pandas as pd
import numpy as np
import os
import json

# ==========================================
# 1. 配置路径与参数
# ==========================================
class Args:
    raw_data_dir = '../raw_data'                 # 确保目录下有 movies.dat 和 ratings.dat
    processed_data_dir = '../cold_start_dataset' # 输出到新文件夹
    dataset = 'ml-1m'

args = Args()
dataset_dir = f"{args.processed_data_dir}/{args.dataset}"
os.makedirs(dataset_dir, exist_ok=True)

print(f"Start processing {args.dataset} for Cold-Start (User-Split)...")

# ==========================================
# 2. 辅助函数 (文本清洗 & 迭代过滤)
# ==========================================
def norm_text(x):
    """文本标准化，处理空值"""
    if x is None: return None
    if isinstance(x, float) and pd.isna(x): return None
    x = str(x).strip()
    bad_tokens = {"", ".", "-", "--", "...", "N/A", "n/a", "None", "unknown"}
    if x in bad_tokens: return None
    return x

def iterative_filter(df, min_user=5, min_item=5):
    """k-core 过滤：递归剔除交互太少的用户和物品"""
    print(f"Starting iterative filter (min_user={min_user}, min_item={min_item})...")
    changed = True
    while changed:
        changed = False
        # Filter items
        item_counts = df['asin'].value_counts()
        valid_items = item_counts[item_counts >= min_item].index
        new_df = df[df['asin'].isin(valid_items)]
        if len(new_df) != len(df): changed = True
        df = new_df

        # Filter users
        user_counts = df['reviewerID'].value_counts()
        valid_users = user_counts[user_counts >= min_user].index
        new_df = df[df['reviewerID'].isin(valid_users)]
        if len(new_df) != len(df): changed = True
        df = new_df

    print(f"Filter done. Rows: {len(df)}")
    return df.reset_index(drop=True)

# ==========================================
# 3. 加载数据 & 修复潜在报错
# ==========================================
print("Loading raw data...")

# ---- Load Movies (Meta) ----
try:
    movies = pd.read_csv(
        os.path.join(args.raw_data_dir, args.dataset, 'movies.dat'),
        sep='::', engine='python', encoding='latin-1',
        names=['asin', 'title', 'category']
    )
except Exception as e:
    raise FileNotFoundError(f"读取 movies.dat 失败: {e}")

movies['title'] = movies['title'].apply(norm_text)
movies['category'] = movies['category'].apply(norm_text)
movies['description'] = None
movies['asin'] = movies['asin'].astype(str)

# ---- Load Ratings (Interactions) ----
try:
    ratings = pd.read_csv(
        os.path.join(args.raw_data_dir, args.dataset, 'movies.dat'),
        sep='::', engine='python', encoding='latin-1',
        names=['reviewerID', 'asin', 'rating', 'unixReviewTime']
    )
except Exception as e:
    raise FileNotFoundError(f"读取 ratings.dat 失败: {e}")

ratings['reviewerID'] = ratings['reviewerID'].astype(str)
ratings['asin'] = ratings['asin'].astype(str)

# ★★★ 修复核心：安全转换时间戳，防止 NaN 导致的 IntCastingNaNError ★★★
ratings['unixReviewTime'] = pd.to_numeric(ratings['unixReviewTime'], errors='coerce')
# 如果有无法转化的行（NaN），直接丢弃
ratings = ratings.dropna(subset=['unixReviewTime'])
# 安全转为 int
ratings['unixReviewTime'] = ratings['unixReviewTime'].astype(int)

# 仅保留在 Meta 中存在的 Item
valid_asins_meta = set(movies['asin'].values)
ratings = ratings[ratings['asin'].isin(valid_asins_meta)]

# 执行 K-core 过滤
ratings = iterative_filter(ratings, min_user=5, min_item=5)

# ==========================================
# 4. ID Remapping (0 ~ N-1)
# ==========================================
print("Remapping IDs...")
unique_users = sorted(ratings['reviewerID'].unique())
unique_items = sorted(ratings['asin'].unique())

user2id = {u: i for i, u in enumerate(unique_users)}
id2user = {i: u for i, u in enumerate(unique_users)}
item2id = {a: i for i, a in enumerate(unique_items)}
id2item = {i: a for i, a in enumerate(unique_items)}

# 应用映射
df_inter = ratings.copy()
df_inter['user_id'] = df_inter['reviewerID'].map(user2id)
df_inter['item_id'] = df_inter['asin'].map(item2id)
df_inter = df_inter[['user_id', 'item_id', 'unixReviewTime']]
# 按用户和时间排序
df_inter = df_inter.sort_values(['user_id', 'unixReviewTime']).reset_index(drop=True)

# 保存映射文件
for name, data in [('user2id', user2id), ('id2user', id2user),
                   ('item2id', item2id), ('id2item', id2item)]:
    with open(f"{dataset_dir}/{name}.json", "w") as f:
        json.dump(data, f)

# 保存 interactions.csv
df_inter.to_csv(f"{dataset_dir}/interactions.csv", index=False)

# 保存 Item Text
df_meta = movies[movies['asin'].isin(set(ratings['asin'].unique()))].copy()
df_meta['item_id'] = df_meta['asin'].map(item2id)
df_meta = df_meta.dropna(subset=['item_id'])
df_meta['item_id'] = df_meta['item_id'].astype(int)
df_meta = df_meta[['item_id', 'title', 'description', 'category']]
df_meta = df_meta.sort_values('item_id').reset_index(drop=True)
df_meta.to_csv(f"{dataset_dir}/item_text.csv", index=False)
print("Base data processing done.")

# ============================================================
# 5. Cold-start Split (User Split by Time) - 核心替换部分
# ============================================================
print("Splitting datasets (Cold-start User Split)...")

# 1. 计算每个用户的最后一次交互时间
user_last_time = (
    df_inter.groupby("user_id")["unixReviewTime"]
    .max()
    .reset_index()
    .rename(columns={"unixReviewTime": "last_time"})
)

# 2. 按 last_time 从新到旧排序 (Newest Users first)
user_last_time = user_last_time.sort_values("last_time", ascending=False).reset_index(drop=True)

num_users_total = len(user_last_time)
n_test  = int(num_users_total * 0.1) # Top 10% users (Newest)
n_valid = int(num_users_total * 0.1) # Next 10% users

# 获取 User ID 集合
test_users_set  = set(user_last_time.iloc[:n_test].user_id)
valid_users_set = set(user_last_time.iloc[n_test:n_test+n_valid].user_id)
train_users_set = set(user_last_time.iloc[n_test+n_valid:].user_id)

print(f"Split Result -> Train Users: {len(train_users_set)}, Valid Users: {len(valid_users_set)}, Test Users: {len(test_users_set)}")

# 3. 构建 user -> sequence 字典
user2seq = {}
# 使用 itertuples 加速遍历
for r in df_inter.itertuples():
    user2seq.setdefault(r.user_id, []).append(r.item_id)

# 4. 生成数据集 DataFrame
def build_dataset(user_set, mode='train'):
    data_rows = []
    for user in user_set:
        seq = user2seq.get(user, [])
        # 至少要有 1个历史 + 1个Target，否则没法预测
        if len(seq) < 2:
            continue

        # Cold start split 通常取全量历史预测最后一个
        # Input: seq[:-1] (0 to T-1)
        # Target: seq[-1] (T)
        hist = seq[:-1]
        nxt = seq[-1]
        data_rows.append([user, hist, nxt, len(hist)])

    return pd.DataFrame(data_rows, columns=['user_id', 'seq', 'next', 'len_seq'])

train_df = build_dataset(train_users_set, mode='train')
valid_df = build_dataset(valid_users_set, mode='valid')
test_df  = build_dataset(test_users_set,  mode='test')

# 5. 保存 .df 文件
train_df.to_pickle(f"{dataset_dir}/train_data.df")
valid_df.to_pickle(f"{dataset_dir}/val_data.df")
test_df.to_pickle(f"{dataset_dir}/test_data.df")

print(f"Samples generated - Train: {len(train_df)}, Valid: {len(valid_df)}, Test: {len(test_df)}")

# ==========================================
# 6. 生成统计信息
# ==========================================
num_items = len(item2id)
total_interactions = len(df_inter)
seq_lens = [len(v) for v in user2seq.values()]
sparsity = 1 - (total_interactions / (num_users_total * num_items))

info = {
    "num_users": num_users_total,
    "num_items": num_items,
    "total_interactions": total_interactions,
    "avg_seq_len": round(np.mean(seq_lens), 3),
    "sparsity": round(sparsity, 6),
    "train_samples": len(train_df),
    "valid_samples": len(valid_df),
    "test_samples": len(test_df)
}

# 保存 info CSV
pd.DataFrame([info]).to_csv(f"{dataset_dir}/{args.dataset}_info.csv", index=False)

# 保存 data_statis.df (AlphaFuse/BetaFuse 代码可能需要)
pd.DataFrame([{"item_num": num_items, "user_num": num_users_total}]).to_pickle(f"{dataset_dir}/data_statis.df")

print("Processing Done!")
print(info)

Start processing ml-1m for Cold-Start (User-Split)...
Loading raw data...
Starting iterative filter (min_user=5, min_item=5)...
Filter done. Rows: 0
Remapping IDs...
Base data processing done.
Splitting datasets (Cold-start User Split)...
Split Result -> Train Users: 0, Valid Users: 0, Test Users: 0
Samples generated - Train: 0, Valid: 0, Test: 0


ZeroDivisionError: division by zero