In [None]:
import json
import pandas as pd
from tqdm import tqdm
import os

In [None]:
# Configuration
# Dataset name; every input/output file name below is derived from it.
DATASET = "Grocery_and_Gourmet_Food"
# Directory that holds the raw .jsonl files and receives the generated outputs.
DATA_DIR = "../data"
# Minimum number of interactions a user and an item must each have to be kept.
MIN_INTERACTIONS = 5

def _iter_jsonl(path):
    """Yield one parsed JSON object per non-blank line of *path*.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default encoding. Blank lines are skipped instead of crashing
    ``json.loads``. Progress is shown with a tqdm bar.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line = line.strip()
            if line:
                yield json.loads(line)


def _k_core_filter(df, k):
    """Return the rows of *df* whose user and item both have at least *k* rows.

    Filtering is repeated until a full pass removes nothing, because dropping
    sparse users can push items below the threshold and vice versa. A copy is
    returned so the caller can assign columns without SettingWithCopyWarning.
    """
    while True:
        rows_before = len(df)

        user_counts = df['user_id'].value_counts()
        df = df[df['user_id'].isin(user_counts[user_counts >= k].index)]

        item_counts = df['item_id'].value_counts()
        df = df[df['item_id'].isin(item_counts[item_counts >= k].index)]

        # Nothing removed in this pass => every remaining user/item has >= k rows.
        if len(df) == rows_before:
            return df.copy()


def main():
    """Preprocess the raw review/meta JSONL files into training inputs.

    Reads ``{DATASET}.jsonl`` and ``meta_{DATASET}.jsonl`` from ``DATA_DIR``,
    applies ``MIN_INTERACTIONS``-core filtering, re-encodes user and item ids
    as consecutive integers (in sorted order of the original ids), and writes:

    * ``{DATASET}.csv`` -- header-less ``user_id,item_id`` rows sorted by
      user and then timestamp;
    * ``{DATASET}_caption.json`` -- mapping of the new item id (as a string)
      to the item title, ``'Unknown Product'`` when no title is available.

    Returns ``None``; prints a message and returns early if an input file is
    missing.
    """
    print("="*60)
    print("Amazon 数据预处理")
    print("="*60)

    # File paths
    review_file = os.path.join(DATA_DIR, f'{DATASET}.jsonl')
    meta_file = os.path.join(DATA_DIR, f'meta_{DATASET}.jsonl')
    output_csv = os.path.join(DATA_DIR, f'{DATASET}.csv')
    output_caption = os.path.join(DATA_DIR, f'{DATASET}_caption.json')

    # Check that both inputs exist before doing any work
    for required in (review_file, meta_file):
        if not os.path.exists(required):
            print(f"❌ 找不到: {required}")
            return

    # Step 1: read reviews
    print("\n[1/5] 读取reviews...")
    reviews = []
    for data in _iter_jsonl(review_file):
        user_id = data.get('user_id')
        # `or` also falls back to `asin` when `parent_asin` is present but null/empty.
        item_id = data.get('parent_asin') or data.get('asin')
        # Skip records without a usable id (same rule the meta step applies);
        # otherwise they would all be merged into one fake '' user/item.
        if not user_id or not item_id:
            continue
        reviews.append((user_id, item_id, data.get('timestamp') or 0))

    # Explicit columns keep the frame well-formed even when no reviews were read.
    df = pd.DataFrame(reviews, columns=['user_id', 'item_id', 'timestamp'])
    print(f"   原始交互: {len(df)}")

    # Step 2: k-core filtering
    print("\n[2/5] 过滤数据 (5-core)...")
    df = _k_core_filter(df, MIN_INTERACTIONS)

    print(f"   过滤后交互: {len(df)}")
    print(f"   用户数: {df['user_id'].nunique()}")
    print(f"   物品数: {df['item_id'].nunique()}")

    # Step 3: re-encode ids as consecutive integers, ordered by original id
    print("\n[3/5] 重新编码ID...")
    user_map = {old: new for new, old in enumerate(sorted(df['user_id'].unique()))}
    item_map = {old: new for new, old in enumerate(sorted(df['item_id'].unique()))}

    df['user_id'] = df['user_id'].map(user_map)
    df['item_id'] = df['item_id'].map(item_map)

    # Step 4: sort each user's interactions chronologically and save the CSV
    print("\n[4/5] 保存交互序列...")
    df = df.sort_values(['user_id', 'timestamp'])
    df[['user_id', 'item_id']].to_csv(output_csv, header=False, index=False)
    print(f"   ✅ {output_csv}")

    # Step 5: read meta and save captions
    print("\n[5/5] 提取物品标题...")
    titles = {}
    for item in _iter_jsonl(meta_file):
        item_id = item.get('parent_asin') or item.get('asin')
        # Only titles of items that survived filtering are needed.
        if item_id in item_map:
            titles[item_id] = item.get('title') or 'Unknown Product'

    # item_map was built in ascending new-id order, so captions are too.
    captions = {
        str(new_id): titles.get(old_id, 'Unknown Product')
        for old_id, new_id in item_map.items()
    }

    # ensure_ascii=False writes raw non-ASCII characters, so the encoding must be UTF-8.
    with open(output_caption, 'w', encoding='utf-8') as f:
        json.dump(captions, f, ensure_ascii=False, indent=2)
    print(f"   ✅ {output_caption}")

    # Summary statistics
    n_users = df['user_id'].nunique()
    n_items = df['item_id'].nunique()
    # Guard the average against an empty result (nothing survived filtering).
    avg_per_user = len(df) / n_users if n_users else 0.0
    print("\n" + "="*60)
    print("完成！数据统计:")
    print(f"  用户数: {n_users}")
    print(f"  物品数: {n_items}")
    print(f"  交互数: {len(df)}")
    print(f"  平均每用户交互: {avg_per_user:.1f}")
    print("="*60)

# Run the preprocessing pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()

Amazon 数据预处理

[1/5] 读取reviews...


14318520it [00:39, 359547.96it/s]


   原始交互: 14318520

[2/5] 过滤数据 (5-core)...
   过滤后交互: 4125640
   用户数: 419876
   物品数: 135194

[3/5] 重新编码ID...

[4/5] 保存交互序列...
   ✅ ../data/Grocery_and_Gourmet_Food.csv

[5/5] 提取物品标题...


603274it [00:05, 101309.95it/s]


   ✅ ../data/Grocery_and_Gourmet_Food_caption.json

完成！数据统计:
  用户数: 419876
  物品数: 135194
  交互数: 4125640
  平均每用户交互: 9.8
