Generate the processed dataset files for "MovieLens-1M"

In [5]:
import pandas as pd
import numpy as np
import os
import json

# ==========================================
# 1. Path configuration
# ==========================================
class Args:
    """Static path configuration for this preprocessing run."""

    # Dataset name; also used as the sub-directory name under both roots.
    dataset = 'ml-1m'
    # Root directory that contains the raw dataset files.
    raw_data_dir = '../raw_data'
    # Root directory that receives the processed output.
    processed_data_dir = '../dataset'

# Instantiate the configuration and derive the per-dataset output directory.
args = Args()
dataset_dir = f"{args.processed_data_dir}/{args.dataset}"
# Create the output directory up front; exist_ok avoids an error when it already exists.
os.makedirs(dataset_dir, exist_ok=True)

print(f"Start processing {args.dataset}...")

# ==========================================
# 2. Helper functions (text cleaning & iterative filtering)
# ==========================================
def norm_text(x):
    """Clean a raw text field, mapping missing or meaningless values to None.

    Returns None when ``x`` is None, is a float NaN, is one of the placeholder
    tokens (compared after stripping surrounding whitespace), or contains no
    alphanumeric character at all. Any other value is converted with ``str``
    and returned stripped.
    """
    # Only float inputs are probed with pd.isna.
    is_nan_float = isinstance(x, float) and pd.isna(x)
    if x is None or is_nan_float:
        return None

    text = str(x).strip()
    placeholders = {"", ".", "-", "--", "...", "N/A", "n/a", "None", "unknown"}
    # Reject known placeholders and strings made up purely of symbols.
    if text in placeholders or not any(ch.isalnum() for ch in text):
        return None
    return text

def iterative_filter(df, min_user=5, min_item=5, *,
                     user_col='reviewerID', item_col='asin'):
    """Iteratively drop items and users with too few interactions (k-core).

    Each pass first removes rows whose item occurs fewer than ``min_item``
    times, then removes rows whose user occurs fewer than ``min_user`` times.
    Passes repeat until one of them removes no rows.

    Args:
        df: Interaction table with one row per (user, item) interaction.
        min_user: Minimum number of rows a user must have to be kept.
        min_item: Minimum number of rows an item must have to be kept.
        user_col: Name of the user-ID column (keyword-only).
        item_col: Name of the item-ID column (keyword-only).

    Returns:
        A filtered DataFrame with a fresh 0..N-1 index.
    """
    while True:
        rows_before = len(df)

        # ---- filter items ----
        item_counts = df[item_col].value_counts()
        valid_items = item_counts[item_counts >= min_item].index
        df = df[df[item_col].isin(valid_items)]

        # ---- filter users ----
        user_counts = df[user_col].value_counts()
        valid_users = user_counts[user_counts >= min_user].index
        df = df[df[user_col].isin(valid_users)]

        # Filtering only ever removes rows, so an unchanged row count means
        # neither step dropped anything and the result is stable.
        if len(df) == rows_before:
            break
    return df.reset_index(drop=True)

# ==========================================
# 3. Load data (movies & ratings)
# ==========================================
print("Loading data...")

# movies.dat holds the item metadata.
# ML-1M format: MovieID::Title::Genres
movies = pd.read_csv(
    os.path.join(args.raw_data_dir, args.dataset, 'movies.dat'),
    names=['asin', 'title', 'category'],
    sep='::',
    engine='python',
    encoding='latin-1',
)

# Clean the metadata text columns; the genres value is kept as one string.
for text_col in ('title', 'category'):
    movies[text_col] = movies[text_col].apply(norm_text)
movies['description'] = None  # ML-1M provides no description field
movies['asin'] = movies['asin'].astype(str)  # item IDs are handled as strings

# ratings.dat holds the interactions.
# ML-1M format: UserID::MovieID::Rating::Timestamp
ratings = pd.read_csv(
    os.path.join(args.raw_data_dir, args.dataset, 'ratings.dat'),
    names=['reviewerID', 'asin', 'rating', 'unixReviewTime'],
    sep='::',
    engine='python',
    encoding='latin-1',
)
ratings = ratings.astype({'reviewerID': str, 'asin': str, 'unixReviewTime': int})

# Preliminary filter: keep only ratings whose item appears in the metadata.
valid_asins = set(movies['asin'].values)
has_meta = ratings['asin'].isin(valid_asins)
ratings = ratings.loc[has_meta]

print(f"Raw ratings loaded: {len(ratings)}")

# ==========================================
# 4. Filter interactions and build ID mappings
# ==========================================
print("Applying iterative filter...")
data_review_clean = iterative_filter(ratings, min_user=5, min_item=5)
print(f"Ratings after filter: {len(data_review_clean)}")

# Contiguous IDs (0 ~ N-1), assigned in sorted order of the raw IDs.
unique_users = sorted(data_review_clean['reviewerID'].unique())
unique_items = sorted(data_review_clean['asin'].unique())

id2user = dict(enumerate(unique_users))
id2item = dict(enumerate(unique_items))
user2id = {raw: idx for idx, raw in id2user.items()}
item2id = {raw: idx for idx, raw in id2item.items()}

# Re-express every interaction with the mapped IDs, then order the rows by
# user and, within a user, by timestamp.
df_inter = pd.DataFrame({
    'user_id': data_review_clean['reviewerID'].map(user2id),
    'item_id': data_review_clean['asin'].map(item2id),
    'unixReviewTime': data_review_clean['unixReviewTime'],
})
df_inter = df_inter.sort_values(['user_id', 'unixReviewTime']).reset_index(drop=True)

# Persist the four mapping tables as JSON.
# NOTE: JSON object keys are always strings, so the integer keys of
# id2user / id2item are written out as strings.
mappings = {'user2id': user2id, 'id2user': id2user,
            'item2id': item2id, 'id2item': id2item}
for name, data in mappings.items():
    with open(f"{dataset_dir}/{name}.json", "w") as f:
        json.dump(data, f)

# Persist the mapped interactions.
df_inter.to_csv(f"{dataset_dir}/interactions.csv", index=False)

# ==========================================
# 5. Save the processed item metadata (item_text.csv)
# ==========================================
final_valid_items = set(data_review_clean['asin'].unique())

# Keep only items that survived filtering and attach their mapped integer ID.
df_meta_clean = (
    movies[movies['asin'].isin(final_valid_items)]
    .assign(item_id=lambda meta: meta['asin'].map(item2id))
    .dropna(subset=['item_id'])
    .astype({'item_id': int})
)

# Fix the column order and sort the rows by item_id.
df_meta_clean = (
    df_meta_clean[['item_id', 'title', 'description', 'category']]
    .sort_values('item_id')
    .reset_index(drop=True)
)

df_meta_clean.to_csv(f"{dataset_dir}/item_text.csv", index=False)
print("Saved item_text.csv")

# ==========================================
# 6. Dataset split (Leave-One-Out)
# ==========================================
print("Splitting dataset (LOO)...")

# Collect each user's item sequence in the row order of df_inter.
user2seq = {}
for uid, iid in zip(df_inter['user_id'], df_inter['item_id']):
    if uid not in user2seq:
        user2seq[uid] = []
    user2seq[uid].append(iid)

train_rows, valid_rows, test_rows = [], [], []
# (destination list, position of the held-out target counted from the end):
#   train -> history seq[:-3], target seq[-3]
#   valid -> history seq[:-2], target seq[-2]
#   test  -> history seq[:-1], target seq[-1]
splits = ((train_rows, -3), (valid_rows, -2), (test_rows, -1))

for user, seq in user2seq.items():
    # Skip sequences that are too short to split.
    if len(seq) < 4:
        continue
    for rows, target_pos in splits:
        history = seq[:target_pos]
        rows.append([user, history, seq[target_pos], len(history)])

# Convert to DataFrames.
cols = ['user_id', 'seq', 'next', 'len_seq']
train_df, valid_df, test_df = (
    pd.DataFrame(split_rows, columns=cols)
    for split_rows in (train_rows, valid_rows, test_rows)
)

# Save as .df (pickle).
train_df.to_pickle(f"{dataset_dir}/train_data.df")
valid_df.to_pickle(f"{dataset_dir}/val_data.df")
test_df.to_pickle(f"{dataset_dir}/test_data.df")

# ==========================================
# 7. Write summary statistics (_info.csv & data_statis.df)
# ==========================================
num_users, num_items = len(user2id), len(item2id)
total_interactions = len(df_inter)

# Sparsity is one minus the ratio of interactions to user x item pairs.
density = total_interactions / (num_users * num_items)
sparsity = 1 - density

info = {
    "num_users": num_users,
    "num_items": num_items,
    "total_interactions": total_interactions,
    "sparsity": sparsity,
    "train_samples": len(train_df),
    "valid_samples": len(valid_df),
    "test_samples": len(test_df)
}

# One-row CSV holding the full summary.
info_df = pd.DataFrame([info])
info_df.to_csv(f"{dataset_dir}/{args.dataset}_info.csv", index=False)

# One-row pickled DataFrame holding just the item and user counts.
statis_df = pd.DataFrame([{"item_num": num_items, "user_num": num_users}])
statis_df.to_pickle(f"{dataset_dir}/data_statis.df")

print("Processing Done!")
print(info)

Start processing ml-1m...
Loading data...
Raw ratings loaded: 1000209
Applying iterative filter...
Ratings after filter: 999611
Saved item_text.csv
Splitting dataset (LOO)...
Processing Done!
{'num_users': 6040, 'num_items': 3416, 'total_interactions': 999611, 'sparsity': 0.9515519584503, 'train_samples': 6040, 'valid_samples': 6040, 'test_samples': 6040}
