In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict
import os

def build_index(df):
    """
    Build contiguous zero-based index mappings for users, items and categories.

    For each of the ``user_id``, ``item_id`` and ``category_id`` columns the
    distinct values are sorted in ascending order and numbered 0, 1, 2, ...
    The first ten entries of every mapping are printed as a quick sanity check.

    Args:
        df: DataFrame with ``user_id``, ``item_id`` and ``category_id`` columns.

    Returns:
        Tuple ``(user_index, item_index, category_index)``; each element is a
        dict mapping an original id to its new index.
    """
    def _enumerate_sorted(column):
        # Position in the sorted list of distinct values becomes the new index.
        ordered = sorted(df[column].unique())
        return dict(zip(ordered, range(len(ordered))))

    user_index = _enumerate_sorted('user_id')
    item_index = _enumerate_sorted('item_id')
    category_index = _enumerate_sorted('category_id')

    samples = (
        ("User Index Sample:", user_index),
        ("Item Index Sample:", item_index),
        ("Category Index Sample:", category_index),
    )
    for label, mapping in samples:
        print(label, list(mapping.items())[:10])

    return user_index, item_index, category_index

def apply_index_mapping(df, user_map, item_map, category_map):
    """
    Replace the raw user_id, item_id and category_id values with their indices.

    The three columns are overwritten on ``df`` itself (the caller's frame is
    modified), and the same object is returned for convenience.

    Args:
        df: DataFrame with ``user_id``, ``item_id`` and ``category_id`` columns.
        user_map: dict mapping original user id -> new index.
        item_map: dict mapping original item id -> new index.
        category_map: dict mapping original category id -> new index.

    Returns:
        The same DataFrame object, with the three id columns re-mapped.
    """
    column_mappings = (
        ('user_id', user_map),
        ('item_id', item_map),
        ('category_id', category_map),
    )
    for column, mapping in column_mappings:
        df[column] = df[column].map(mapping)
    return df

def process_behavior_data(file_path, output_dir):
    """
    Load raw behavior logs, re-index the ids and build a leave-one-out split.

    Steps:
      1. Read the header-less CSV at ``file_path`` with columns
         ``user_id, item_id, category_id, behavior_type, timestamp``.
      2. Map user/item/category ids to contiguous zero-based indices and save
         the mappings as ``user_index.csv``, ``item_index.csv`` and
         ``category_index.csv`` inside ``output_dir``.
      3. Keep the first row (in file order) of every
         ``(user_id, item_id, behavior_type)`` triple, order the rows by
         timestamp, and for each ``(user, behavior)`` pair use the last item as
         the test item and every earlier item as training data.

    Args:
        file_path: Path of the input CSV (no header row).
        output_dir: Directory that receives the index CSVs; created if missing.

    Returns:
        Tuple ``(behavior_groups, item_category_pairs)``:
        ``behavior_groups`` maps each behavior type in 1..4 to
        ``{'train': [...], 'test': [...]}`` where every entry is a string
        ``"<user> <item> [<item> ...]"``; ``item_category_pairs`` is a
        DataFrame holding the unique re-indexed ``(item_id, category_id)`` rows.
    """
    df = pd.read_csv(file_path, header=None,
                     names=['user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp'])

    # Build the index mappings and rewrite the id columns with them.
    user_map, item_map, category_map = build_index(df)
    df = apply_index_mapping(df, user_map, item_map, category_map)

    # The mappings are written here, before save_data_files() gets a chance to
    # create the directory, so make sure it exists first.
    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(list(user_map.items()), columns=['original_id', 'new_id']).to_csv(f"{output_dir}/user_index.csv", index=False)
    pd.DataFrame(list(item_map.items()), columns=['original_id', 'new_id']).to_csv(f"{output_dir}/item_index.csv", index=False)
    pd.DataFrame(list(category_map.items()), columns=['original_id', 'new_id']).to_csv(f"{output_dir}/category_index.csv", index=False)

    # Item -> category relation, taken before interactions are de-duplicated.
    item_category_pairs = df[['item_id', 'category_id']].drop_duplicates()

    # One row per (user, item, behavior), then chronological order per user/behavior.
    df = df.drop_duplicates(subset=['user_id', 'item_id', 'behavior_type'])
    df = df.sort_values(['user_id', 'behavior_type', 'timestamp'])

    train_data = defaultdict(list)
    test_data = defaultdict(list)

    for (user_id, behavior_type), group in df.groupby(['user_id', 'behavior_type']):
        items = group['item_id'].tolist()
        if not items:
            continue

        # Leave-one-out: the most recent item is held out for testing.
        *train_items, test_item = items

        if train_items:
            train_data[behavior_type].append(f"{user_id} {' '.join(map(str, train_items))}")

        # Always emit the held-out item. Item indices start at 0, so a
        # truthiness check here would silently drop the item mapped to index 0.
        test_data[behavior_type].append(f"{user_id} {test_item}")

    # NOTE(review): only behavior types 1..4 are exported; this assumes the
    # behavior_type column holds those integer codes -- confirm against the data.
    behavior_groups = {
        behavior_type: {
            'train': train_data[behavior_type],
            'test': test_data[behavior_type]
        }
        for behavior_type in range(1, 5)
    }

    return behavior_groups, item_category_pairs

def save_data_files(behavior_groups, item_category_pairs, output_dir):
    """
    Write the processed data to text files under ``output_dir``.

    Files produced:
      * ``item_category.txt`` -- one ``"<item_id> <category_id>"`` line per row
        of ``item_category_pairs``.
      * ``train_behavior_<type>.txt`` / ``test_behavior_<type>.txt`` -- one line
        per entry of the corresponding ``'train'`` / ``'test'`` list, with runs
        of whitespace collapsed to single spaces.

    Args:
        behavior_groups: dict mapping behavior type -> ``{'train': [...], 'test': [...]}``
            where each list holds whitespace-separated strings.
        item_category_pairs: DataFrame with ``item_id`` and ``category_id`` columns.
        output_dir: Target directory; created if it does not exist.
    """
    def _write_split(target, splits, split_name):
        # Each record is re-tokenised so the output is single-space separated.
        with open(target, 'w') as handle:
            handle.writelines(
                ' '.join(record.split()) + '\n' for record in splits[split_name]
            )

    os.makedirs(output_dir, exist_ok=True)

    # Item -> category relation.
    item_category_path = f"{output_dir}/item_category.txt"
    with open(item_category_path, 'w') as handle:
        for _, row in item_category_pairs.iterrows():
            handle.write(f"{row['item_id']} {row['category_id']}\n")
    print(f"Saved: {item_category_path}")

    # Train/test interaction files, one pair per behavior type.
    for behavior_type in behavior_groups:
        splits = behavior_groups[behavior_type]
        train_path = f"{output_dir}/train_behavior_{behavior_type}.txt"
        test_path = f"{output_dir}/test_behavior_{behavior_type}.txt"

        _write_split(train_path, splits, 'train')
        _write_split(test_path, splits, 'test')

        print(f"Saved: {train_path}, {test_path}")


def main(input_file="/home/dongzhi/毕设/data/filtered_data.csv",
         output_dir="/home/dongzhi/毕设/model/mine/mine_data/ultragcn_data"):
    """
    Run the full preprocessing pipeline: process the raw log, then save the files.

    Args:
        input_file: Path of the header-less behavior CSV to process. Defaults to
            the location originally hard-coded in this notebook.
        output_dir: Directory receiving the index CSVs and the train/test text
            files. Defaults to the location originally hard-coded here.
    """
    # Seed NumPy's global RNG; kept for parity with the original entry point.
    np.random.seed(42)
    behavior_groups, item_category_pairs = process_behavior_data(input_file, output_dir)
    save_data_files(behavior_groups, item_category_pairs, output_dir)

# Entry point: run the pipeline when this code is executed directly.
if __name__ == "__main__":
    main()


User Index Sample: [(41, 0), (43, 1), (343, 2), (438, 3), (470, 4), (481, 5), (650, 6), (712, 7), (767, 8), (927, 9)]
Item Index Sample: [(72, 0), (81, 1), (116, 2), (284, 3), (285, 4), (324, 5), (433, 6), (751, 7), (764, 8), (921, 9)]
Category Index Sample: [(2171, 0), (5064, 1), (7769, 2), (8254, 3), (8409, 4), (11120, 5), (14900, 6), (16219, 7), (21059, 8), (22129, 9)]
Saved: /home/dongzhi/毕设/model/mine/mine_data/ultragcn_data/item_category.txt
Saved: /home/dongzhi/毕设/model/mine/mine_data/ultragcn_data/train_behavior_1.txt, /home/dongzhi/毕设/model/mine/mine_data/ultragcn_data/test_behavior_1.txt
Saved: /home/dongzhi/毕设/model/mine/mine_data/ultragcn_data/train_behavior_2.txt, /home/dongzhi/毕设/model/mine/mine_data/ultragcn_data/test_behavior_2.txt
Saved: /home/dongzhi/毕设/model/mine/mine_data/ultragcn_data/train_behavior_3.txt, /home/dongzhi/毕设/model/mine/mine_data/ultragcn_data/test_behavior_3.txt
Saved: /home/dongzhi/毕设/model/mine/mine_data/ultragcn_data/train_behavior_4.txt, /home/do

In [7]:
# Suppose the original `items` list looks like this:
items = [
    dict(item_id=101, timestamp=1000),
    dict(item_id=102, timestamp=1001),
    dict(item_id=103, timestamp=1002),
]

# Every entry except the last one contributes its item_id to the training list.
train_items = []
for entry in items[:-1]:
    train_items.append(entry['item_id'])
print(train_items)

[101, 102]
