生成数据：请按顺序依次运行以下各单元格。

In [1]:
import pandas as pd

# Load the raw training records; the code below reads the user_id, item_id and score columns.
data = pd.read_csv("./train.csv")

def are_strictly_ordered(user1_scores, user2_scores):
    """
    Tell whether one user dominates the other on every item they share.

    Both arguments are DataFrames with ``item_id`` and ``score`` columns.
    Returns True when, over the shared items, user1's scores are all >= user2's
    or all <= user2's (ties are allowed). Returns False when the two users
    share no item.
    """
    shared_items = set(user1_scores["item_id"]) & set(user2_scores["item_id"])
    if len(shared_items) == 0:
        # Nothing to compare on, so no ordering can be established.
        return False

    def _scores_on_shared(frame):
        # Keep only the shared items and key the scores by item_id.
        on_shared = frame["item_id"].isin(shared_items)
        return frame[on_shared].set_index("item_id")["score"]

    # Line the two score series up item by item before comparing.
    left, right = _scores_on_shared(user1_scores).align(
        _scores_on_shared(user2_scores), join="inner"
    )

    if all(left >= right):
        return True
    return all(left <= right)

def assign_groups_with_strict_ordering(data, max_group_size=10):
    """
    Greedily partition users into groups whose members are pairwise ordered.

    Users are visited in the order of their first appearance in ``data``.
    Each user that is not yet assigned seeds a new group; every other
    unassigned user is then added to it when ``are_strictly_ordered`` holds
    between that user and every current member, until the group holds
    ``max_group_size`` users.

    Args:
        data: DataFrame with ``user_id``, ``item_id`` and ``score`` columns.
        max_group_size: Maximum number of users placed in one group.

    Returns:
        dict mapping every user_id to its group number (numbered from 1).
        Only group numbers are produced here; no per-user ranking is computed.
    """
    user_ids = data["user_id"].unique()

    # Slice the frame once per user with a single groupby pass instead of
    # re-filtering the whole frame for every user.
    per_user = {
        uid: frame[["item_id", "score"]]
        for uid, frame in data.groupby("user_id", sort=False)
    }
    # groupby drops missing keys; such users get an empty score table, which
    # is what filtering on equality would have produced for them.
    no_rows = data.iloc[0:0][["item_id", "score"]]
    user_scores = {uid: per_user.get(uid, no_rows) for uid in user_ids}

    assigned_users = set()  # users already placed in a group
    groupid_mapping = {user_id: 0 for user_id in user_ids}  # user -> group number
    group_id = 1  # number of the group currently being built

    for user_id in user_ids:
        if user_id in assigned_users:
            continue

        # Seed a new group with this user.
        current_group = [user_id]
        assigned_users.add(user_id)

        for candidate_id in user_ids:
            # Check capacity BEFORE considering a candidate so the limit also
            # holds when max_group_size is 1 (or smaller).
            if len(current_group) >= max_group_size:
                break

            if candidate_id in assigned_users:
                continue

            # The candidate must be ordered against every current member.
            if all(
                are_strictly_ordered(user_scores[member_id], user_scores[candidate_id])
                for member_id in current_group
            ):
                current_group.append(candidate_id)
                assigned_users.add(candidate_id)

        # Record the group number for every member.
        for member_id in current_group:
            groupid_mapping[member_id] = group_id

        group_id += 1

    return groupid_mapping


# Compute the user -> group mapping and attach it to every row.
groupid_mapping = assign_groups_with_strict_ordering(data)
data["group_id"] = data["user_id"].map(groupid_mapping)

# One row per (user, group), ordered by group then user; the 1-based position
# in that ordering becomes the user's global fairness_id.
unique_users = (
    data[["user_id", "group_id"]]
    .drop_duplicates()
    .sort_values(by=["group_id", "user_id"])
    .reset_index(drop=True)
    .assign(fairness_id=lambda users: range(1, len(users) + 1))
)

# Bring fairness_id back onto the full table and order the rows by it.
fairness_lookup = unique_users[["user_id", "fairness_id"]]
data = (
    data.merge(fairness_lookup, on="user_id", how="left")
    .sort_values(by="fairness_id")
    .reset_index(drop=True)
)

# Persist the result for the next step.
output_path = "./train1.csv"
data.to_csv(output_path, index=False)

print(f"分组完成，结果已保存到 {output_path}")


分组完成，结果已保存到 ./train1.csv


In [6]:
import pandas as pd

# Reload the table written by the grouping step.
data = pd.read_csv("./train1.csv")

# Number of distinct users in each group.
group_user_count = data.groupby('group_id')['user_id'].nunique()

# Groups that contain exactly one user (the test below is == 1, not == 0).
is_single_user = group_user_count == 1
empty_groups = group_user_count.index[is_single_user]

# Relabel those single-user groups as group 0; other group ids are untouched.
in_single_user_group = data['group_id'].isin(empty_groups)
data['group_id'] = data['group_id'].mask(in_single_user_group, 0)

# Persist the result for the next step.
data.to_csv("./train2.csv", index=False)


In [7]:
import pandas as pd

# Reload the table written by the previous step.
data = pd.read_csv("./train2.csv")

# For every group_id, collect the distinct user_ids that belong to it.
group_user_mapping = data.groupby("group_id")["user_id"].unique().to_dict()

# New column get_group: the member user_ids of the row's group.
data["get_group"] = data["group_id"].map(group_user_mapping)

# Rows with group_id == 0 do not get the real member list; they get the
# one-element placeholder list [5000] instead (not an empty list).
ungrouped_rows = data["group_id"] == 0
if ungrouped_rows.any():
    # Guarded so that a table without any group-0 row is left untouched.
    data.loc[ungrouped_rows, "get_group"] = data[ungrouped_rows].apply(lambda _: [5000], axis=1)

# Persist the result for the next step.
data.to_csv("./train3.csv", index=False)

# The message now names the file that is actually written.
print("数据已保存到 ./train3.csv 文件中！")


数据已保存到 output_with_group.csv 文件中！


In [8]:
import csv

# Pass 1: for every group_id record the smallest fairness_id seen and the
# group size taken from the first row of that group.
group_info = {}

# newline='' is what the csv module asks for on files it reads or writes;
# UTF-8 matches the default encoding pandas used to write this file.
with open('./train3.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        group_id = row['group_id']
        fairness_id = int(row['fairness_id'])

        info = group_info.get(group_id)
        if info is None:
            # get_group is text such as "[a b c]"; the size is the number of
            # whitespace-separated tokens between the brackets.
            # NOTE(review): presumably the text form written by the previous step - confirm.
            group = row['get_group'].strip('[]').split()
            group_info[group_id] = {'min_fairness_id': fairness_id, 'group_size': len(group)}
        else:
            info['min_fairness_id'] = min(info['min_fairness_id'], fairness_id)

# Pass 2: copy every row and append the two per-group values as new columns.
with open('./train3.csv', 'r', newline='', encoding='utf-8') as infile, \
        open('./train4.csv', 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames + ['groupindex', 'group_size']
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    writer.writeheader()
    for row in reader:
        info = group_info[row['group_id']]
        row['groupindex'] = info['min_fairness_id']  # smallest fairness_id in the group
        row['group_size'] = info['group_size']
        writer.writerow(row)

In [9]:
import csv
import random

# Read the table and bucket its rows by group_id (first-seen group order is kept).
grouped_data = {}

with open('./train4.csv', 'r', newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    # Capture the header while the file is still open instead of touching the
    # reader after its file has been closed.
    fieldnames = reader.fieldnames
    for row in reader:
        grouped_data.setdefault(row['group_id'], []).append(row)

# Shuffle the rows inside each group; the groups themselves stay contiguous.
# NOTE(review): no random seed is set, so the row order differs between runs.
for rows in grouped_data.values():
    random.shuffle(rows)

# Write the groups back out, one after another, with the same columns.
with open('./train5.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    writer.writeheader()
    for rows in grouped_data.values():
        writer.writerows(rows)

In [11]:
import pandas as pd
import ast

# Load the shuffled training table and the item metadata.
df = pd.read_csv('./train5.csv')
item_df = pd.read_csv('./item.csv')  # must provide 'item_id' and 'knowledge_code' columns

# item_id -> knowledge_code lookup.
item_to_knowledge = item_df.set_index('item_id')['knowledge_code'].to_dict()

# Drop the group_id == 0 rows. The explicit copy makes df an independent
# frame, so adding a column below does not write into a slice of the original
# (which would raise SettingWithCopyWarning).
df = df[df['group_id'] != 0].copy()

# group_id -> sorted list of knowledge points of the group's repeated items.
group_knowledge = {}

for group_id, group in df.groupby('group_id'):
    # 1. Items that occur more than once inside the group.
    item_ids = group['item_id']
    duplicated_items = item_ids[item_ids.duplicated()].unique()

    # 2. Collect the raw knowledge_code values of those items (deduplicated).
    knowledge_points = set()
    for item_id in duplicated_items:
        if item_id in item_to_knowledge:
            knowledge_codes = item_to_knowledge[item_id]
            if isinstance(knowledge_codes, list):
                knowledge_points.update(knowledge_codes)
            else:
                knowledge_points.add(knowledge_codes)

    # 3. Each raw value is expected to be the text of a Python list; parse it
    #    with ast.literal_eval and merge the elements into one set.
    knowledge_points_cleaned = set()
    for kp in knowledge_points:
        try:
            kp_list = ast.literal_eval(kp)
        except (ValueError, SyntaxError):
            # Deliberate best effort: values that are not valid literals are skipped.
            continue
        if isinstance(kp_list, list):  # values that parse to a non-list are ignored
            knowledge_points_cleaned.update(kp_list)

    # 4. Store the deduplicated knowledge points in sorted order.
    group_knowledge[group_id] = sorted(knowledge_points_cleaned)

# Attach each group's knowledge-point list to all of its rows.
df['common_knowledge'] = df['group_id'].map(group_knowledge)

# Persist the final training table.
df.to_csv('./trainfinial.csv', index=False)

print("处理完成，结果已保存到 ./trainfinial.csv")


处理完成，结果已保存到 ./trainfinial.csv


test集合修正


In [12]:
import pandas as pd

# Load the final training table (source of the mapping) and the raw test split.
mapping_data = pd.read_csv("./trainfinial.csv")
test_data = pd.read_csv("./test.csv")

# user_id -> fairness_id lookup taken from the training table.
user_to_fairness = mapping_data.set_index("user_id")["fairness_id"].to_dict()

# Attach fairness_id to every test row; users missing from the lookup get NaN.
test_fairness = test_data["user_id"].map(user_to_fairness)
test_data = test_data.assign(fairness_id=test_fairness)

# Persist the annotated test split.
test_data.to_csv("./test_finial.csv", index=False)


valid集合修正


In [None]:
import pandas as pd

# Load the final training table (source of the mapping) and the raw validation split.
# The variable is named test_data, but in this cell it holds ./valid.csv.
mapping_data = pd.read_csv("./trainfinial.csv")
test_data = pd.read_csv("./valid.csv")

# user_id -> fairness_id lookup taken from the training table.
user_to_fairness = mapping_data.set_index("user_id")["fairness_id"].to_dict()

# Attach fairness_id to every validation row; users missing from the lookup get NaN.
valid_fairness = test_data["user_id"].map(user_to_fairness)
test_data = test_data.assign(fairness_id=valid_fairness)

# Persist the annotated validation split.
test_data.to_csv("./valid_finial.csv", index=False)

运行下面一段，以修复 test 集合和 valid 集合（注意：下面的代码只读取并覆盖 test_finial.csv；修复 valid 集合时，需将文件名换成 valid_finial.csv 后再运行一次）。



In [None]:
import pandas as pd

# Load the final training table and the annotated test split.
# NOTE(review): only test_finial.csv is processed here; valid_finial.csv is not touched by this cell.
train_data = pd.read_csv("trainfinial.csv")
test_data = pd.read_csv("test_finial.csv")

# Distinct (user_id, fairness_id) pairs of each table.
train_mapping = train_data[['user_id', 'fairness_id']].drop_duplicates()
test_mapping = test_data[['user_id', 'fairness_id']].drop_duplicates()

# Put the test pair next to the train pair of the same user.
merged_mapping = pd.merge(
    test_mapping, train_mapping, on='user_id', suffixes=('_test', '_train'), how='left'
)

# Users whose test mapping cannot be confirmed by the training table.
inconsistent_user_ids = merged_mapping[
    (merged_mapping['fairness_id_train'].isna()) |  # user absent from train
    (merged_mapping['fairness_id_test'] != merged_mapping['fairness_id_train'])  # ids differ
]['user_id']

# Drop every test row that belongs to such a user.
filtered_test_data = test_data[~test_data['user_id'].isin(inconsistent_user_ids)]

# The filtered table overwrites test_finial.csv in place; the message now
# names the file that is actually written.
filtered_test_data.to_csv("test_finial.csv", index=False)
print("Filtered test data saved to 'test_finial.csv'")
