In [1]:
import json
import os
import random
import re
import time

import torch
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm  # progress bar for the evaluation loop
from transformers import AutoModelForCausalLM, AutoTokenizer

# Data-loading helper
def read_data(file_path):
    """
    Load a JSON data file and split its records into parallel lists.

    The file is expected to hold an iterable of records, each providing the
    'input', 'reference', 'output' and 'label' keys.

    Returns a tuple ``(inputs, references, outputs, labels)`` where every
    element is a list with one entry per record, in file order.
    """
    print(f"Reading data from {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    def column(field):
        # Gather a single field across all records, preserving record order.
        return [record[field] for record in records]

    return column('input'), column('reference'), column('output'), column('label')

# Main program
def main(file_path1, file_path2, threshold, model_id, hf_token, device):
    """
    Label merged samples with a ROUGE-L threshold and score the predictions.

    Both JSON files are loaded with ``read_data``, concatenated and shuffled.
    For every sample the reference and the output are tokenized, truncated to
    their first 50 tokens and compared with ROUGE-L. An F-measure strictly
    above ``threshold`` predicts label 0 (infringement); anything else
    predicts label 1 (non-infringement).

    Args:
        file_path1: Path of the first JSON data file.
        file_path2: Path of the second JSON data file.
        threshold: ROUGE-L F-measure cut-off used for the prediction.
        model_id: Unused by this function (no model is loaded); accepted so
            existing callers keep working.
        hf_token: Unused, see ``model_id``.
        device: Unused, see ``model_id``.

    Returns:
        Tuple ``(f1, acc, avg_time)``: macro-averaged F1, accuracy, and the
        mean scoring time per sample in seconds.

    Raises:
        ValueError: If the two files contain no samples in total.
    """
    # Load both data files
    inputs1, references1, outputs1, labels1 = read_data(file_path1)
    inputs2, references2, outputs2, labels2 = read_data(file_path2)

    # Merge the two sources into per-sample tuples
    merged_data = list(zip(
        inputs1 + inputs2,
        references1 + references2,
        outputs1 + outputs2,
        labels1 + labels2,
    ))
    if not merged_data:
        # Fail early with a clear message instead of an opaque unpacking
        # error or a division by zero when averaging below.
        raise ValueError(
            f"No samples found in {file_path1!r} and {file_path2!r}"
        )

    # Shuffle in place
    random.shuffle(merged_data)

    num_samples = len(merged_data)
    truelabels = [label for _, _, _, label in merged_data]

    max_tokens = 50  # only the first tokens of each text are compared
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    # NOTE(review): `_tokenizer` and `_score_lcs` are private rouge_score
    # APIs, presumably used so the token lists can be truncated before
    # scoring; re-check them when upgrading rouge_score.
    tokenize = scorer._tokenizer.tokenize

    total_time = 0.0
    generate_labels = []

    for _, reference, output, _ in tqdm(merged_data, total=num_samples, desc="Processing data"):
        # perf_counter is monotonic and high-resolution, which suits these
        # millisecond-scale measurements better than time.time().
        step_start = time.perf_counter()

        reference_tokens = tokenize(reference)[:max_tokens]
        output_tokens = tokenize(output)[:max_tokens]
        rouge_l = rouge_scorer._score_lcs(reference_tokens, output_tokens).fmeasure

        # 0 = infringement, 1 = non-infringement
        generate_labels.append(0 if rouge_l > threshold else 1)

        elapsed = time.perf_counter() - step_start
        # tqdm.write keeps the progress bar intact; a bare print() would
        # interleave with it and break the bar across many lines.
        tqdm.write(f"Processing time for this step: {elapsed:.4f} seconds")
        total_time += elapsed

    # Evaluation metrics
    f1 = f1_score(truelabels, generate_labels, average='macro')
    acc = accuracy_score(truelabels, generate_labels)

    # Mean scoring time per sample
    avg_time = total_time / num_samples
    print(f"Average processing time per sample: {avg_time:.4f} seconds")

    return f1, acc, avg_time

# Decision threshold applied to the ROUGE-L F-measure
threshold = 0.22222222222222224

# Data file paths
file_path1 = './extra_10.infringement.json'
file_path2 = './extra_10.non_infringement.json'

# Hugging Face model ID and API token
model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # replace with your model ID
# SECURITY: never hard-code API tokens in source. The token is read from the
# HF_TOKEN environment variable instead (None when unset). The token that was
# previously committed here must be considered leaked and should be revoked.
hf_token = os.environ.get("HF_TOKEN")

# Use a single GPU when available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Run the evaluation
f1, acc, avg_time = main(file_path1, file_path2, threshold, model_id, hf_token, device)
print(f"F1 Score: {f1}")
print(f"Accuracy: {acc}")
print(f"Average Processing Time per Sample: {avg_time:.4f} seconds")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda:0
Reading data from ./extra_10.infringement.json...
Reading data from ./extra_10.non_infringement.json...


Processing data:  10%|▉         | 45/464 [00:00<00:00, 446.44it/s]

Processing time for this step: 0.0030 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0018 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0018 seconds
Processing time for this step: 0.0022 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0063 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0

Processing data:  19%|█▉        | 90/464 [00:00<00:00, 431.89it/s]

Processing time for this step: 0.0024 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0024 seconds
Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0024 seconds
Processing time for this step: 0.0011 seconds
Processing time for this step: 0.0024 seconds


Processing data:  29%|██▉       | 136/464 [00:00<00:00, 442.60it/s]

Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0052 seconds
Processing time for this step: 0.0028 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0028 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0024 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0018 seconds
Processing time for this step: 0.0018 seconds
Processing time for this step: 0.0022 seconds
Processing time for this step: 0.0

Processing data:  39%|███▉      | 181/464 [00:00<00:00, 437.86it/s]

Processing time for this step: 0.0030 seconds
Processing time for this step: 0.0028 seconds
Processing time for this step: 0.0022 seconds
Processing time for this step: 0.0011 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0018 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0077 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0022 seconds


Processing data:  48%|████▊     | 225/464 [00:00<00:00, 433.74it/s]

Processing time for this step: 0.0022 seconds
Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0029 seconds
Processing time for this step: 0.0018 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0016 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0

Processing data:  58%|█████▊    | 270/464 [00:00<00:00, 433.35it/s]

Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0028 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0024 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0027 seconds


Processing data:  77%|███████▋  | 358/464 [00:00<00:00, 427.65it/s]

Processing time for this step: 0.0028 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0022 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0015 seconds
Processing time for this step: 0.0016 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0024 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0016 seconds
Processing time for this step: 0.0022 seconds
Processing time for this step: 0.0024 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0

Processing data:  96%|█████████▌| 446/464 [00:01<00:00, 431.51it/s]

Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0024 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0027 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0023 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0017 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0026 seconds
Processing time for this step: 0.0025 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0019 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0020 seconds
Processing time for this step: 0.0021 seconds
Processing time for this step: 0.0

Processing data: 100%|██████████| 464/464 [00:01<00:00, 429.84it/s]

Average processing time per sample: 0.0023 seconds
F1 Score: 1.0
Accuracy: 1.0
Average Processing Time per Sample: 0.0023 seconds



