In [1]:
import json
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer
import random
import time
import torch
from tqdm import tqdm
import re
from rouge_score import rouge_scorer
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

def load_large_model(model_name, hf_token):
    """Load a large model across multiple GPUs"""
    print("Loading model across multiple GPUs...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="balanced",  # Automatically distribute across multiple GPUs
        offload_folder="offload",  # Offload to disk if memory is insufficient
        offload_state_dict=True,
        output_hidden_states=True,
        use_auth_token=hf_token
    )
    print("Model loaded successfully.")
    return model

def read_data(file_path):
    """Read data from file and return inputs, references, outputs, and labels."""
    print(f"Reading data from {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    inputs = [item['input'] for item in data]
    references = [item['reference'] for item in data]
    outputs = [item['output'] for item in data]
    labels = [item['label'] for item in data]
    return inputs, references, outputs, labels

def main(file_path1, file_path2, threshold, model_id, hf_token):
    """Main program to perform label prediction and evaluation."""
    # Load tokenizer and large model with multi-GPU support
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token)
    tokenizer.pad_token = tokenizer.eos_token
    model = load_large_model(model_id, hf_token)

    # Read data from files
    inputs1, references1, outputs1, labels1 = read_data(file_path1)
    inputs2, references2, outputs2, labels2 = read_data(file_path2)

    merged_inputs = inputs1 + inputs2
    merged_references = references1 + references2
    merged_outputs = outputs1 + outputs2
    merged_labels = labels1 + labels2

    merged_data = list(zip(merged_inputs, merged_references, merged_outputs, merged_labels))
    random.shuffle(merged_data)
    inputs, references, outputs, truelabels = zip(*merged_data)

    total_time = 0
    num_samples = len(inputs)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    generate_labels = []

    for input_text, reference, output, truelabel in tqdm(zip(inputs, references, outputs, truelabels), total=len(inputs), desc="Processing data"):
        start_time = time.time()

        messages = [{
            "role": "user",
            "content": f"""
            Please only output 'label: 0' or 'label: 1', nothing else. Do not provide intermediate steps or explanations.
            Input text: '{input_text}' (Context for the next sentence)
            """
        }]

        formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True).to(next(model.parameters()).device)

        generated_output = model.generate(
            input_ids=inputs["input_ids"],
            max_new_tokens=10,
            num_return_sequences=1,
            repetition_penalty=1.1,
            temperature=0.0,
            top_p=1.0,
            do_sample=False
        )

        predicted_labels = tokenizer.decode(generated_output[0], skip_special_tokens=True).strip().split(":")[-1].strip()
        final_label_int = int(predicted_labels) if predicted_labels in ['0', '1'] else random.choice([0, 1])
        generate_labels.append(final_label_int)

        total_time += time.time() - start_time

    f1 = f1_score(truelabels, generate_labels, average='macro')
    acc = accuracy_score(truelabels, generate_labels)
    avg_time = total_time / num_samples

    print(f"F1 Score: {f1}")
    print(f"Accuracy: {acc}")
    print(f"Average Processing Time per Sample: {avg_time:.4f} seconds")
    return f1, acc, avg_time

# Parameters
threshold = 0.22222222222222224
file_path1 = '/raid/data/guangwei/copyright_newVersion/test_division/literal.non_infringement.json'
file_path2 = '/raid/data/guangwei/copyright_newVersion/test_division/literal.infringement.json'
model_id = "meta-llama/Llama-3.1-8B-Instruct"
hf_token = "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"

f1, acc, avg_time = main(file_path1, file_path2, threshold, model_id, hf_token)


  from .autonotebook import tqdm as notebook_tqdm


Loading tokenizer...




Loading model across multiple GPUs...


Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.46s/it]


Model loaded successfully.
Reading data from /raid/data/guangwei/copyright_newVersion/test_division/literal.non_infringement.json...
Reading data from /raid/data/guangwei/copyright_newVersion/test_division/literal.infringement.json...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Processing data:   0%|          | 1/758 [00:01<16:06,  1.28s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing data:   0%|          | 2/758 [00:01<11:40,  1.08it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attentio

F1 Score: 0.43684992570579495
Accuracy: 0.7757255936675461
Average Processing Time per Sample: 0.6597 seconds



