# In [None]:
import json
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer
import random
import time
import torch
from tqdm import tqdm 
import re
from rouge_score import rouge_scorer

def read_data(file_path):
    """Load a JSON list of records and split it into per-field columns.

    Each record must provide the keys ``input``, ``reference``, ``output``
    and ``label``; a missing key raises ``KeyError``.

    Args:
        file_path: Path of a UTF-8 encoded JSON file holding a list of records.

    Returns:
        A 4-tuple of lists ``(inputs, references, outputs, labels)``, each in
        the same order as the records in the file.
    """
    print(f"Reading data from {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # One full pass per field, in this order, so the columns stay aligned
    # by record index.
    fields = ('input', 'reference', 'output', 'label')
    return tuple([record[field] for record in records] for field in fields)

def main(file_path1, file_path2, threshold, model_id, hf_token, device, max_tokens=50):
    """Label merged samples by thresholding ROUGE-L and score the predictions.

    Both files are loaded with ``read_data``, concatenated and shuffled. For
    every sample the ``reference`` and ``output`` texts are tokenized, cut to
    their first ``max_tokens`` tokens, and compared with the ROUGE-L
    F-measure. The predicted label is 0 when that score is strictly greater
    than ``threshold`` and 1 otherwise. Predictions are then compared with
    the labels stored in the files.

    Args:
        file_path1: Path of the first JSON dataset.
        file_path2: Path of the second JSON dataset.
        threshold: ROUGE-L F-measure cutoff separating label 0 from label 1.
        model_id: Not used by this function; kept so existing callers work.
        hf_token: Not used by this function; kept so existing callers work.
        device: Not used by this function; kept so existing callers work.
        max_tokens: Number of leading tokens of each text that take part in
            the comparison. The default matches the previous hard-coded 50.

    Returns:
        A tuple ``(f1, acc, avg_time)``: macro-averaged F1, accuracy, and the
        mean per-sample scoring time in seconds.

    Raises:
        ValueError: If the two files together contain no samples.
    """
    inputs1, references1, outputs1, labels1 = read_data(file_path1)
    inputs2, references2, outputs2, labels2 = read_data(file_path2)

    merged_data = list(zip(
        inputs1 + inputs2,
        references1 + references2,
        outputs1 + outputs2,
        labels1 + labels2,
    ))
    if not merged_data:
        # Fail with a clear message instead of an opaque unpacking error
        # (or a division by zero when averaging the timings).
        raise ValueError(
            f"No samples found in {file_path1!r} or {file_path2!r}; nothing to evaluate."
        )
    random.shuffle(merged_data)

    # The inputs column is not needed for scoring.
    _, references, outputs, truelabels = zip(*merged_data)
    num_samples = len(merged_data)

    # NOTE(review): the scorer is only used for its tokenizer; both
    # `_tokenizer` and `_score_lcs` are private rouge_score APIs and may
    # change between releases.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    tokenize = scorer._tokenizer.tokenize

    total_time = 0.0
    generate_labels = []

    for reference, output in tqdm(zip(references, outputs), total=num_samples, desc="Processing data"):
        # perf_counter is monotonic and high-resolution, which matters for
        # the very short durations measured here.
        step_start = time.perf_counter()

        reference_tokens = tokenize(reference)[:max_tokens]
        output_tokens = tokenize(output)[:max_tokens]

        rouge_l = rouge_scorer._score_lcs(reference_tokens, output_tokens).fmeasure
        generate_labels.append(0 if rouge_l > threshold else 1)

        elapsed = time.perf_counter() - step_start
        # tqdm.write keeps the message from overwriting the progress bar.
        tqdm.write(f"Processing time for this step: {elapsed:.4f} seconds")
        total_time += elapsed

    f1 = f1_score(truelabels, generate_labels, average='macro')
    acc = accuracy_score(truelabels, generate_labels)

    avg_time = total_time / num_samples
    print(f"Average processing time per sample: {avg_time:.4f} seconds")

    return f1, acc, avg_time

# Driver script: evaluates the ROUGE-L threshold classifier implemented in
# main() on two JSON datasets and prints the resulting metrics.

# ROUGE-L F-measure cutoff handed to main(): a score strictly above it yields
# predicted label 0, anything else label 1.
# NOTE(review): the exact value looks like the result of an earlier tuning
# run — confirm where it comes from.
threshold = 0.22222222222222224

# The two datasets that main() merges and shuffles before scoring.
# NOTE(review): judging by the file names these hold infringing and
# non-infringing samples respectively — confirm against the data.
file_path1 = './extra_10.infringement.json'
file_path2 = './extra_10.non_infringement.json'

# model_id and hf_token are passed to main(), but the main() visible in this
# file never reads them.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"  
hf_token = ""  

# Use the first CUDA GPU when one is available, otherwise the CPU. The device
# is also passed to main(), which does not use it in the visible code.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Run the evaluation and report macro F1, accuracy and mean per-sample time.
f1, acc, avg_time = main(file_path1, file_path2, threshold, model_id, hf_token, device)
print(f"F1 Score: {f1}")
print(f"Accuracy: {acc}")
print(f"Average Processing Time per Sample: {avg_time:.4f} seconds")
