In [3]:
import torch
torch.cuda.empty_cache()

my access token if you want to rerun = ZHcfFFsrftNHdCIGXgzyRosnfjZOEGNlYz

In [4]:
from huggingface_hub import login

# Log in with access token
login(token="hf_your_access_token")

In [5]:
!pip install openai transformers datasets rouge_score nltk

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b7a29eb91b69b3f7cb0d678f8f23be3ff05cd311d58949d7b7b1a818d8caf32d
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [6]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge_score import rouge_scorer
import re
import json


In [7]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Load the dataset in extractive mode
dataset = load_dataset("sobamchan/aclsum", "extractive")["test"]
print(dataset[0].keys())

README.md: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/50 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

dict_keys(['id', 'source_sentences', 'challenge_sentences', 'approach_sentences', 'outcome_sentences', 'challenge_labels', 'approach_labels', 'outcome_labels'])


- to accomodate for different model, the following function is created. The Qwen model requires as a flag trust_remote_code=True, while Llama does not have this requirement
- Llama needs to be loaded in its instruct version because when the "standard" version is unable to follow instructions properly, meaning that for example the output of the prompts might not be a JSON as requested but it might be a long text format. Therefore, we solve this by loading the Instruct version

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def load_model_and_tokenizer(model_name: str):
    if "Qwen" in model_name:
        # Qwen models require trust_remote_code=True
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
        print("Loaded Qwen model and tokenizer.")
    elif "Llama" in model_name:
        # Llama models do not require trust_remote_code
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        print("Loaded Llama model and tokenizer.")
    else:
        raise ValueError("Unsupported model name. Please use a Qwen or Llama model.")

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [9]:
print(dataset)

Dataset({
    features: ['id', 'source_sentences', 'challenge_sentences', 'approach_sentences', 'outcome_sentences', 'challenge_labels', 'approach_labels', 'outcome_labels'],
    num_rows: 100
})


In [10]:
print(dataset['challenge_labels'][0])

[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### preprocess into sentence label pair

This step is necessary as it will allow us to match the sentence itself with the label. This means that for every possible label (challenge, approach, outcome) we will store the sentences with their proper label in order to be able to evaluate the outcome accordingly.

This means that every phrase in the source_sentences section of the dataset will contain a label as well as the aspect. So, the first sentence might be assigned label 1 for aspect challenge and label 0 for aspects outcome and approach.

In [11]:
def prepare_aspect_data(example, aspect):
    return [
        {"sentence": sent, "label": lab, "aspect": aspect}
        for sent, lab in zip(example["source_sentences"], example[f"{aspect}_labels"])
    ]

sample = prepare_aspect_data(dataset[0], "challenge")
print(sample[:3])

[{'sentence': 'Handling terminology is an important matter in a translation workflow .', 'label': 1, 'aspect': 'challenge'}, {'sentence': 'However , current Machine Translation ( MT ) systems do not yet propose anything proactive upon tools which assist in managing terminological databases .', 'label': 1, 'aspect': 'challenge'}, {'sentence': 'In this work , we investigate several enhancements to analogical learning and test our implementation on translating medical terms .', 'label': 0, 'aspect': 'challenge'}]


In [12]:
sample_1 = prepare_aspect_data(dataset[0], "outcome")
print(sample_1[:3])

[{'sentence': 'Handling terminology is an important matter in a translation workflow .', 'label': 0, 'aspect': 'outcome'}, {'sentence': 'However , current Machine Translation ( MT ) systems do not yet propose anything proactive upon tools which assist in managing terminological databases .', 'label': 0, 'aspect': 'outcome'}, {'sentence': 'In this work , we investigate several enhancements to analogical learning and test our implementation on translating medical terms .', 'label': 0, 'aspect': 'outcome'}]


In [13]:
sample_2 = prepare_aspect_data(dataset[0], "approach")
print(sample_2[:3])

[{'sentence': 'Handling terminology is an important matter in a translation workflow .', 'label': 0, 'aspect': 'approach'}, {'sentence': 'However , current Machine Translation ( MT ) systems do not yet propose anything proactive upon tools which assist in managing terminological databases .', 'label': 0, 'aspect': 'approach'}, {'sentence': 'In this work , we investigate several enhancements to analogical learning and test our implementation on translating medical terms .', 'label': 1, 'aspect': 'approach'}]


To apply this to the whole dataset we can use the following code snippet

In [14]:
def add_triplets(example):
    aspects = ["challenge", "approach", "outcome"]
    # collect labels for each aspect
    aspect_labels = {a: [lab for _, lab in zip(example["source_sentences"], example[f"{a}_labels"])]
                     for a in aspects}
    # combine into triplets
    triplets = []
    for i, sent in enumerate(example["source_sentences"]):
        triplet = [aspect_labels["challenge"][i],
                   aspect_labels["approach"][i],
                   aspect_labels["outcome"][i]]
        triplets.append(triplet)
    example["triplets"] = triplets
    return example

labeled_dataset = dataset.map(add_triplets)

# quick peek
print(labeled_dataset[0]["source_sentences"][:10])
print(labeled_dataset[0]["triplets"][:10])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

['Handling terminology is an important matter in a translation workflow .', 'However , current Machine Translation ( MT ) systems do not yet propose anything proactive upon tools which assist in managing terminological databases .', 'In this work , we investigate several enhancements to analogical learning and test our implementation on translating medical terms .', 'We show that the analogical engine works equally well when translating from and into a morphologically rich language , or when dealing with language pairs written in different scripts .', 'Combining it with a phrasebased statistical engine leads to significant improvements .', 'If machine translation is to meet commercial needs , it must offer a sensible approach to translating terms .', 'Currently , MT systems offer at best database management tools which allow a human ( typically a translator , a terminologist or even the vendor of the system ) to specify bilingual terminological entries .', 'More advanced tools are mean

In [15]:
from datasets import load_dataset

# Function: expand one document into list of sentence dicts
def expand_doc(example):
    return {
        "sentences": [
            {
                "sentence": sent,
                "label_ch": int(ch),
                "label_ap": int(ap),
                "label_oc": int(oc),
            }
            for sent, ch, ap, oc in zip(
                example["source_sentences"],
                example["challenge_labels"],
                example["approach_labels"],
                example["outcome_labels"]
            )
        ]
    }

expanded = dataset.map(expand_doc)

print(expanded[1]["sentences"][:3])
print(f'check length: {len(expanded[1]["sentences"])} {len(dataset[1]["source_sentences"])}')


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

[{'label_ap': 0, 'label_ch': 1, 'label_oc': 0, 'sentence': 'Reasoning about implied relationships ( e.g. paraphrastic , common sense , encyclopedic ) between pairs of words is crucial for many cross-sentence inference problems .'}, {'label_ap': 1, 'label_ch': 0, 'label_oc': 0, 'sentence': 'This paper proposes new methods for learning and using embeddings of word pairs that implicitly represent background knowledge about such relationships .'}, {'label_ap': 1, 'label_ch': 0, 'label_oc': 0, 'sentence': 'Our pairwise embeddings are computed as a compositional function on word representations , which is learned by maximizing the pointwise mutual information ( PMI ) with the contexts in which the two words cooccur .'}]
check length: 32 32


## prompting techniques

the vanilla prompt has only the task to select the most important sentences regardless of the aspect

In [16]:
def simple_vanilla_prompt(sentences):
    """
    Generates a simple, general-purpose vanilla prompt for extractive summarization.
    """
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

Rules:
- Select ONLY the most important sentences.
- If no sentences are important, return an empty list.
- Indices are 1-based.
- Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

this second vanilla prompt instead selects the most important phrases based on the aspect, so which are the most important phrases connected to challenge, approach and outcome

In [17]:
# vanilla prompt
# needs to be called three times on the same phrase to understand the three aspects
def vanilla_prompt(sentences, target_label):
    # target_label ∈ {"challenge","approach","outcome"}
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)
    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

Rules:
- Select ONLY sentences that primarily express the "{target_label}" aspect.
- If none match, return an empty list.
- Indices are 1-based.
- Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''


**Least to most** prompting technique implemented both in its aspect-based version and in the simple version. Ask the model to identify the overall purpose of the document before returning either the aspect-based sentences or the overall most important sentences.

In [18]:
def least_to_most_prompt(sentences, target_label):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

First, consider the overall purpose of the document and how each sentence contributes to it.
Then, from that understanding, select ONLY sentences that primarily express the "{target_label}" aspect.
If none match, return an empty list.
Indices are 1-based.
Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

In [19]:
def least_to_most_simple_prompt(sentences):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

First, identify the main topics and key arguments of the document.
Then, select ONLY the sentences that directly relate to those topics.
If no sentences are important, return an empty list.
Indices are 1-based.
Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

**Tool-augmented prompting**. Here the model is instructed to use a "tool" or internal function to aid its reasoning. The tool is not a real external program but a conceptual instruction within the prompt itself.



In [20]:
def tool_augmented_prompt(sentences, target_label):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

Instructions:
1. For each sentence, use the internal `check_aspect(sentence, aspect)` tool.
2. The tool's output is 'match' if the sentence primarily describes the "{target_label}" aspect, otherwise it is 'no_match'.
3. List the sentences that result in a 'match'.
4. If no sentences match, return an empty list.
5. Indices are 1-based.
6. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

In [21]:
def tool_augmented_simple_prompt(sentences):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

Instructions:
1. For each sentence, use the internal `check_importance(sentence)` tool.
2. The tool's output is 'important' if the sentence is central to the main idea, otherwise it is 'not_important'.
3. List the sentences that result in an 'important' output.
4. If no sentences are important, return an empty list.
5. Indices are 1-based.
6. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

**scoring-based prompting** instead gives a score to each sentence based on the relevance to the task. Also here we have the aspect-based version and the simple, importance-based one.

In [22]:
def scoring_based_prompt(sentences, target_label):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

Instructions:
1. For each sentence, assign a score from 1 (low relevance) to 5 (high relevance) for how well it expresses the "{target_label}" aspect.
2. Only select sentences with a score of 4 or 5.
3. If no sentences meet the threshold, return an empty list.
4. Indices are 1-based.
5. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

In [23]:
def scoring_based_simple_prompt(sentences):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

Instructions:
1. For each sentence, assign a score from 1 (low importance) to 5 (high importance) for how central it is to the document's main idea.
2. Only select sentences with a score of 4 or 5.
3. If no sentences meet the threshold, return an empty list.
4. Indices are 1-based.
5. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

**self-ask prompting** involves the model reasoning through a series of yes-no questions and using the answers to reach a conclustion.

In [24]:
def self_ask_prompt(sentences, target_label):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

Instructions:
1. Reason step-by-step. For each sentence, ask the question: "Does this sentence primarily express the "{target_label}" aspect?"
2. Answer the question with "Yes" or "No".
3. Compile a list of all sentences for which the answer was "Yes".
4. If no sentences meet the criteria, return an empty list.
5. Indices are 1-based.
6. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

In [25]:
def self_ask_simple_prompt(sentences):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

Instructions:
1. Reason step-by-step. First, ask the question: "What is the main idea of this document?"
2. Then, for each sentence, ask: "Does this sentence support the main idea?"
3. Compile a list of all sentences for which the answer was "Yes".
4. If no sentences are important, return an empty list.
5. Indices are 1-based.
6. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

- Transform gold standard to match output for each label:

  so for example if the challenge_labels looks like this:

  `[1,0,0,1,1,0]`

  it will become

  `[1, 4, 5]`



In [26]:
# Aspects to analyze
ASPECTS = ["challenge", "approach", "outcome"]

def labels_to_indices(labels):
    """0/1 list -> 1-based indices of 1s"""
    return [i+1 for i, v in enumerate(labels) if int(v) == 1]

def gold_for_doc(example):
    """
    Build gold indices per aspect for ONE document.
    Returns: {"challenge":[...], "approach":[...], "outcome":[...]}
    """
    return {
        "challenge": labels_to_indices(example["challenge_labels"]),
        "approach":  labels_to_indices(example["approach_labels"]),
        "outcome":   labels_to_indices(example["outcome_labels"]),
    }

def gold_for_dataset(ds):
    """
    Build gold indices per aspect for ALL docs.
    Returns a list aligned with ds, where item i is gold_for_doc(ds[i])
    """
    return [gold_for_doc(ex) for ex in ds]

gold_all = gold_for_dataset(dataset)
print(gold_all[0])


{'challenge': [1, 2, 7, 11], 'approach': [3, 15, 19, 26, 27], 'outcome': [4, 5, 20, 21, 30, 31]}


-

In [27]:
import json, re
from transformers import AutoTokenizer, pipeline

In [28]:
import json, re

def safe_extract_json_strict(text: str):
    text = text.strip()

    try:
        js = json.loads(text)
        if isinstance(js, dict) and "selected_sentences" in js:
            return js
        if isinstance(js, list):
            return {"selected_sentences": js}
    except Exception:
        pass

    text_clean = re.sub(r"^```[\w-]*\s*\n", "", text, flags=re.S)
    text_clean = re.sub(r"\n```$", "", text_clean, flags=re.S).strip()

    objs = re.findall(r"\{[\s\S]*?\}", text_clean)
    for s in reversed(objs):
        try:
            js = json.loads(s)
            if isinstance(js, dict) and "selected_sentences" in js:
                return js
        except Exception:
            continue

    m = re.search(r"(?m)^\s*\[(?:\s*\d+\s*(?:,\s*\d+\s*)*)?\]\s*$", text_clean)
    if m:
        try:
            arr = json.loads(m.group(0))
            return {"selected_sentences": arr}
        except Exception:
            pass

    return {}


- The qwen model is pretrained on chat-like inputs, and therefore the input we have needs to be translated as a chat as well, and needs a system role

- llama does not use this in the same way as Qwen and the system instruction is typically baked into the user prompt or handled differently

In [29]:
def predict_indices_for_aspect_qwen(model, tokenizer, prompt_technique, sentences, target_label, max_new_tokens=256, show_raw=False):
    # Build prompt and format as chat text
    user_prompt = prompt_technique(sentences, target_label)
    chat_text = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are an expert in extractive summarization."},
            {"role": "user", "content": user_prompt}
        ],
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    print(f'lenght= {len(tokenizer.encode(chat_text))}')

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Use the model's generate method directly for robust generation
    inputs = tokenizer(chat_text, return_tensors='pt', padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # This is the only generation flag you need
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode the generated tokens to a string
    out = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if show_raw:
        print(f"\n[RAW OUTPUT for {target_label}]\n{out}\n")

    # Use the robust JSON parser on the decoded string
    js = safe_extract_json_strict(out)
    idxs = js.get("selected_sentences", [])

    # Sanitize indices
    n = len(sentences)
    cleaned = []
    for v in idxs:
        if isinstance(v, (int, float)):
            v = int(v)
            if 1 <= v <= n:
                cleaned.append(v)
    return sorted(set(cleaned))



In [30]:
  # def predict_indices_for_aspect_llama(prompt_technique, sentences, target_label, max_new_tokens=256, show_raw=False):
  #   # Get the user prompt
  #   user_prompt = prompt_technique(sentences, target_label)

  #   chat_text = tokenizer.apply_chat_template(
  #       [{"role": "user", "content": user_prompt}],
  #       chat_template="{% for message in messages %}{% if message['role'] == 'user' %}[INST] {{ message['content'] }} [/INST]{% endif %}{% endfor %}",
  #       tokenize=False,
  #       add_generation_prompt=True,
  #   )

  #   if tokenizer.pad_token_id is None:
  #       tokenizer.pad_token = tokenizer.eos_token

  #   # Use the model's generate method directly
  #   inputs = tokenizer(chat_text, return_tensors='pt', padding=True)
  #   inputs = {k: v.to(model.device) for k, v in inputs.items()}

  #   outputs = model.generate(
  #       **inputs,
  #       max_new_tokens=max_new_tokens,
  #       do_sample=False,
  #       eos_token_id=tokenizer.eos_token_id,
  #       pad_token_id=tokenizer.pad_token_id
  #   )

  #   # Decode the generated tokens
  #   out = tokenizer.decode(outputs[0], skip_special_tokens=True)

  #   if show_raw:
  #       print(f"\n[RAW OUTPUT for {target_label}]\n{out}\n")

  #   # Use the robust JSON parser
  #   js = safe_extract_json_strict(out)
  #   idxs = js.get("selected_sentences", [])

  #   # Sanitize indices
  #   n = len(sentences)
  #   cleaned = []
  #   for v in idxs:
  #       if isinstance(v, (int, float)):
  #           v = int(v)
  #           if 1 <= v <= n:
  #               cleaned.append(v)
  #   return sorted(set(cleaned))

# The Llama 3 chat template with headers
# This is a robust template that works for both single-turn and multi-turn conversations
def predict_indices_for_aspect_llama(model, tokenizer, prompt_technique, sentences, target_label, max_new_tokens=256, show_raw=False):
    # Get the user prompt
    user_prompt = prompt_technique(sentences, target_label)

    chat_text = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(chat_text, return_tensors='pt', padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode the generated tokens
    out = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if show_raw:
        print(f"\n[RAW OUTPUT for {target_label}]\n{out}\n")

    # Use the robust JSON parser
    js = safe_extract_json_strict(out)
    idxs = js.get("selected_sentences", [])

    # Sanitize indices
    n = len(sentences)
    cleaned = []
    for v in idxs:
        if isinstance(v, (int, float)):
            v = int(v)
            if 1 <= v <= n:
                cleaned.append(v)
    return sorted(set(cleaned))

In [31]:
def calculate_metrics(gold_for_doc, predicted_indices):

    # Convert lists to sets for efficient intersection and difference operations
    gold_set = set(gold_for_doc)
    predicted_set = set(predicted_indices)

    # Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
    true_positives = len(gold_set.intersection(predicted_set))
    false_positives = len(predicted_set.difference(gold_set))
    false_negatives = len(gold_set.difference(predicted_set))

    # Calculate Precision
    if true_positives + false_positives == 0:
        precision = 0.0
    else:
        precision = true_positives / (true_positives + false_positives)

    # Calculate Recall
    if true_positives + false_negatives == 0:
        recall = 0.0
    else:
        recall = true_positives / (true_positives + false_negatives)

    # Calculate F1-score
    if precision + recall == 0:
        f1_score = 0.0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }

Evaluation loop:

The following function evaluates precision, recall and f1 for each aspect both for a single document and in an aggregate way

In [32]:
def run_evaluation(dataset, model_name, model, tokenizer, prompt_technique, gold_all, aspects, start=0, stop=None):
    if stop is None:
        stop = len(dataset)
    stop = min(stop, len(dataset))

    # Dictionaries to store metrics for macro-averaging
    all_metrics = {aspect: {"precision": [], "recall": [], "f1_score": []} for aspect in aspects}

    for doc_idx in range(start, stop):
        ex = dataset[doc_idx]
        sentences = ex["source_sentences"]
        gold_standard = gold_all[doc_idx] # given that gold_all = gold_for_dataset(dataset)

        print("="*80)
        print(f"Doc {doc_idx} (id={ex.get('id', 'NA')}) | #sentences={len(sentences)}")
        print("-"*80)

        for aspect in aspects:
            # get predicted indices for the current aspect
            if "Qwen" in model_name:
              print("Using Qwen")
              predicted_indices = predict_indices_for_aspect_qwen(
                  model = model,
                  tokenizer = tokenizer,
                  prompt_technique=prompt_technique,
                  sentences=sentences,
                  target_label=aspect,
              )
            elif "Llama" in model_name:
              print("Using Llama")
              predicted_indices = predict_indices_for_aspect_llama(
                  model = model,
                  tokenizer = tokenizer,
                  prompt_technique=prompt_technique,
                  sentences=sentences,
                  target_label=aspect,
              )

            # get gold standard for each aspect
            gold_indices = gold_standard.get(aspect, [])

            # Calculate metrics
            metrics = calculate_metrics(gold_indices, predicted_indices)

            # Print per-document results
            print(f"  -> {aspect.capitalize():9s}")
            print(f"     Predicted: {sorted(predicted_indices)}")
            print(f"     Gold:      {sorted(gold_indices)}")
            print(f"     Metrics: P={metrics['precision']:.2f}, R={metrics['recall']:.2f}, F1={metrics['f1_score']:.2f}")
            print()

            # Store metrics for aggregation
            all_metrics[aspect]["precision"].append(metrics['precision'])
            all_metrics[aspect]["recall"].append(metrics['recall'])
            all_metrics[aspect]["f1_score"].append(metrics['f1_score'])

    # Clculate and print the final aggregate scores
    print("\n" + "="*80)
    print("--- Final Aggregate Metrics ---")
    print("="*80)

    for aspect in aspects:
        avg_p = sum(all_metrics[aspect]["precision"]) / len(all_metrics[aspect]["precision"]) if all_metrics[aspect]["precision"] else 0
        avg_r = sum(all_metrics[aspect]["recall"]) / len(all_metrics[aspect]["recall"]) if all_metrics[aspect]["recall"] else 0
        avg_f1 = sum(all_metrics[aspect]["f1_score"]) / len(all_metrics[aspect]["f1_score"]) if all_metrics[aspect]["f1_score"] else 0

        print(f"  -> {aspect.capitalize()} Average:")
        print(f"     Precision: {avg_p:.2f}")
        print(f"     Recall:    {avg_r:.2f}")
        print(f"     F1-score:  {avg_f1:.2f}")
        print()

    print("="*80)

## Llama Results

In [33]:
model_name = "meta-llama/Llama-3.2-3B-Instruct" # Or "meta-llama/Llama-3.2-3B-Instruct" || Qwen/Qwen3-4B

model, tokenizer = load_model_and_tokenizer(model_name)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Loaded Llama model and tokenizer.


### Predictions for the first three documents for the **vanilla prompt**

In [34]:
run_evaluation(dataset, model_name, model, tokenizer, vanilla_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 6, 7, 11, 14, 17, 19, 22, 23, 27, 29, 31, 32, 33, 36, 37, 38, 39, 40, 41, 42, 43, 44]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.13, R=0.75, F1=0.22

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: []
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.00, R=0.00, F1=0.00

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: []
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.00, R=0.00, F1=0.00

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [9, 19, 28]
     Gold:      [1, 7, 9]
     Metrics: P=0.33, R=0.33, F1=0.33

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.29, R=1.00, F1=0.44

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 21, 22, 26, 27]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=0.83, R=0.71, F1=0.77

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 6, 15, 16]
     Gold:      [1, 5, 6]
     Metrics: P=0.50, R=0.67, F1=0.57

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 11, 13, 14, 17]
     Gold:      [2, 7, 14]
     Metrics: P=0.50, R=1.00, F1=0.67

Using Llama
  -> Outcome  
     Predicted: [3, 4, 9, 10, 11, 12, 13, 14, 15, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.40, R=1.00, F1=0.57


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.32
     Recall:    0.58
     F1-score:  0.38

  -> Approach Average:
     Precision: 0.26
     Recall:    0.67
     F1-score:  0.37

  -> Outcome Average:
     Precision: 0.41
     Recall:    0.57
     F1-score:  0.45



### Predictions for the first three documents for the **least to most prompt**

In [35]:
# least_to_most_prompt
run_evaluation(dataset, model_name, model, tokenizer, least_to_most_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 6, 7, 11, 14, 17, 19, 22, 23, 27, 29, 37, 39, 41, 42, 43]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.19, R=0.75, F1=0.30

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [3, 9, 12, 16, 17, 18, 19, 22, 23, 24, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.11, R=0.60, F1=0.18

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [4, 5, 6, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.16, R=1.00, F1=0.28

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 9, 11, 19, 23, 27, 28]
     Gold:      [1, 7, 9]
     Metrics: P=0.29, R=0.67, F1=0.40

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.26, R=1.00, F1=0.41

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 21, 22, 26, 27]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=0.83, R=0.71, F1=0.77

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: []
     Gold:      [1, 5, 6]
     Metrics: P=0.00, R=0.00, F1=0.00

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: []
     Gold:      [2, 7, 14]
     Metrics: P=0.00, R=0.00, F1=0.00

Using Llama
  -> Outcome  
     Predicted: []
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.00, R=0.00, F1=0.00


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.16
     Recall:    0.47
     F1-score:  0.23

  -> Approach Average:
     Precision: 0.12
     Recall:    0.53
     F1-score:  0.20

  -> Outcome Average:
     Precision: 0.33
     Recall:    0.57
     F1-score:  0.35



### Predictions for the first three documents for the **tool augmented prompt**

In [36]:
# tool_augmented_prompt
run_evaluation(dataset, model_name, model, tokenizer, tool_augmented_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: []
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.00, R=0.00, F1=0.00

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.11, R=1.00, F1=0.20

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: []
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.00, R=0.00, F1=0.00

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 9, 12, 19, 23, 27, 28, 29, 31, 32]
     Gold:      [1, 7, 9]
     Metrics: P=0.20, R=0.67, F1=0.31

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: []
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.00, R=0.00, F1=0.00

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 21, 22, 26]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=1.00, R=0.71, F1=0.83

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: []
     Gold:      [1, 5, 6]
     Metrics: P=0.00, R=0.00, F1=0.00

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 9, 11, 12, 13, 14, 15, 17]
     Gold:      [2, 7, 14]
     Metrics: P=0.33, R=1.00, F1=0.50

Using Llama
  -> Outcome  
     Predicted: [3, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.36, R=1.00, F1=0.53


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.07
     Recall:    0.22
     F1-score:  0.10

  -> Approach Average:
     Precision: 0.15
     Recall:    0.67
     F1-score:  0.23

  -> Outcome Average:
     Precision: 0.45
     Recall:    0.57
     F1-score:  0.46



### Predictions for the first three documents for the **scoring based prompt**

In [37]:
# scoring_based_prompt
run_evaluation(dataset, model_name, model, tokenizer, scoring_based_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [2, 7, 11, 14, 17, 19, 22, 23, 27, 37, 39, 41, 42, 43]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.21, R=0.75, F1=0.33

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [3, 9, 12, 16, 17, 19, 21, 22, 23, 24, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.11, R=0.60, F1=0.18

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [3, 4, 5, 6, 9, 10, 12, 14, 16, 17, 18, 20, 21, 22, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44]
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.18, R=1.00, F1=0.31

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 9, 11, 19, 21, 22, 23, 27, 28]
     Gold:      [1, 7, 9]
     Metrics: P=0.22, R=0.67, F1=0.33

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.26, R=1.00, F1=0.41

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 11, 15, 16, 17, 20, 21, 22, 24, 26, 27, 31]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=0.46, R=0.86, F1=0.60

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 6, 7, 8, 12, 13, 14, 16]
     Gold:      [1, 5, 6]
     Metrics: P=0.25, R=0.67, F1=0.36

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 11, 13, 14]
     Gold:      [2, 7, 14]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Llama
  -> Outcome  
     Predicted: [9, 10, 12, 13, 14, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.33, R=0.50, F1=0.40


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.23
     Recall:    0.69
     F1-score:  0.34

  -> Approach Average:
     Precision: 0.32
     Recall:    0.87
     F1-score:  0.45

  -> Outcome Average:
     Precision: 0.33
     Recall:    0.79
     F1-score:  0.44



### Predictions for the first three documents for the **self ask prompt**

In [38]:
# self_ask_prompt
run_evaluation(dataset, model_name, model, tokenizer, self_ask_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 6, 7, 11, 14, 17, 19, 22, 23, 27, 29, 31, 32, 37, 39, 41, 42, 43, 44]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.16, R=0.75, F1=0.26

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.12, R=1.00, F1=0.21

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [4, 5, 6, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.16, R=1.00, F1=0.28

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 9, 12, 19, 23, 28]
     Gold:      [1, 7, 9]
     Metrics: P=0.33, R=0.67, F1=0.44

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.26, R=1.00, F1=0.41

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 21, 22, 26, 27]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=0.83, R=0.71, F1=0.77

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 6, 16]
     Gold:      [1, 5, 6]
     Metrics: P=0.67, R=0.67, F1=0.67

Using Llama


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 11, 13, 14, 17]
     Gold:      [2, 7, 14]
     Metrics: P=0.50, R=1.00, F1=0.67

Using Llama
  -> Outcome  
     Predicted: [3, 4, 9, 10, 11, 12, 13, 14, 15, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.40, R=1.00, F1=0.57


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.39
     Recall:    0.69
     F1-score:  0.46

  -> Approach Average:
     Precision: 0.29
     Recall:    1.00
     F1-score:  0.43

  -> Outcome Average:
     Precision: 0.47
     Recall:    0.90
     F1-score:  0.54



## Qwen results

In [39]:
model_name = "Qwen/Qwen3-4B" # Or "meta-llama/Llama-3.2-3B-Instruct" || Qwen/Qwen3-4B

model, tokenizer = load_model_and_tokenizer(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



Loaded Qwen model and tokenizer.


### results for **vanilla prompt**

In [40]:
run_evaluation(dataset, model_name, model, tokenizer, vanilla_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Qwen
lenght= 1551


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [2, 6, 11, 13, 14, 37]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.33, R=0.50, F1=0.40

Using Qwen
lenght= 1553


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [3, 9, 19, 26, 27, 28, 29, 30, 32, 35, 36, 37, 40, 41, 42, 43]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.25, R=0.80, F1=0.38

Using Qwen
lenght= 1551


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [3, 5, 19, 20, 21, 30, 31, 32, 33, 34, 35, 39]
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.42, R=0.83, F1=0.56

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Qwen
lenght= 1594


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 7, 9, 12, 13]
     Gold:      [1, 7, 9]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 1596


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 10, 11, 14, 15, 17, 18, 24, 25, 26]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.67, R=1.00, F1=0.80

Using Qwen
lenght= 1594


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 20, 21, 22, 26]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=1.00, R=0.86, F1=0.92

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Qwen
lenght= 982


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 5, 6, 16]
     Gold:      [1, 5, 6]
     Metrics: P=0.75, R=1.00, F1=0.86

Using Qwen
lenght= 984


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 8, 12, 14]
     Gold:      [2, 7, 14]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 982
  -> Outcome  
     Predicted: [9, 12, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.33, R=0.25, F1=0.29


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.56
     Recall:    0.83
     F1-score:  0.67

  -> Approach Average:
     Precision: 0.51
     Recall:    0.93
     F1-score:  0.64

  -> Outcome Average:
     Precision: 0.58
     Recall:    0.65
     F1-score:  0.59



### results for **least to most prompt**

In [41]:
run_evaluation(dataset, model_name, model, tokenizer, least_to_most_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Qwen
lenght= 1568


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [2, 6, 11, 13, 14, 37]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.33, R=0.50, F1=0.40

Using Qwen
lenght= 1570


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [3, 9, 19, 26, 27, 28, 29, 30, 32, 35]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.40, R=0.80, F1=0.53

Using Qwen
lenght= 1568


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [3, 5, 19, 20, 21, 30, 31, 32, 33, 34, 35, 39]
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.42, R=0.83, F1=0.56

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Qwen
lenght= 1611


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 7, 9, 12, 13]
     Gold:      [1, 7, 9]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 1613


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 10, 11, 14, 15, 17, 18, 24, 25, 26]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.67, R=1.00, F1=0.80

Using Qwen
lenght= 1611


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 20, 21, 22, 26]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=1.00, R=0.86, F1=0.92

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Qwen
lenght= 999


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 5, 6, 16]
     Gold:      [1, 5, 6]
     Metrics: P=0.75, R=1.00, F1=0.86

Using Qwen
lenght= 1001


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 8, 12, 14]
     Gold:      [2, 7, 14]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 999
  -> Outcome  
     Predicted: [9, 12, 13, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.50, R=0.50, F1=0.50


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.56
     Recall:    0.83
     F1-score:  0.67

  -> Approach Average:
     Precision: 0.56
     Recall:    0.93
     F1-score:  0.69

  -> Outcome Average:
     Precision: 0.64
     Recall:    0.73
     F1-score:  0.66



### results for **tool augmented prompt**

In [42]:
run_evaluation(dataset, model_name, model, tokenizer, tool_augmented_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Qwen
lenght= 1600


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [2, 6, 11, 13, 14, 37]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.33, R=0.50, F1=0.40

Using Qwen
lenght= 1602


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [3, 9, 19, 26, 27, 28, 29, 30, 32, 35, 36, 37, 40, 41, 42, 43]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.25, R=0.80, F1=0.38

Using Qwen
lenght= 1600


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [3, 5, 30, 31, 32, 33, 34, 35, 39]
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.33, R=0.50, F1=0.40

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Qwen
lenght= 1643


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 7, 9, 12, 13]
     Gold:      [1, 7, 9]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 1645


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 10, 11, 14, 15, 17, 18, 24, 25, 26]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.67, R=1.00, F1=0.80

Using Qwen
lenght= 1643


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 20, 21, 22, 26]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=1.00, R=0.86, F1=0.92

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Qwen
lenght= 1031


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 5, 6]
     Gold:      [1, 5, 6]
     Metrics: P=1.00, R=1.00, F1=1.00

Using Qwen
lenght= 1033


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 8, 12, 14]
     Gold:      [2, 7, 14]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 1031
  -> Outcome  
     Predicted: [9, 12, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.33, R=0.25, F1=0.29


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.64
     Recall:    0.83
     F1-score:  0.72

  -> Approach Average:
     Precision: 0.51
     Recall:    0.93
     F1-score:  0.64

  -> Outcome Average:
     Precision: 0.56
     Recall:    0.54
     F1-score:  0.54



### results for **scoring based prompt**

In [43]:
run_evaluation(dataset, model_name, model, tokenizer, scoring_based_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Qwen
lenght= 1593


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [2, 6, 11, 13, 14, 37]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.33, R=0.50, F1=0.40

Using Qwen
lenght= 1595


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [3, 9, 19, 26, 27, 28, 29, 30, 32, 35, 36, 37, 40, 41, 42, 43]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.25, R=0.80, F1=0.38

Using Qwen
lenght= 1593


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [3, 5, 19, 20, 21, 30, 31, 32, 33, 34, 35, 39]
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.42, R=0.83, F1=0.56

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Qwen
lenght= 1636


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 7, 9, 13, 26]
     Gold:      [1, 7, 9]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 1638


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 10, 11, 14, 15, 17, 18, 24, 25, 26]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.67, R=1.00, F1=0.80

Using Qwen
lenght= 1636


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 20, 21, 22, 26]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=1.00, R=0.86, F1=0.92

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Qwen
lenght= 1024


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 5, 6, 7]
     Gold:      [1, 5, 6]
     Metrics: P=0.75, R=1.00, F1=0.86

Using Qwen
lenght= 1026


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 8, 12, 14]
     Gold:      [2, 7, 14]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 1024
  -> Outcome  
     Predicted: [9, 12, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.33, R=0.25, F1=0.29


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.56
     Recall:    0.83
     F1-score:  0.67

  -> Approach Average:
     Precision: 0.51
     Recall:    0.93
     F1-score:  0.64

  -> Outcome Average:
     Precision: 0.58
     Recall:    0.65
     F1-score:  0.59



### results for **self ask prompt**

In [44]:
run_evaluation(dataset, model_name, model, tokenizer, self_ask_prompt, gold_all, ASPECTS, stop=3)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Doc 0 (id=E09-1056) | #sentences=44
--------------------------------------------------------------------------------
Using Qwen
lenght= 1600


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [2, 6, 11, 13, 14, 37]
     Gold:      [1, 2, 7, 11]
     Metrics: P=0.33, R=0.50, F1=0.40

Using Qwen
lenght= 1602


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [3, 9, 19, 26, 27, 28, 29, 30, 32, 35, 36, 37, 40, 41, 42, 43]
     Gold:      [3, 15, 19, 26, 27]
     Metrics: P=0.25, R=0.80, F1=0.38

Using Qwen
lenght= 1600


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [3, 5, 30, 31, 32, 33, 34, 35, 39]
     Gold:      [4, 5, 20, 21, 30, 31]
     Metrics: P=0.33, R=0.50, F1=0.40

Doc 1 (id=N19-1362) | #sentences=32
--------------------------------------------------------------------------------
Using Qwen
lenght= 1643


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 7, 9, 12, 13]
     Gold:      [1, 7, 9]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 1645


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 3, 4, 10, 11, 14, 15, 17, 18, 24, 25, 26]
     Gold:      [2, 3, 10, 11, 14, 15, 24, 25]
     Metrics: P=0.67, R=1.00, F1=0.80

Using Qwen
lenght= 1643


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Outcome  
     Predicted: [5, 6, 20, 21, 22, 26]
     Gold:      [5, 6, 13, 20, 21, 22, 26]
     Metrics: P=1.00, R=0.86, F1=0.92

Doc 2 (id=P01-1040) | #sentences=17
--------------------------------------------------------------------------------
Using Qwen
lenght= 1031


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Challenge
     Predicted: [1, 5, 6]
     Gold:      [1, 5, 6]
     Metrics: P=1.00, R=1.00, F1=1.00

Using Qwen
lenght= 1033


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  -> Approach 
     Predicted: [2, 7, 8, 12, 14]
     Gold:      [2, 7, 14]
     Metrics: P=0.60, R=1.00, F1=0.75

Using Qwen
lenght= 1031
  -> Outcome  
     Predicted: [9, 12, 17]
     Gold:      [3, 4, 12, 13]
     Metrics: P=0.33, R=0.25, F1=0.29


--- Final Aggregate Metrics ---
  -> Challenge Average:
     Precision: 0.64
     Recall:    0.83
     F1-score:  0.72

  -> Approach Average:
     Precision: 0.51
     Recall:    0.93
     F1-score:  0.64

  -> Outcome Average:
     Precision: 0.56
     Recall:    0.54
     F1-score:  0.54

