In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
!pip install openai transformers datasets rouge_score nltk



In [3]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge_score import rouge_scorer
import re
import json


In [4]:
# Load dataset in exctractive mode
from datasets import load_dataset
dataset = load_dataset("sobamchan/aclsum", "extractive")["test"]
print(dataset[0].keys())

model_name = "Qwen/Qwen3-4B"

# Replace model name for Qwen or LLaMA
summarizer = pipeline("text-generation", model=model_name)

# replace tokenizer as needed as well
tokenizer = AutoTokenizer.from_pretrained(model_name)

gen = pipeline("text-generation", model=model_name, tokenizer=tokenizer, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


dict_keys(['id', 'source_sentences', 'challenge_sentences', 'approach_sentences', 'outcome_sentences', 'challenge_labels', 'approach_labels', 'outcome_labels'])


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [5]:
print(dataset)

Dataset({
    features: ['id', 'source_sentences', 'challenge_sentences', 'approach_sentences', 'outcome_sentences', 'challenge_labels', 'approach_labels', 'outcome_labels'],
    num_rows: 100
})


In [6]:
print(dataset['challenge_labels'][0])

[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### preprocess into sentence label pair

This step is necessary as it will allow us to match the sentence itself with the label. This means that for every possible label (challenge, approach, outcome) we will store the sentences with their proper label in order to be able to evaluate the outcome accordingly.

This means that every phrase in the source_sentences section of the dataset will contain a label as well as the aspect. So, the first sentence might be assigned label 1 for aspect challenge and label 0 for aspects outcome and approach.

In [7]:
def prepare_aspect_data(example, aspect):
    return [
        {"sentence": sent, "label": lab, "aspect": aspect}
        for sent, lab in zip(example["source_sentences"], example[f"{aspect}_labels"])
    ]

sample = prepare_aspect_data(dataset[0], "challenge")
print(sample[:3])

In [8]:
sample_1 = prepare_aspect_data(dataset[0], "outcome")
print(sample_1[:3])

In [9]:
sample_2 = prepare_aspect_data(dataset[0], "approach")
print(sample_2[:3])

To apply this to the whole dataset we can use the following code snippet

In [10]:
def add_triplets(example):
    aspects = ["challenge", "approach", "outcome"]
    # collect labels for each aspect
    aspect_labels = {a: [lab for _, lab in zip(example["source_sentences"], example[f"{a}_labels"])]
                     for a in aspects}
    # combine into triplets
    triplets = []
    for i, sent in enumerate(example["source_sentences"]):
        triplet = [aspect_labels["challenge"][i],
                   aspect_labels["approach"][i],
                   aspect_labels["outcome"][i]]
        triplets.append(triplet)
    example["triplets"] = triplets
    return example

labeled_dataset = dataset.map(add_triplets)

# quick peek
print(labeled_dataset[0]["source_sentences"][:10])
print(labeled_dataset[0]["triplets"][:10])


In [11]:
from datasets import load_dataset

# Function: expand one document into list of sentence dicts
def expand_doc(example):
    return {
        "sentences": [
            {
                "sentence": sent,
                "label_ch": int(ch),
                "label_ap": int(ap),
                "label_oc": int(oc),
            }
            for sent, ch, ap, oc in zip(
                example["source_sentences"],
                example["challenge_labels"],
                example["approach_labels"],
                example["outcome_labels"]
            )
        ]
    }

expanded = dataset.map(expand_doc)

print(expanded[1]["sentences"][:3])
print(f'check length: {len(expanded[1]["sentences"])} {len(dataset[1]["source_sentences"])}')


## prompting techniques

the vanilla prompt has only the task to select the most important sentences regardless of the aspect

In [None]:
def simple_vanilla_prompt(sentences):
    """
    Generates a simple, general-purpose vanilla prompt for extractive summarization.
    """
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

Rules:
- Select ONLY the most important sentences.
- If no sentences are important, return an empty list.
- Indices are 1-based.
- Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

this second vanilla prompt instead selects the most important phrases based on the aspect, so which are the most important phrases connected to challenge, approach and outcome

In [12]:
# vanilla prompt
# needs to be called three times on the same phrase to understand the three aspects
def vanilla_prompt(sentences, target_label):
    # target_label ∈ {"challenge","approach","outcome"}
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)
    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

Rules:
- Select ONLY sentences that primarily express the "{target_label}" aspect.
- If none match, return an empty list.
- Indices are 1-based.
- Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''


Least to most prompting technique implemented both in its aspect-based version and in the simple version. Ask the model to identify the overall purpose of the document before returning either the aspect-based sentences or the overall most important sentences.

In [None]:
def least_to_most_prompt(sentences, target_label):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

First, consider the overall purpose of the document and how each sentence contributes to it.
Then, from that understanding, select ONLY sentences that primarily express the "{target_label}" aspect.
If none match, return an empty list.
Indices are 1-based.
Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

In [None]:
def least_to_most_simple_prompt(sentences):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

First, identify the main topics and key arguments of the document.
Then, select ONLY the sentences that directly relate to those topics.
If no sentences are important, return an empty list.
Indices are 1-based.
Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

Tool-augmented prompting. Here the model is instructed to use a "tool" or internal function to aid its reasoning. The tool is not a real external program but a conceptual instruction within the prompt itself.



In [None]:
def tool_augmented_prompt(sentences, target_label):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

Instructions:
1. For each sentence, use the internal `check_aspect(sentence, aspect)` tool.
2. The tool's output is 'match' if the sentence primarily describes the "{target_label}" aspect, otherwise it is 'no_match'.
3. List the sentences that result in a 'match'.
4. If no sentences match, return an empty list.
5. Indices are 1-based.
6. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

In [None]:
def tool_augmented_simple_prompt(sentences):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

Instructions:
1. For each sentence, use the internal `check_importance(sentence)` tool.
2. The tool's output is 'important' if the sentence is central to the main idea, otherwise it is 'not_important'.
3. List the sentences that result in an 'important' output.
4. If no sentences are important, return an empty list.
5. Indices are 1-based.
6. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

scoring-based prompting instead gives a score to each sentence based on the relevance to the task. Also here we have the aspect-based version and the simple, importance-based one.

In [None]:
def scoring_based_prompt(sentences, target_label):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

Instructions:
1. For each sentence, assign a score from 1 (low relevance) to 5 (high relevance) for how well it expresses the "{target_label}" aspect.
2. Only select sentences with a score of 4 or 5.
3. If no sentences meet the threshold, return an empty list.
4. Indices are 1-based.
5. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

In [None]:
def scoring_based_simple_prompt(sentences):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

Instructions:
1. For each sentence, assign a score from 1 (low importance) to 5 (high importance) for how central it is to the document's main idea.
2. Only select sentences with a score of 4 or 5.
3. If no sentences meet the threshold, return an empty list.
4. Indices are 1-based.
5. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

self-ask prompting involves the model reasoning through a series of yes-no questions and using the answers to reach a conclustion.

In [None]:
def self_ask_prompt(sentences, target_label):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select sentences that express the "{target_label}" aspect of the document.

Aspect definitions:
- challenge: problem, gap, limitation, unmet need, difficulty/motivation.
- approach: method, model, system, algorithm, dataset design, procedure.
- outcome: results, findings, improvements, metrics, performance, impact.

Input:
{input_text}

Instructions:
1. Reason step-by-step. For each sentence, ask the question: "Does this sentence primarily express the "{target_label}" aspect?"
2. Answer the question with "Yes" or "No".
3. Compile a list of all sentences for which the answer was "Yes".
4. If no sentences meet the criteria, return an empty list.
5. Indices are 1-based.
6. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

In [None]:
def self_ask_simple_prompt(sentences):
    numbered = [f"Sentence {i+1}: {s}" for i, s in enumerate(sentences)]
    input_text = "\n".join(numbered)

    return f'''You are an expert in extractive summarization. Your task is to select the most important sentences from the document.

Input:
{input_text}

Instructions:
1. Reason step-by-step. First, ask the question: "What is the main idea of this document?"
2. Then, for each sentence, ask: "Does this sentence support the main idea?"
3. Compile a list of all sentences for which the answer was "Yes".
4. If no sentences are important, return an empty list.
5. Indices are 1-based.
6. Return ONLY valid JSON.

Return format:
{{"selected_sentences": [list_of_sentence_numbers]}}'''

Transform gold standard to match output for each label:

so for example if the challenge_labels looks like this:

[1,0,0,1,1,0]

it will become

[1, 4, 5]

In [13]:
# I want also the gold standard to match the same output
ASPECTS = ["challenge", "approach", "outcome"]

def labels_to_indices(labels):
    """0/1 list -> 1-based indices of 1s"""
    return [i+1 for i, v in enumerate(labels) if int(v) == 1]

def gold_for_doc(example):
    """
    Build gold indices per aspect for ONE document.
    Returns: {"challenge":[...], "approach":[...], "outcome":[...]}
    """
    return {
        "challenge": labels_to_indices(example["challenge_labels"]),
        "approach":  labels_to_indices(example["approach_labels"]),
        "outcome":   labels_to_indices(example["outcome_labels"]),
    }

def gold_for_dataset(ds):
    """
    Build gold indices per aspect for ALL docs.
    Returns a list aligned with ds, where item i is gold_for_doc(ds[i])
    """
    return [gold_for_doc(ex) for ex in ds]

gold_all = gold_for_dataset(dataset)
print(gold_all[0])


{'challenge': [1, 2, 7, 11], 'approach': [3, 15, 19, 26, 27], 'outcome': [4, 5, 20, 21, 30, 31]}


In [14]:
# --- JSON parsing that tolerates extra text ---
def safe_extract_json(text: str):
    text = text.strip()
    # try direct
    try:
        return json.loads(text)
    except Exception:
        pass
    # try to find {...}
    m = re.search(r"\{.*\}", text, flags=re.S)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            return {}
    return {}

In [15]:
import json, re
from transformers import AutoTokenizer, pipeline

In [16]:
import json, re

def safe_extract_json_strict(text: str):
    """
    Strictly extract the model's JSON:
      - Prefer a JSON object that contains "selected_sentences".
      - Else accept a SINGLE standalone bracketed list on its own line.
      - Otherwise return {} (no guesses; avoids capturing numbers from the prompt).
    """
    text = text.strip()

    # 1) Try direct JSON parse (object or list)
    try:
        js = json.loads(text)
        if isinstance(js, dict) and "selected_sentences" in js:
            return js
        if isinstance(js, list):
            return {"selected_sentences": js}
    except Exception:
        pass

    # 2) Strip typical code fences like ```json ... ```
    text_clean = re.sub(r"^```[\w-]*\s*\n", "", text, flags=re.S)
    text_clean = re.sub(r"\n```$", "", text_clean, flags=re.S).strip()

    # 3) Find the LAST JSON object that mentions "selected_sentences"
    objs = re.findall(r"\{[\s\S]*?\}", text_clean)
    for s in reversed(objs):
        try:
            js = json.loads(s)
            if isinstance(js, dict) and "selected_sentences" in js:
                return js
        except Exception:
            continue

    # 4) Accept a STANDALONE list on its own line (avoids grabbing numbers from "Sentence 1:")
    m = re.search(r"(?m)^\s*\[(?:\s*\d+\s*(?:,\s*\d+\s*)*)?\]\s*$", text_clean)
    if m:
        try:
            arr = json.loads(m.group(0))
            return {"selected_sentences": arr}
        except Exception:
            pass

    # 5) Give up safely (do NOT try to collect arbitrary digits)
    return {}


In [17]:
# def predict_indices_for_aspect(sentences, target_label, max_new_tokens=256):
#     prompt = vanilla_prompt(sentences, target_label)

#     out = gen(
#         prompt,
#         max_new_tokens=max_new_tokens,
#         do_sample=False,         # deterministic baseline
#         temperature=0.0,
#         return_full_text=False,
#     )[0]["generated_text"]

#     js = safe_extract_json(out)
#     idxs = js.get("selected_sentences", [])
#     # sanitize: keep ints in range [1..N], unique + sorted
#     n = len(sentences)
#     cleaned = []
#     for v in idxs:
#         if isinstance(v, (int, float)):
#             v = int(v)
#             if 1 <= v <= n:
#                 cleaned.append(v)
#     return sorted(set(cleaned))

# def predict_indices_for_aspect(sentences, target_label, max_new_tokens=256, show_raw=False):
#     prompt = vanilla_prompt(sentences, target_label)

#     # Deterministic baseline: no sampling => no temperature arg
#     out = summarizer(
#         prompt,
#         max_new_tokens=max_new_tokens,
#         do_sample=False,
#         return_full_text=False
#     )[0]["generated_text"]

#     if show_raw:
#         print(f"\n[RAW OUTPUT for {target_label}]\n{out}\n")

#     js = safe_extract_json_strict(out)  # <-- use stronger parser below
#     idxs = js.get("selected_sentences", [])

#     # sanitize: keep ints within [1..N], unique & sorted
#     n = len(sentences)
#     cleaned = []
#     for v in idxs:
#         if isinstance(v, (int, float)):
#             v = int(v)
#             if 1 <= v <= n:
#                 cleaned.append(v)
#     return sorted(set(cleaned))
# Uses your existing: vanilla_prompt, summarizer (HF pipeline), and safe_extract_json_strict
# No temperature (since do_sample=False), and chat formatting for Qwen.

def predict_indices_for_aspect(sentences, target_label, max_new_tokens=256, show_raw=False):
    user_prompt = vanilla_prompt(sentences, target_label)

    # 2) Format as chat for Qwen (critical!)
    try:
        chat_text = tokenizer.apply_chat_template(
            [
                {"role": "system", "content": "You are an expert in extractive summarization."},
                {"role": "user",   "content": user_prompt}
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
    except NameError:
        # Fallback if tokenizer isn't in scope: just use user_prompt (less reliable)
        chat_text = user_prompt

    # 3) Make sure padding is defined (some Qwen checkpoints need this)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    # 4) Generate deterministically
    out = summarizer(
        chat_text,
        max_new_tokens=max_new_tokens,
        do_sample=False,           # deterministic baseline
        return_full_text=False
    )[0]["generated_text"]

    if show_raw:
        print(f"\n[RAW OUTPUT for {target_label}]\n{out}\n")

    # 5) Strict JSON parse (prevents grabbing “Sentence 1..N”)
    js = safe_extract_json_strict(out)
    idxs = js.get("selected_sentences", [])

    # 6) Sanitize: ints, in-range, uniq, sorted
    n = len(sentences)
    cleaned = []
    for v in idxs:
        if isinstance(v, (int, float)):
            v = int(v)
            if 1 <= v <= n:
                cleaned.append(v)
    return sorted(set(cleaned))



In [18]:
def predict_doc(sentences):
    preds = {}
    for a in ASPECTS:
        preds[a] = predict_indices_for_aspect(sentences, a)
    return preds

In [19]:
def binarize_from_indices(n, idxs_1b):
    y = [0]*n
    for i in idxs_1b:
        if 1 <= i <= n:
            y[i-1] = 1
    return y

def prf1(y_true, y_pred):
    import numpy as np
    yt = np.array(y_true, dtype=int)
    yp = np.array(y_pred, dtype=int)
    tp = int(((yt==1)&(yp==1)).sum())
    fp = int(((yt==0)&(yp==1)).sum())
    fn = int(((yt==1)&(yp==0)).sum())
    p = tp/(tp+fp) if (tp+fp) else 0.0
    r = tp/(tp+fn) if (tp+fn) else 0.0
    f = 2*p*r/(p+r) if (p+r) else 0.0
    return {"precision": p, "recall": r, "f1": f}


In [20]:
# -----------------------------
# PREDICTION LOOP (print only)
# -----------------------------
def run_predictions(dataset, start=0, stop=3, print_sentences=False, show_raw=False):
    """
    Runs the 3-pass vanilla prompt for docs in [start:stop) and prints:
      - predicted indices per aspect
      - gold indices per aspect
      - optionally, the selected sentences
    """
    n_docs = len(dataset)
    stop = min(stop, n_docs)

    for doc_idx in range(start, stop):
        ex = dataset[doc_idx]
        sents = ex["source_sentences"]
        gold = gold_for_doc(ex)  # uses your helper

        print("="*80)
        print(f"Doc {doc_idx} (id={ex.get('id', 'NA')})  |  #sentences={len(sents)}")
        print("-"*80)

        for aspect in ASPECTS:
            # --- 3-pass vanilla prompt call (your helper)
            preds = predict_indices_for_aspect(
                sents,
                aspect,
                max_new_tokens=256,
                # Keep the function signature identical to what you already have:
                # do_sample=False, temperature=0.0, return_full_text=False are inside that helper
            )

            print(f"{aspect.capitalize():9s} | Pred: {preds} | Gold: {gold[aspect]}")

            if print_sentences and preds:
                print(f"Selected {aspect} sentences:")
                for i in preds:
                    # 1-based -> 0-based
                    if 1 <= i <= len(sents):
                        print(f"  [{i}] {sents[i-1]}")
                print()

        print()  # spacer


run_predictions(dataset, start=0, stop=1, print_sentences=False, show_raw=False)


Doc 0 (id=E09-1056)  |  #sentences=44
--------------------------------------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


OutOfMemoryError: CUDA out of memory. Tried to allocate 74.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 6.12 MiB is free. Process 302201 has 14.73 GiB memory in use. Of the allocated memory 14.51 GiB is allocated by PyTorch, and 98.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
stop

In [None]:
sents = dataset["source_sentences"][0]
gold = gold_all[0]
pred = predict_doc(sents)

print(gold)
print(pred)

for a in ASPECTS:
    n = len(sents)
    y_true = binarize_from_indices(n, gold[a])
    y_pred = binarize_from_indices(n, pred[a])
    m = prf1(y_true, y_pred)

In [None]:
sents = ex["source_sentences"]
n = len(sents)

# Gold
gold_idx = labels_to_indices(ex["challenge_labels"])  # e.g. [1,2,5]
y_true = binarize_from_indices(n, gold_idx)

# Predicted (from your model prompt)
pred_idx = [1,5]   # <-- for example
y_pred = binarize_from_indices(n, pred_idx)

m = prf1(y_true, y_pred)
print(m)