In [1]:
# pip install datasets
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "test" split of the two ACLSum configurations (abstractive first,
# then extractive) with `datasets.load_dataset`.
dataset_abstractive, dataset_extractive = (
    load_dataset("sobamchan/aclsum", config_name, split="test")
    for config_name in ("abstractive", "extractive")
)

In [3]:
# Show the first record of each configuration, one per line.
print(dataset_abstractive[0], dataset_extractive[0], sep="\n")

{'id': 'E09-1056', 'document': 'Handling terminology is an important matter in a translation workflow . However , current Machine Translation ( MT ) systems do not yet propose anything proactive upon tools which assist in managing terminological databases . In this work , we investigate several enhancements to analogical learning and test our implementation on translating medical terms . We show that the analogical engine works equally well when translating from and into a morphologically rich language , or when dealing with language pairs written in different scripts . Combining it with a phrasebased statistical engine leads to significant improvements . If machine translation is to meet commercial needs , it must offer a sensible approach to translating terms . Currently , MT systems offer at best database management tools which allow a human ( typically a translator , a terminologist or even the vendor of the system ) to specify bilingual terminological entries . More advanced tools

In [4]:
# Source sentences of the first document (row lookup first, then the column).
print(dataset_extractive[0]["source_sentences"])

['Handling terminology is an important matter in a translation workflow .', 'However , current Machine Translation ( MT ) systems do not yet propose anything proactive upon tools which assist in managing terminological databases .', 'In this work , we investigate several enhancements to analogical learning and test our implementation on translating medical terms .', 'We show that the analogical engine works equally well when translating from and into a morphologically rich language , or when dealing with language pairs written in different scripts .', 'Combining it with a phrasebased statistical engine leads to significant improvements .', 'If machine translation is to meet commercial needs , it must offer a sensible approach to translating terms .', 'Currently , MT systems offer at best database management tools which allow a human ( typically a translator , a terminologist or even the vendor of the system ) to specify bilingual terminological entries .', 'More advanced tools are mean

Let's import ROUGE F-1

In [5]:
#pip install evaluate absl-py nltk rouge-score
from evaluate import load

In [6]:
# Metric object loaded once via `evaluate.load`; `rouge_score` below calls `rouge.compute` on it.
rouge = load("rouge")

In [7]:
def rouge_score(candidate, reference, rougex):
    """Score one candidate text against one reference text with the global `rouge` metric.

    Args:
        candidate: Predicted text.
        reference: Text the candidate is compared with.
        rougex: ROUGE variant to compute; possible values are "rouge1",
            "rouge2" and "rougeL".

    Returns:
        The value stored under `rougex` in the result of `rouge.compute`
        (the notebook treats it as the ROUGE F-1 score).
    """
    # The metric works on batches, so wrap the single pair in one-element lists.
    scores = rouge.compute(
        rouge_types=[rougex],
        predictions=[candidate],
        references=[reference],
    )
    return scores[rougex]

### Let's prepare the first heuristic: **Greedy Search**


In [42]:
def greedy_extractive_summary(sentences, abstractive_summary, max_sent=6):
    """Greedily pick the sentences whose concatenation best matches a reference summary.

    At every step each not-yet-selected sentence is tentatively appended to the
    current selection, the space-joined text is scored with
    `rouge_score(..., "rouge1")` against `abstractive_summary`, and the sentence
    with the highest score is kept. Ties go to the sentence that appears first
    in the document.

    Candidates are tracked by position rather than by text, so a document that
    contains the same sentence twice gets one mask entry per selected
    occurrence (looking the text up with `list.index` would always hit the
    first occurrence).

    Args:
        sentences: List of the document's sentences (strings).
        abstractive_summary: Reference summary text.
        max_sent: Maximum number of sentences to select. A negative value never
            reaches zero, so every sentence ends up selected (unchanged from the
            previous behaviour).

    Returns:
        A tuple `(selected, mask)`: `selected` lists the chosen sentences in
        selection order, and `mask` has one 0/1 entry per input sentence with 1
        marking the chosen positions.
    """
    selected = []
    mask = [0] * len(sentences)
    remaining = list(range(len(sentences)))  # positions still available
    budget = max_sent  # local counter; the argument itself is left untouched

    while remaining and budget != 0:
        best_idx = None
        best_score = -1
        for idx in remaining:
            candidate_summary = " ".join(selected + [sentences[idx]])
            score = rouge_score(candidate_summary, abstractive_summary, "rouge1")
            # Strict comparison: on equal scores the earliest position wins.
            if score > best_score:
                best_score = score
                best_idx = idx

        selected.append(sentences[best_idx])
        mask[best_idx] = 1
        remaining.remove(best_idx)
        budget -= 1

    return selected, mask

In [43]:
# Sanity check on a single document: run the greedy heuristic and compare the
# number of selected sentences with the number of labelled outcome sentences.
# Both lookups use the same document index (0) so the two counts are comparable.
selected0, mask0 = greedy_extractive_summary(dataset_extractive["source_sentences"][0], dataset_abstractive["outcome"][0])

print(len(selected0))
print(len(dataset_extractive["outcome_sentences"][0]))

6
6


### Let's now define the loop function which computes all the extractive summaries given the abstractive ones (**only outcomes so far**)

In [44]:
def apply_heuristic_to_dataset(heuristic_fn, docs_sentences, abstractive_summaries):
    """Run a sentence-selection heuristic on every (document, summary) pair.

    Args:
        heuristic_fn: Callable taking `(sentences, abstractive_summary)` and
            returning a `(selected, mask)` pair.
        docs_sentences: Iterable with one list of sentences per document.
        abstractive_summaries: Iterable with one reference summary per document,
            paired positionally with `docs_sentences` via `zip` (so iteration
            stops at the shorter of the two).

    Returns:
        A tuple `(selected_list, masks_list)` of two parallel lists holding,
        per document, the first and second element returned by `heuristic_fn`.
    """
    outcomes = [
        heuristic_fn(doc, summary)
        for doc, summary in zip(docs_sentences, abstractive_summaries)
    ]
    selected_list = [chosen for chosen, _ in outcomes]
    masks_list = [flags for _, flags in outcomes]
    return selected_list, masks_list


In [45]:
# Greedy selection for every document, scored against its abstractive "outcome" summary.
selected_list, masks_list = apply_heuristic_to_dataset(
    heuristic_fn=greedy_extractive_summary,
    docs_sentences=list(dataset_extractive["source_sentences"]),
    abstractive_summaries=list(dataset_abstractive["outcome"]),
)

### Let's make a function which, for each document, concatenates the sentences we have extracted from the selected list

In [46]:
def sentence_concatenation(sentence_list):
    """Join each inner list of sentences into one space-separated string.

    Args:
        sentence_list: Iterable of lists of strings, one inner list per document.

    Returns:
        A list with one string per inner list, in the same order; an empty
        inner list yields an empty string.
    """
    return list(map(" ".join, sentence_list))

In [47]:
# One string per document: the sentences picked by the greedy heuristic, space-joined.
merged_sentences = sentence_concatenation(selected_list)

In [48]:
# Join each document's "outcome_sentences" from the extractive dataset into one string;
# these strings are the reference labels passed to evaluate_performance below.
merged_labels = sentence_concatenation(list(dataset_extractive["outcome_sentences"]))

In [49]:
def evaluate_performance(predicted_summaries, label_summaries):
    """Average the ROUGE-1 score of predicted summaries against their labels.

    Args:
        predicted_summaries: Sequence of predicted summary strings.
        label_summaries: Sequence of reference summary strings, paired
            positionally with `predicted_summaries`.

    Returns:
        The mean of `rouge_score(pred, label, "rouge1")` over all pairs, or
        `None` when no score can be computed: the two sequences differ in
        length, or both are empty (which previously raised ZeroDivisionError).
    """
    n_pairs = len(predicted_summaries)

    # In case something went wrong upstream: mismatched inputs or nothing to average.
    if n_pairs != len(label_summaries) or n_pairs == 0:
        return None

    r1_sum = sum(
        rouge_score(pred, label, "rouge1")
        for pred, label in zip(predicted_summaries, label_summaries)
    )
    return r1_sum / n_pairs

In [50]:
# Mean ROUGE-1 score of the greedy selections (merged_sentences) against the
# merged reference labels. evaluate_performance returns None on a length
# mismatch, in which case round() below raises TypeError.
r1 = evaluate_performance(merged_sentences, merged_labels)
print(f'The average ROUGE F-1 score for Greedy Search heuristic is: {round(r1, 5)}')

The average ROUGE F-1 score for Greedy Search heuristic is: 0.53512


This means that the predicted summaries reach an average ROUGE-1 F1 of about **53.5%** against the label summaries, i.e. roughly half of the unigrams overlap. This leaves room for improvement, so let's see if other heuristics can work better.

### Let's do the same for **Beam Search**

In [52]:
def beam_extractive_summary(sentences, abstractive_summary, beam_size=3, max_sentences=6):
    """Select sentences with beam search, scoring partial summaries by ROUGE-1.

    Starting from an empty selection, every beam is extended by each sentence it
    does not contain yet; the space-joined candidate is scored with
    `rouge_score(..., "rouge1")` against `abstractive_summary` and only the
    `beam_size` best candidates survive to the next round. Equal scores keep
    their generation order because `list.sort` is stable.

    The search runs for at most `max_sentences` rounds and stops early once no
    beam can be extended (fewer sentences than `max_sentences`). Sentences are
    tracked by position through the mask, so repeated sentences are handled as
    distinct candidates.

    Args:
        sentences: List of the document's sentences (strings).
        abstractive_summary: Reference summary text.
        beam_size: Number of partial selections kept after each round (>= 1).
        max_sentences: Maximum number of sentences in the returned selection;
            zero or a negative value returns an empty selection.

    Returns:
        A tuple `(best_selected, best_mask)` for the highest-scoring beam:
        the chosen sentences in selection order and a 0/1 list with one entry
        per input sentence.

    Raises:
        ValueError: If `beam_size` is smaller than 1.
    """
    if beam_size < 1:
        raise ValueError("beam_size must be at least 1")

    # Each beam is a tuple of three items:
    #   -   selected sentences array
    #   -   mask of selected sentences
    #   -   ROUGE F1 score
    beams = [([], [0] * len(sentences), 0.0)]

    # A bounded loop guarantees termination: one sentence is added per round.
    for _ in range(max_sentences):
        new_beams = []
        for selected, mask, _score in beams:
            for idx, sent in enumerate(sentences):
                if mask[idx]:
                    continue  # this position is already part of the beam

                candidate = selected + [sent]
                r1 = rouge_score(" ".join(candidate), abstractive_summary, "rouge1")

                # Make a new mask for this candidate
                new_mask = mask[:]
                new_mask[idx] = 1

                new_beams.append((candidate, new_mask, r1))

        if not new_beams:
            # Every sentence is already used; keep the beams from the last round.
            break

        # Sort by third value (ROUGE F1 score), highest score first
        new_beams.sort(key=lambda beam: beam[2], reverse=True)
        # Takes only the first beam_size elements
        beams = new_beams[:beam_size]

    best_selected, best_mask, _ = beams[0]  # Best selection
    return best_selected, best_mask


In [None]:
# Beam-search selection for every document; the masks are not needed here.
selected_list, _ = apply_heuristic_to_dataset(
    heuristic_fn=beam_extractive_summary,
    docs_sentences=list(dataset_extractive["source_sentences"]),
    abstractive_summaries=list(dataset_abstractive["outcome"]),
)

In [None]:
# Merge the beam-search selections first: `merged_sentences` still holds the
# greedy summaries, so scoring it here would just repeat the greedy result.
merged_sentences_beam = sentence_concatenation(selected_list)
r1_beam = evaluate_performance(merged_sentences_beam, merged_labels)
print(f'The average ROUGE F-1 score for Beam Search heuristic is: {round(r1_beam, 5)}')

Implementation of extractive summarization methods combining large language models (LLMs) and heuristic sentence selection. Supports greedy and beam search strategies, local and global search, multiple prompting techniques, and evaluation with ROUGE and exact-match metrics.