In [2]:
# Environment setup. Each %pip magic below is a separate install step.
# Step 1: upgrade pip itself before installing anything else.
%pip install --upgrade pip

# Step 2: install TensorFlow and Keras at pinned versions.
%pip install tensorflow==2.18.0 keras==3.9.0

# Step 3: install PyTorch (pinned) and torchdata.
# NOTE(review): torchdata has no version pin, unlike the other libraries installed here — consider pinning it.
%pip install torch==2.6.0 torchdata

# Step 4: install/upgrade the remaining pinned packages (datasets, transformers); TRL is not installed here.
%pip install -U \
    datasets==2.17.0 \
    transformers==4.41.0

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, set_seed

In [4]:
# Hub identifier of the dataset used throughout this notebook.
huggingface_dataset_name = "abisee/cnn_dailymail"

# Load configuration "3.0.0" of that dataset (all available splits).
dataset = load_dataset(path=huggingface_dataset_name, name="3.0.0")

# Bare trailing expression so the notebook displays the DatasetDict summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [5]:
# Row numbers (within the test split) of the articles used as running examples.
example_indices = [40, 200]

# Horizontal rule for the printed reports: a run of 99 hyphens.
dash_line = '-' * 99

# Show each selected test article next to its human-written highlights.
test_split = dataset['test']
for example_number, row_index in enumerate(example_indices, start=1):
    # Fetch the row once and read both fields from it.
    example = test_split[row_index]

    print(dash_line)
    print('Example ', example_number)
    print(dash_line)
    print('INPUT DIALOGUE:')
    print(example['article'])
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(example['highlights'])
    print(dash_line)
    print()

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE:
(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climatologists and meteorologists to begin collecting relevant e

In [6]:
# Checkpoint identifier; reused below when loading the matching tokenizer.
model_name='google/flan-t5-base'

# Load the checkpoint through the sequence-to-sequence auto class.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [7]:
# Load the tokenizer for the same checkpoint as the model; use_fast=True requests the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [8]:
# Tokenizer round-trip check, step 1: encode a short sentence.
test_sentence = "It is not our abilities that show who we truly are, it is our choices"

# return_tensors='pt' requests PyTorch tensors (the displayed result shows tensor(...) values
# under the 'input_ids' and 'attention_mask' keys).
encoded_test_sentence = tokenizer(test_sentence, return_tensors='pt')
# Bare trailing expression so the notebook displays the encoding.
encoded_test_sentence

{'input_ids': tensor([[  94,   19,   59,   69, 8075,   24,  504,  113,   62, 1892,   33,    6,
           34,   19,   69, 3703,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
# Tokenizer round-trip check, step 2: decode the ids back into text.
# The encoding holds a batch of one sequence, so take row 0.
test_token_ids = encoded_test_sentence['input_ids'][0]

# Special tokens are dropped so only the sentence text remains.
decoded_test_sentence = tokenizer.decode(test_token_ids, skip_special_tokens=True)
decoded_test_sentence

'It is not our abilities that show who we truly are, it is our choices'

# Summarization without prompt engineering

In [10]:
# Baseline: give the model the raw article with no instruction text at all.
# NOTE(review): the recorded output of this cell shows a tokenizer warning (585 > 512 tokens);
# inputs are passed through untruncated here.
for example_number, row_index in enumerate(example_indices, start=1):
    example = dataset['test'][row_index]
    article = example['article']
    highlights = example['highlights']

    # Tokenize the bare article into PyTorch tensors.
    encoded_article = tokenizer(article, return_tensors='pt')

    # Generate at most 100 new tokens from the article's input ids.
    encoded_model_highlights = model.generate(
        encoded_article['input_ids'],
        max_new_tokens=100,
    )

    # Decode the first (only) generated sequence, dropping special tokens.
    model_highlights = tokenizer.decode(
        encoded_model_highlights[0],
        skip_special_tokens=True,
    )

    print(dash_line)
    print('Example ', example_number)
    print(dash_line)
    print(f'INPUT PROMPT:\n{article}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{highlights}')
    print(dash_line)
    print(f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{model_highlights}\n')

Token indices sequence length is longer than the specified maximum sequence length for this model (585 > 512). Running this sequence through the model will result in indexing errors


---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climatologists and meteorologists to begin collecting relevant evi

# Zero Shot Prompting

In [11]:
# Zero-shot: wrap each article in an explicit "summarize" instruction.
for example_number, row_index in enumerate(example_indices, start=1):
    example = dataset['test'][row_index]
    article = example['article']
    highlights = example['highlights']

    # Instruction, then the article, then a "Highlights:" cue for the model to continue.
    prompt = f"""
Summarize the key highlights from the following article.

{article}

Highlights:
    """

    # Tokenize the instruction-wrapped prompt rather than the bare article.
    inputs = tokenizer(prompt, return_tensors='pt')

    # Generate at most 100 new tokens, then decode the first (only) sequence.
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=100)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print('Example ', example_number)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{highlights}')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the key highlights from the following article.

(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climato

In [12]:
# Zero-shot, alternative template: present the article and ask an open question about it.
for example_number, row_index in enumerate(example_indices, start=1):
    example = dataset['test'][row_index]
    article = example['article']
    highlights = example['highlights']

    # "Article:" header, the article text, then the question the model should answer.
    prompt = f"""
Article:

{article}

What was going on?
    """

    # Tokenize the templated prompt rather than the bare article.
    inputs = tokenizer(prompt, return_tensors='pt')

    # Generate at most 100 new tokens, then decode the first (only) sequence.
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=100)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print('Example ', example_number)
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{highlights}')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Article:

(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climatologists and meteorologists to begin collecting r

# One Shot Prompting

In [13]:
def make_prompt(example_indices, example_index_to_summarize, examples=None):
    """Build a one-/few-shot summarization prompt.

    Each index in ``example_indices`` contributes a solved demonstration
    (instruction, article, "Highlights:" and the reference highlights). The
    prompt ends with the same instruction and the article at
    ``example_index_to_summarize``, followed by an open "Highlights:" cue for
    the model to complete.

    Args:
        example_indices: Iterable of row indices to use as solved
            demonstrations, in order. May be empty, in which case only the
            final unsolved section is produced.
        example_index_to_summarize: Row index of the article to be summarized.
        examples: Optional indexable collection of rows, where each row is a
            mapping with ``'article'`` and ``'highlights'`` keys. When omitted,
            the notebook-level ``dataset['test']`` split is used, which matches
            the original behavior.

    Returns:
        The assembled prompt as a single string.
    """
    if examples is None:
        # Backward-compatible default: read from the notebook-level dataset.
        examples = dataset['test']

    instruction = "\nSummarize the key highlights from this article:\n\n"

    sections = []
    for index in example_indices:
        # Fetch the row once and read both fields from it.
        row = examples[index]
        sections.append(
            f"{instruction}{row['article']}\n\nHighlights:\n\n{row['highlights']}\n"
        )

    # Final section: same layout, but the highlights are left for the model to write.
    target = examples[example_index_to_summarize]
    sections.append(f"{instruction}{target['article']}\n\nHighlights:\n")

    return ''.join(sections)

In [14]:
# One-shot setup: a single solved demonstration (row 40) precedes the article to summarize (row 200).
example_indices = [40]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(
    example_indices=example_indices,
    example_index_to_summarize=example_index_to_summarize,
)

# Print the full prompt so the layout can be inspected.
print(one_shot_prompt)


Summarize the key highlights from this article:

(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climatologists and meteorologists to begin collecting relevant evidence, said Randy Cerveny, the agency's lead rapporteur of weather and climate extremes and Arizona State University professor of geographical sciences. The committee will exa

In [15]:
# Tokenize the one-shot prompt, generate at most 100 new tokens,
# and decode the first (only) generated sequence without special tokens.
inputs = tokenizer(one_shot_prompt, return_tensors='pt')
generated_ids = model.generate(inputs["input_ids"], max_new_tokens=100)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Human-written reference highlights for the article being summarized.
highlights = dataset['test'][example_index_to_summarize]['highlights']

# Report: prompt, reference summary and model output, separated by rules.
print(
    dash_line,
    'Example prompt',
    dash_line,
    f'INPUT PROMPT:\n{one_shot_prompt}',
    dash_line,
    f'BASELINE HUMAN SUMMARY:\n{highlights}',
    dash_line,
    f'MODEL GENERATION - ONE SHOT:\n{output}\n',
    sep='\n',
)

---------------------------------------------------------------------------------------------------
Example prompt
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the key highlights from this article:

(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climatologis

In [16]:
# Few-shot setup: four solved demonstrations precede the article to summarize (row 200).
# NOTE(review): this prompt concatenates five full articles, while the baseline cell's recorded
# output already shows a tokenizer warning of 585 > 512 tokens — confirm the model handles
# this length as intended.
example_indices = [40, 270, 340, 400]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(
    example_indices=example_indices,
    example_index_to_summarize=example_index_to_summarize,
)

# Print the full prompt so the layout can be inspected.
print(few_shot_prompt)


Summarize the key highlights from this article:

(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climatologists and meteorologists to begin collecting relevant evidence, said Randy Cerveny, the agency's lead rapporteur of weather and climate extremes and Arizona State University professor of geographical sciences. The committee will exa

In [17]:
# Tokenize the few-shot prompt, generate at most 100 new tokens,
# and decode the first (only) generated sequence without special tokens.
inputs = tokenizer(few_shot_prompt, return_tensors='pt')
generated_ids = model.generate(inputs["input_ids"], max_new_tokens=100)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Human-written reference highlights for the article being summarized.
highlights = dataset['test'][example_index_to_summarize]['highlights']

# Report: prompt, reference summary and model output, separated by rules.
print(
    dash_line,
    'Example Prompt',
    dash_line,
    f'INPUT PROMPT:\n{few_shot_prompt}',
    dash_line,
    f'BASELINE HUMAN SUMMARY:\n{highlights}',
    dash_line,
    f'MODEL GENERATION - FEW SHOT:\n{output}\n',
    sep='\n',
)

---------------------------------------------------------------------------------------------------
Example Prompt
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the key highlights from this article:

(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climatologis

In [22]:
# Few-shot generation again, this time with sampling controlled by an explicit GenerationConfig.
generation_config = GenerationConfig(
    temperature=0.1,      # low temperature: sampling stays close to the most likely tokens
    do_sample=True,       # sample from the distribution instead of taking the argmax
    max_new_tokens=100,   # same output budget as the earlier cells
    # Start decoding from the tokenizer's pad token id.
    decoder_start_token_id=tokenizer.pad_token_id
)

# Sampling is stochastic: fix the seed so re-running this cell reproduces the same output.
set_seed(42)

# Tokenize the few-shot prompt and decode the first (only) generated sequence.
inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config
    )[0],
    skip_special_tokens=True
)

# Human-written reference highlights for the article being summarized.
highlights = dataset['test'][example_index_to_summarize]['highlights']

print(dash_line)
print('Example Prompt')
print(dash_line)
print(f'INPUT PROMPT:\n{few_shot_prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{highlights}')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example Prompt
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the key highlights from this article:

(CNN)A high temperature of 63.5 degrees Fahrenheit might sound like a pleasant day in early spring -- unless you're in Antarctica. The chilly continent recorded the temperature (15.5 degrees Celsius) on March 24, possibly the highest ever recorded on Antarctica, according to the Weather Underground. The temperature was recorded at Argentina's Esperanza Base on the northern tip of the Antarctica Peninsula, according to CNN affiliate WTNH. (Note to map lovers: The Argentine base is not geographically part of the South American continent.) The World Meteorological Organization, a specialized United Nations agency, is in the process of setting up an international ad-hoc committee of about 10 blue-ribbon climatologis