In [None]:
!pip install transformers

In [9]:
!pip install sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m797.5 kB/s[0m eta [36m0:00:00[0m0:01[0m00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [None]:
!pip install openai

In [1]:
import openai
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers import pipeline
from transformers import BartTokenizer, PegasusTokenizer
from transformers import BartForConditionalGeneration, PegasusForConditionalGeneration


import json
from tqdm.notebook import tqdm

2023-04-03 23:03:30.595587: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-03 23:03:31.547479: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.1.163/linux/tbb/lib/intel64_lin/gcc4.7:/opt/intel/compilers_and_libraries_2018.1.163/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin:/opt/intel/compilers_and_libraries_2018.1.163/linux/tbb/lib/intel64_lin/gcc4.7:/opt/intel/compilers_and_libraries_2018.1.163/linux/compiler/lib/intel64_lin:/opt/intel/compil

## Load sampled datasets

In [2]:
def load_data(path):
    with open(path) as f:
        return json.load(f)
    
cnn = load_data('data/cnn_sample.json')
xsum = load_data('data/xsum_sample.json')
newsroom = load_data('data/newsroom_sample.json')

In [3]:
cnn[0]

{'text': 'Johannesburg (CNN)He checked the series of stills on his camera. It was then that photographer James Oatway realized the entire attack had taken less than two minutes. It was the morning after a night of unrest in Johannesburg\'s Alexandra Township that saw foreign-owned shops looted and destroyed. Mozambican Emmanuel Sithole was walking down a street when four South Africans surrounded him. Sithole pleaded for mercy, but it was already too late. The attackers bludgeoned him with a wrench, stabbed him with knives, all in broad daylight. And Oatway had captured it all on his camera. "They looked like hardened thugs, just by their intensity, the way they moved, the expressions on their faces," Oatway told CNN. "They wanted one thing and that was to kill Emmanuel. They wanted his blood and nothing was going to stop them from doing that." Oatway says he tried to get as close as possible, conscious that the attackers were aware of his presence. "When the attack started I was 20 me

In [4]:
xsum[0]

{'text': "Tulip has been covering the outbreak over the past few months, reporting on the children born with microcephaly and other birth defects believed to be caused by the virus as well as speaking to scientists who are leading the fight back against Zika. Many of you got in touch on Facebook to ask Tulip questions about the new vaccine, and when it might be available to pregnant women. This is an edited version of the Facebook Q&A. Question from Dawn McLean: Have you tested the vaccine already? How do you know if it works or not? Tulip answers: Human trials haven't started for the vaccine being developed by the National Institutes of Health - where I have been reporting from this week. They are due to start - assuming no major problems - in the summer/ autumn of this year. If the vaccine is found to be safe with no concerning side effects, the drug will be given to a larger number of people to test how effective it is and further evaluate its safety. The final phase is giving it to

# Summarizatino models

## BART

In [3]:
bart = pipeline("summarization", model="facebook/bart-large-cnn", device=0)

In [4]:
def run_bart(text, min_length, max_length):
    return bart(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']

In [5]:
%%time

print(run_bart(cnn[0]['text'], max_length=60, min_length=20))

Mozambican Emmanuel Sithole was attacked by four South Africans in Johannesburg. Photographer James Oatway captured the entire attack on his camera. "They wanted his blood and nothing was going to stop them from doing that," he says. Seven people have been killed in the latest
CPU times: user 1.8 s, sys: 72.3 ms, total: 1.87 s
Wall time: 2.1 s


## BRIO

In [16]:
DEVICE = 'cuda:0'

brio_cnn = BartForConditionalGeneration.from_pretrained('Yale-LILY/brio-cnndm-uncased').to(DEVICE)
brio_cnn_tokenizer = BartTokenizer.from_pretrained('Yale-LILY/brio-cnndm-uncased')



In [17]:
brio = PegasusForConditionalGeneration.from_pretrained('Yale-LILY/brio-xsum-cased').to(DEVICE)
brio_tokenizer = PegasusTokenizer.from_pretrained('Yale-LILY/brio-xsum-cased')

In [21]:
def run_brio_cnn(text):
    inputs = brio_cnn_tokenizer([text.lower()], max_length=1024, 
                                return_tensors="pt", truncation=True).to(DEVICE)
    summary_ids = brio_cnn.generate(inputs["input_ids"])
    return brio_cnn_tokenizer.batch_decode(summary_ids, skip_special_tokens=True, 
                                  clean_up_tokenization_spaces=False)[0]

def run_brio(text):
    inputs = brio_tokenizer([text], max_length=512, return_tensors="pt", truncation=True).to(DEVICE)
    summary_ids = brio.generate(inputs["input_ids"])
    return brio_tokenizer.batch_decode(summary_ids, skip_special_tokens=True, 
                                  clean_up_tokenization_spaces=False)[0]

In [22]:
%%time

run_brio_cnn(cnn[0]['text'])

CPU times: user 1.91 s, sys: 7.95 ms, total: 1.92 s
Wall time: 1.91 s


'james oatway captured photos of a south africa attack on a mozambican man in johannesburg. the photographer captured the attack on his camera in less than two minutes. seven people have been killed in the latest round of xenophobic violence against immigrants. police have arrested four suspects in the attack.'

In [23]:
%%time
run_brio(xsum[0]['text'])

CPU times: user 1.72 s, sys: 212 µs, total: 1.72 s
Wall time: 1.91 s


'The World Health Organization (WHO) has declared the Zika virus outbreak in Brazil a global public health emergency, and a new vaccine is being developed.'

# LLMs

In [64]:
# prefix
NEWSROOM_PROMPT_PRE = "Summarize the article below in 2 to 3 sentences:"
CNN_PROMPT_PRE = "Summarize the article below in 3 to 4 sentences:"
XSUM_PROMPT_PRE = "Summarize the article below in 1 sentence:"

# for open ai
NEWSROOM_PROMPT_POST = "Summarize the above article briefly in 2 to 3 sentences."
CNN_PROMPT_POST = "Summarize the above article briefly in 3 to 4 sentences."
XSUM_PROMPT_POST = "Summarize the above article briefly in 1 sentence."


#eleuther
REGULAR_SUMMARY_POST = "Summary:"

# style, for open ai
NEWSROOM_PROMPT_STYLE_POST = "Generate a brief and engaging 2 to 3 sentence summary for the article above."
CNN_PROMPT_STYLE_POST = "Generate a brief and engaging 3 to 4 sentence summary for the article above."
XSUM_PROMPT_STYLE_POST = "Generate a brief and engaging 1 sentence summary for the article above."

In [51]:
def get_prompt(text, prompt, pre=True, end='\n\n'):
    if pre:
        return f"{prompt}\n\n{text}{end}"
    else:
        return f"{text}\n\n{prompt}{end}"

## Eleuther

In [6]:
import torch

eleuther_model_name = "gpt-neo-1.3B"

eleuther_tokenizer = AutoTokenizer.from_pretrained(
    f"EleutherAI/{eleuther_model_name}", 
    padding_side="left", 
    padding='longest', 
    truncation='longest_first', max_length=2000)
eleuther_tokenizer.pad_token = eleuther_tokenizer.eos_token

eleuther_model = AutoModelForCausalLM.from_pretrained(
    f"EleutherAI/{eleuther_model_name}")

device = "cuda" if torch.cuda.is_available() else "cpu"
eleuther_model = eleuther_model.to(device)

In [8]:
def _find_generated_answer(tokens, newline="\n" ): 
    """Our LMs tend to insert initial newline characters before
    they begin generating text. This function ensures that we 
    properly capture the true first line as the answer while
    also ensuring that token probabilities are aligned."""        
    answer_token_indices = []
    char_seen = False            
    for i, tok in enumerate(tokens):
        # This is the main condition: a newline that isn't an initial
        # string of newlines:
        if tok == newline and char_seen:
            break
        # Keep the initial newlines for consistency:
        elif tok == newline and not char_seen:
            answer_token_indices.append(i)
        # Proper tokens:
        elif tok != newline:
            char_seen = True
            answer_token_indices.append(i)
    return answer_token_indices 

In [11]:
def run_eleuther(prompts, temperature=0.1, top_p=0.95, max_new_tokens=100, **generate_kwargs): 
    prompt_ids = eleuther_tokenizer(
        prompts, return_tensors="pt", padding=True).input_ids.to(device)
        
    with torch.inference_mode():
        # Automatic mixed precision if possible.
        with torch.cuda.amp.autocast() if torch.cuda.is_available() else nullcontext():
            model_output = eleuther_model.generate(
                prompt_ids,
                temperature=temperature,
                do_sample=True,
                top_p=top_p,           
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,                
                pad_token_id=eleuther_tokenizer.eos_token_id, 
                return_dict_in_generate=True,
                output_scores=True,
                **generate_kwargs)
        
    # Converting output scores using the helpful recipe here:
    # https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175
    gen_ids = model_output.sequences[:, prompt_ids.shape[-1] :]
    gen_probs = torch.stack(model_output.scores, dim=1).softmax(-1)
    gen_probs = torch.gather(gen_probs, 2, gen_ids[:, :, None]).squeeze(-1)
    
    # Generated texts, including the prompts:
    gen_texts = eleuther_tokenizer.batch_decode(
        model_output.sequences, skip_special_tokens=True)
    
    data = []     
    iterator = zip(prompts, gen_ids, gen_texts, gen_probs)    
    for prompt, gen_id, gen_text, gen_prob in iterator:       
        gen_tokens = eleuther_tokenizer.convert_ids_to_tokens(gen_id)
        generated_text = gen_text[len(prompt): ]
        gen_prob = [float(x) for x in gen_prob.cpu().numpy()] # float for JSON storage
        ans_indices = _find_generated_answer(gen_tokens, newline="Ċ")
        answer_tokens = [gen_tokens[i] for i in ans_indices]
        answer_probs = [gen_prob[i] for i in ans_indices]
        answer = "".join(answer_tokens).replace("Ġ", " ").replace("Ċ", "\n")                                       
        data.append({
            "prompt": prompt,
            "generated_text": generated_text,
            "generated_tokens": gen_tokens,
            "generated_probs": gen_prob,
            "generated_answer": answer,
            "generated_answer_probs": answer_probs,
            "generated_answer_tokens": answer_tokens})                        

    return data

In [55]:
prompts = ['''Johannesburg (CNN)He checked the series of stills on his camera. It was then that photographer James Oatway realized the entire attack had taken less than two minutes. It was the morning after a night of unrest in Johannesburg\'s Alexandra Township that saw foreign-owned shops looted and destroyed. Mozambican Emmanuel Sithole was walking down a street when four South Africans surrounded him. Sithole pleaded for mercy, but it was already too late. The attackers bludgeoned him with a wrench, stabbed him with knives, all in broad daylight. And Oatway had captured it all on his camera. "They looked like hardened thugs, just by their intensity, the way they moved, the expressions on their faces," Oatway told CNN. "They wanted one thing and that was to kill Emmanuel. They wanted his blood and nothing was going to stop them from doing that." Oatway says he tried to get as close as possible, conscious that the attackers were aware of his presence. "When the attack started I was 20 meters (65 feet) away, but at one point I was 4 or 5 meters away," he said. "I did think that maybe they would leave him alone." The attackers finally did move on and leave Sithole alone. Oatway and his fellow journalist Beauregard Tromp quickly put the injured man in the back of the car and rushed him to a hospital, where he later died. "I still remember him looking straight into my eyes," said Oatway. "He had a kind of a dazed, shocked look in his face." Oatway\'s series of images of the ordeal landed on the front page of South Africa\'s Sunday Times under the headline, "Kill thy neighbor: Alex attack brings home SA\'s shame." It\'s a shame that South Africa continues to confront. Seven people have been killed in the latest round of xenophobic violence against poorer immigrants, many from South Africa\'s neighbors. Xenophobic attacks: How did we get here? Local media alleged that the attacks were a consequence of Zulu King Goodwill Zwelithini reportedly saying at a recent gathering that foreigners "should pack their bags and go" because they are taking jobs from citizens. Shortly after his reported comments, violence against immigrants erupted in the port city of Durban. But on Monday, Zwelithini said he had not called for a war on immigrants. "This war I am calling for today is to protect everyone of foreign origin in this country irrespective of which country they are from." The United Nations said the attacks actually began in March after a labor dispute between citizens and foreign workers. OPINION: Labeling South Africa violence as xenophobia misses the point. But it was Oatway\'s photos of the violent attack on Sithole that have seemingly encapsulated the true horror of the situation -- and South Africa\'s leaders have begun to take notice. "Terrible picture. People who live in rough townships have never seen such a scene," said President Jacob Zuma about the photos in a televised news conference. "And I was sitting and I was saying to myself, what are we telling the world about ourselves?" Police announced they\'ve now arrested all four suspects -- the last caught overnight Tuesday -- with help from Oatway\'s photos, which is little solace for the photographer who captured a level of depravity rarely seen. "I\'m sickened by it," said Oatway. "And I\'m extremely angry, angry with the men that did this, and ultimately I\'m upset that our efforts weren\'t successful in saving Emmanuel\'s life."

Summary:
''']

run_eleuther(prompts)[0]['generated_answer']

"\nThe South African government has announced that it will investigate the death of a South African man who was stabbed to death by four men in Johannesburg's Alexandra Township. The attack took place on the morning of August 31, and the South African government has said that it will investigate the incident. The South African government has said that it will investigate the incident."

In [67]:
def run_llm(text, prompt, pre, temperature=0.1, top_p=0.95, max_new_tokens=100):
    prompt = get_prompt(text, prompt, pre, end='\n')
    # print(prompt)
    generated = run_eleuther([prompt],                
                temperature=temperature,
                top_p=top_p,           
                max_new_tokens=max_new_tokens)
    return generated[0]['generated_answer'].strip()

In [74]:
run_llm(cnn[1]['text'], REGULAR_SUMMARY_POST, pre=False, max_new_tokens=150)

'The BBCâĢĻs Wolf Hall, starring Damian Lewis, is a six-part drama series that was first broadcast in the U.S. for the first time. The series is set in the Tudor period in which the Tudor dynasty was in power and the period of religious turmoil that followed the death of Henry VIII. The series is set in the fictional town of Wolf Hall, which is based on the town of Wolf Hall in the English county of Northamptonshire. The series is set in the 16th century and follows the lives of the six main characters: Thomas Cromwell, the hero of the series; Sir Thomas More, the Catholic martyr; Thomas CromwellâĢĻs son, Thomas Cromwell; Sir Thomas'

In [63]:
run_llm(cnn[0]['text'], CNN_PROMPT_STYLE_POST, pre=False, 
       max_new_tokens=100)

'\nThe article is about a man who was attacked by four men in Johannesburg. The article is about the man who was attacked by four men in Johannesburg.'

In [66]:
run_llm(cnn[0]['text'], CNN_PROMPT_PRE, pre=True, 
       max_new_tokens=256)


"\nThe article is a bit long, but it's a good read."

In [72]:
run_llm(xsum[1]['text'], REGULAR_SUMMARY_POST, pre=False, max_new_tokens=60)

'The Association of Directors of Adult Social Services (ADASS) has warned that the government\'s plans to cut funding for social care services are "unendurable".'

In [80]:
run_llm(newsroom[0]['text'], 'Summary:', pre=False, max_new_tokens=100)

'The cause of the outbreak was a new strain of influenza A virus, which had been circulating in the community for several years. The virus was first detected in the New York City area in late October. The virus was first identified in the New York City area in late October. The virus was first identified in the New York City area in late October.'

## T0

In [None]:
t0 = pipeline('text-generation', model='bigscience/T0_3B')

## Open AI

In [35]:
# gpt3
ADA = 'text-ada-001' # cheapest
BABBAGE = 'text-babbage-001'
CURIE = 'text-curie-001'

# gpt3.5
DAVINCI = 'text-davinci-003'


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def run_gpt3(prompts, engine=ADA, temperature=0.1, top_p=0.95, max_tokens=256):
    # add your key here
    openai.api_key = ''
    
    assert engine.startswith("text"), \
        "Please use an engine whose name begins with 'text'."
        
    response = openai.Completion.create(
        engine=engine,       
        prompt=prompts,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        echo=False)
    

    return [ c.text for c in response["choices"]]
         

In [37]:
def run_openai(text, prompt, pre, engine, temperature=0.1):
    prompt = get_prompt(text, prompt, pre)
    res = run_gpt3([prompt], engine, temperature=temperature)
    return res[0].strip()

In [38]:
run_openai(cnn[0]['text'], CNN_PROMPT_POST, pre=False, engine=CURIE)

"Photographer James Oatway captured the attack on Mozambican Emmanuel Sithole that left him dead in broad daylight. The attackers looked like hardened thugs and wanted his blood, and Oatway tried to get as close as possible before they moved on. Seven people have been killed in the latest round of xenophobic violence against poorer immigrants, many from South Africa's neighbors."

In [39]:
run_openai(cnn[0]['text'], CNN_PROMPT_STYLE_POST, pre=False, engine=CURIE, 
           temperature=0.9)

'James Oatway is a photographer who captured the attack on Emmanuel Sithole in Johannesburg\'s Alexandra Township that left him dead. The attackers, all South Africans, appear to have been motivated by xenophobia and killed Sithole with a wrench, knives, and a blunt object in broad daylight. Oatway\'s photos of the attack landed on the front page of South Africa\'s Sunday Times under the headline, "Kill thy neighbor: Alex attack brings home SA\'s shame." Although the attackers have been arrested, Oatway says he\'s sickened by what he captured and is angry with the men who did this.'

# Running on datasets

In [82]:
def run_on_data(name, dataset, run_func):
    sums = [run_func(doc['text']) for doc in tqdm(dataset)] 
    
    with open(f'results/{name}.json', 'w') as f:
        json.dump(sums, f)

In [46]:
run_on_data('cnn_style_curie', cnn, 
    lambda t: run_openai(t, CNN_PROMPT_STYLE_POST, pre=False, engine=CURIE, temperature=0.9))

100%|██████████| 100/100 [01:44<00:00,  1.04s/it]


In [47]:
run_on_data('xsum_style_curie', xsum, 
    lambda t: run_openai(t, XSUM_PROMPT_STYLE_POST, pre=False, engine=CURIE, temperature=0.9))

100%|██████████| 100/100 [01:06<00:00,  1.50it/s]


In [48]:
run_on_data('newsroom_style_curie', newsroom, 
    lambda t: run_openai(t, NEWSROOM_PROMPT_STYLE_POST, pre=False, engine=CURIE, temperature=0.9))

100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


In [49]:
run_on_data('cnn_curie', cnn, 
    lambda t: run_openai(t, CNN_PROMPT_POST, pre=False, engine=CURIE))

100%|██████████| 100/100 [02:05<00:00,  1.26s/it]


In [50]:
run_on_data('xsum_curie', xsum, 
    lambda t: run_openai(t, XSUM_PROMPT_POST, pre=False, engine=CURIE))

100%|██████████| 100/100 [01:07<00:00,  1.49it/s]


In [58]:
run_on_data('newsroom_curie', newsroom, 
    lambda t: run_openai(t, NEWSROOM_PROMPT_POST, pre=False, engine=CURIE))

100%|██████████| 100/100 [01:48<00:00,  1.08s/it]


In [52]:
run_on_data('cnn_style_davinci', cnn, 
    lambda t: run_openai(t, CNN_PROMPT_STYLE_POST, pre=False, engine=DAVINCI, temperature=0.9))

100%|██████████| 100/100 [04:32<00:00,  2.73s/it]


In [53]:
run_on_data('xsum_style_davinci', xsum, 
    lambda t: run_openai(t, XSUM_PROMPT_STYLE_POST, pre=False, engine=DAVINCI, temperature=0.9))

100%|██████████| 100/100 [02:28<00:00,  1.48s/it]


In [54]:
run_on_data('newsroom_style_davinci', newsroom, 
    lambda t: run_openai(t, NEWSROOM_PROMPT_STYLE_POST, pre=False, engine=DAVINCI, temperature=0.9))

100%|██████████| 100/100 [04:02<00:00,  2.43s/it]


In [55]:
run_on_data('cnn_davinci', cnn, 
    lambda t: run_openai(t, CNN_PROMPT_POST, pre=False, engine=DAVINCI))

100%|██████████| 100/100 [05:02<00:00,  3.02s/it]


In [56]:
run_on_data('xsum_davinci', xsum, 
    lambda t: run_openai(t, XSUM_PROMPT_POST, pre=False, engine=DAVINCI))

100%|██████████| 100/100 [02:22<00:00,  1.42s/it]


In [None]:
run_on_data('newsroom_davinci', newsroom, 
    lambda t: run_openai(t, NEWSROOM_PROMPT_POST, pre=False, engine=DAVINCI))

In [None]:
run_on_data('newsroom_brio', newsroom, run_brio)

In [None]:
run_on_data('xsum_brio', xsum, run_brio)

In [None]:
run_on_data('cnn_brio', cnn, run_brio_cnn)

In [None]:
run_on_data('newsroom_bart', newsroom, 
            lambda t: run_bart(t, max_length=256, min_length=30))

In [None]:
run_on_data('xsum_bart', xsum, 
            lambda t: run_bart(t, max_length=40, min_length=10))

In [None]:
run_on_data('cnn_bart', cnn, 
            lambda t: run_bart(t, max_length=20, min_length=60))

In [83]:
run_on_data('xsum_eleuther1b', xsum, 
            lambda t: run_llm(t, REGULAR_SUMMARY_POST, pre=False, 
                              max_new_tokens=60))

  0%|          | 0/100 [00:00<?, ?it/s]

In [84]:
run_on_data('cnn_eleuther1b', cnn, 
            lambda t: run_llm(t, REGULAR_SUMMARY_POST, pre=False, 
                              max_new_tokens=100))

  0%|          | 0/100 [00:00<?, ?it/s]

In [85]:
run_on_data('newsroom_eleuther1b', newsroom, 
            lambda t: run_llm(t, REGULAR_SUMMARY_POST, pre=False, 
                              max_new_tokens=100))

  0%|          | 0/100 [00:00<?, ?it/s]