# Workshop 1 - Summarization 

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

In [2]:
# Hugging Face Hub id of the checkpoint used in the first half of the notebook.
model_name = "sshleifer/distilbart-cnn-12-6"

In [3]:
# Load the tokenizer and the seq2seq model for the checkpoint named in
# `model_name`; both are looked up under the same id so they stay in sync.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [6]:
# Passage to summarize: a four-stanza poem kept verbatim.
# NOTE: the literal begins with a space and a newline right after the opening
# quotes and ends with a newline; that whitespace is part of the string and is
# therefore part of whatever gets tokenized later.
text = """ 
Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth;

Then took the other, as just as fair,
And having perhaps the better claim,
Because it was grassy and wanted wear;
Though as for that the passing there
Had worn them really about the same,

And both that morning equally lay
In leaves no step had trodden black.
Oh, I kept the first for another day!
Yet knowing how way leads on to way,
I doubted if I should ever come back.

I shall be telling this with a sigh
Somewhere ages and ages hence:
Two roads diverged in a wood, and I—
I took the one less traveled by,
And that has made all the difference.
"""

In [7]:
# Build the prompt: an instruction line, the passage, then a "Summary:" cue.
# The pieces are spelled out one per line so the whitespace that ends up in the
# prompt (leading space + newline, blank line before the cue, trailing newline)
# is visible instead of hidden inside a triple-quoted block.
prompt = (
    " \n"
    "Summarize this article.\n"
    f"{text}\n"
    "\n"
    "Summary:\n"
)

In [8]:
# Encode the prompt as PyTorch tensors and show the resulting encoding.
prompt_enc = tokenizer(
    prompt,
    return_tensors="pt",
)

print(prompt_enc)


{'input_ids': tensor([[    0,  1437, 50118, 38182,  3916,  2072,    42,  1566,     4, 50118,
          1437, 50118,  9058,  3197, 13105,  4462,    11,    10,  5718,  5627,
             6, 50118,  2409,  6661,    38,   115,    45,  1504,   258, 50118,
          2409,    28,    65, 33489,     6,   251,    38,  3359, 50118,  2409,
          1415,   159,    65,    25,   444,    25,    38,   115, 50118,  3972,
           147,    24, 18822,    11,     5,   223, 14596,   131, 50118, 50118,
         12948,   362,     5,    97,     6,    25,    95,    25,  2105,     6,
         50118,  2409,   519,  2532,     5,   357,  2026,     6, 50118, 10105,
            24,    21,  6964,   219,     8,   770,  3568,   131, 50118, 26223,
            25,    13,    14,     5,  3133,    89, 50118, 32054, 10610,   106,
           269,    59,     5,   276,     6, 50118, 50118,  2409,   258,    14,
           662,  6681,  4477, 50118,  1121,  3607,   117,  1149,    56, 14168,
         39893,   909,     4, 50118,  

In [17]:
# Generate the summary with the model.
#
# The tokenizer output is unpacked into generate() so the attention mask is
# passed along with the input ids instead of being left for generate() to
# infer. Only the answer length is bounded here (5 to 20 new tokens); no other
# decoding option is overridden, so the model's own defaults apply.
#
# NOTE: this cell previously built GenerationConfig(min_length=5, max_length=10)
# without ever passing it to generate(). It had no effect and its limits
# contradicted the ones below, so it was removed.
answer_enc = model.generate(
    **prompt_enc,
    min_new_tokens=5,
    max_new_tokens=20,
)

print(answer_enc)

tensor([[    2,     0,  1596,  3197, 13105,  4462,    11,    10,  5627,     6,
             8,    38,   578,   100,   578,   362,     5,    65,   540,  8468,
             2]])


In [18]:
# Turn the first (and only) generated sequence back into text, leaving out
# special tokens, then show it.
answer = tokenizer.decode(
    answer_enc[0],
    skip_special_tokens=True,
)

print(answer)

 Two roads diverged in a wood, and I—I— took the one less traveled


## T5 Models

<code>flan-t5</code> is a Text-To-Text Transfer Transformer (T5) that is capable of performing zero-shot NLP tasks such as summarization, simple reasoning, and question answering.

Some FLAN-T5 models on Hugging Face:
- [<code>google/flan-t5-base</code>](https://huggingface.co/google/flan-t5-base)
- [<code>google/flan-t5-small</code>](https://huggingface.co/google/flan-t5-small)
- [<code>google/flan-t5-xl</code>](https://huggingface.co/google/flan-t5-xl)
- [<code>google/flan-t5-xxl</code>](https://huggingface.co/google/flan-t5-xxl) - full model

Complete list of [T5 models](https://huggingface.co/models?search=google/flan) on Hugging Face.

In [19]:
# Repeat the summarization with the google/flan-t5-base checkpoint.
# The tokenizer and the model are both loaded from the same Hub id.
t5_model = "google/flan-t5-base"

tokenizer_t5 = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=t5_model,
)
model_t5 = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=t5_model,
)



In [20]:
# Encode the same prompt with the T5 tokenizer (PyTorch tensors) and show it.
prompt_t5_enc = tokenizer_t5(
    prompt,
    return_tensors="pt",
)
print(prompt_t5_enc)

{'input_ids': tensor([[12198,  1635,  1737,    48,  1108,     5,  2759,  7540, 12355,  5402,
            16,     3,     9,  4459,  1679,     6,   275,  8032,    27,   228,
            59,  1111,   321,   275,    36,    80,  1111,    49,     6,   307,
            27,  8190,   275,  2299,   323,    80,    38,   623,    38,    27,
           228,   304,   213,    34, 21222,    16,     8,   365, 24690,   117,
            37,    29,   808,     8,   119,     6,    38,   131,    38,  2725,
             6,   275,   578,  2361,     8,   394,  1988,     6,  2070,    34,
            47,  5956,    63,    11,  1114,  2112,   117,  4229,    38,    21,
            24,     8,  5792,   132, 10118,  8842,   135,   310,    81,     8,
           337,     6,   275,   321,    24,  1379,  7509,  8260,    86,  3231,
           150,  1147,   141, 10968,    26,   537,  1001,     5,  3359,     6,
            27,  2697,     8,   166,    21,   430,   239,    55,  5201,  4265,
           149,   194,  3433,    30,  

In [21]:
# Generate the T5 summary.
#
# The whole tokenizer output is unpacked so generate() receives the attention
# mask together with the input ids rather than input ids alone. The answer is
# bounded to between 5 and 20 new tokens; no other decoding option is
# overridden, so the model's own defaults apply.
answer_t5_enc = model_t5.generate(
    **prompt_t5_enc,
    min_new_tokens=5,
    max_new_tokens=20,
)
print(answer_t5_enc)

tensor([[    0,    37,    29,    27,  8190,     6,    11,  2299,   323,    80,
            38,   623,    38,    27,   228,     6,   304,   213,    34, 21222,
            16]])


In [22]:
# Decode the first (and only) generated T5 sequence to text, leaving out
# special tokens, then show it.
answer_t5 = tokenizer_t5.decode(
    answer_t5_enc[0],
    skip_special_tokens=True,
)

print(answer_t5)

Then I stood, and looked down one as far as I could, To where it bent in
