In [17]:
# %pip install --upgrade pip
# %pip install --disable-pip-version-check \
#     torch \
#     torchdata --quiet

# %pip install \
#     transformers \
#     datasets  --quiet

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset identifier handed to `load_dataset`.
dataset_name = 'knkarthick/dialogsum'
# Later cells read rows as data['test'][i] and use their 'dialogue' and
# 'summary' fields.
data = load_dataset(dataset_name)

In [3]:
# Rows of the test split used as running examples in the cells below.
example_indices = [40, 200]
# Horizontal separator for printed output: 99 dashes.
dash_line = '-' * 99

In [4]:
# Print each selected test example: the dialogue followed by the
# human-written reference summary.
for example_number, index in enumerate(example_indices, start=1):
    record = data['test'][index]
    print(dash_line)
    print('Example: ', example_number)
    print(dash_line)
    print('Input Dialog: ')
    print(record['dialogue'])
    print(dash_line)
    print('Human Summary: ')
    print(record['summary'])

---------------------------------------------------------------------------------------------------
Example:  1
---------------------------------------------------------------------------------------------------
Input Dialog: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Human Summary: 
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Example:  2
---------------------------------------------------------------------------------------------------


In [5]:
# Checkpoint name; the tokenizer cell below loads from the same name.
model_name = 'google/flan-t5-base'
# Sequence-to-sequence model used for every generated summary in this notebook.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [6]:
# Tokenizer loaded from the same checkpoint name as `model`;
# use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)

In [7]:
# Tokenizer sanity check: encode a sentence, decode the ids back to text,
# and print both so the round trip can be inspected.
sent = 'I love Machine Learing'
sent_encode = tokenizer(sent, return_tensors='pt')
token_ids = sent_encode['input_ids'][0]  # first (and only) sequence
sent_decode = tokenizer.decode(token_ids, skip_special_tokens=True)
print('Sentence Encoding: ')
print(token_ids)
print('Sentence Decoding: ')
print(sent_decode)

Sentence Encoding: 
tensor([  27,  333, 5879,  312,    9, 1007,    1])
Sentence Decoding: 
I love Machine Learing


#### Without prompt engineering

In [8]:
# Baseline: feed the raw dialogue to the model with no instruction text and
# compare its output with the human-written summary.
for i, index in enumerate(example_indices):
    example = data['test'][index]
    dialog = example['dialogue']
    summary = example['summary']
    # Named `inputs` (not `input`) so the built-in input() is not shadowed
    # for the rest of the kernel session.
    inputs = tokenizer(dialog, return_tensors='pt')
    # Pass the tokenizer's attention mask explicitly rather than relying on
    # generate() to infer it.
    generated_ids = model.generate(inputs['input_ids'],
                                   attention_mask=inputs['attention_mask'],
                                   max_new_tokens=50)[0]
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(dash_line)
    print('Example: ', i + 1)
    print(dash_line)
    print('dialogue: ')
    print(dialog)
    print(dash_line)
    print('Baseline Summary: ')
    print(summary)
    print(dash_line)
    print('Model Summary: ')
    print(output)

---------------------------------------------------------------------------------------------------
Example:  1
---------------------------------------------------------------------------------------------------
dialogue: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Baseline Summary: 
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Model Summary: 
Person1: It's ten to nine.
----------------------------------------------------------------------

#### Prompt engineering - zero-shot inference

In [9]:
def prompt_fun(example_indices_to_pass, example_indices_to_summarize, input = 'dialogue:', task = 'whats was going on?', dataset = None):
    """Build a zero-, one- or few-shot summarization prompt.

    Each index in ``example_indices_to_pass`` contributes a solved example
    (instruction, dialogue, task cue, reference summary). The dialogue at
    ``example_indices_to_summarize`` is appended last with the task cue but
    no summary, so the model is left to complete it.

    Args:
        example_indices_to_pass: Iterable of row indices in the 'test' split
            used as solved examples. An empty iterable gives a zero-shot prompt.
        example_indices_to_summarize: Single row index in the 'test' split
            whose dialogue should be summarized.
        input: Instruction text placed before every dialogue. The name
            shadows the built-in inside this function only; it is kept
            because callers pass it by keyword.
        task: Cue text placed after every dialogue.
        dataset: Optional mapping with a 'test' entry whose rows have
            'dialogue' and 'summary' fields. Defaults to the notebook-level
            ``data`` so existing calls keep working.

    Returns:
        The assembled prompt string.
    """
    # Resolve the rows once. The global is only read when no dataset is
    # supplied, so the function can be exercised without loading anything.
    examples = (data if dataset is None else dataset)['test']
    prompt = ''
    for index in example_indices_to_pass:
        dialogue = examples[index]['dialogue']
        summary = examples[index]['summary']
        prompt += f""" 
                    {input} 
                    {dialogue}
                    {task}
                    {summary}
                  """
    # The target dialogue goes last and deliberately has no summary.
    dialogue = examples[example_indices_to_summarize]['dialogue']
    prompt += f""" 
                {input} 
                {dialogue}
                {task}
              """
    return prompt

In [10]:
# Zero-shot: no solved examples, only an explicit summarization instruction
# wrapped around the dialogue.
for i, index in enumerate(example_indices):
    example = data['test'][index]
    prompt = prompt_fun([], index, input = 'summarize the following conversations :', task = 'summary :')
    # Named `inputs` (not `input`) so the built-in input() is not shadowed
    # for the rest of the kernel session.
    inputs = tokenizer(prompt, return_tensors='pt')
    # Pass the tokenizer's attention mask explicitly rather than relying on
    # generate() to infer it.
    generated_ids = model.generate(inputs['input_ids'],
                                   attention_mask=inputs['attention_mask'],
                                   max_new_tokens=50)[0]
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(dash_line)
    print('Example: ', i + 1)
    print(dash_line)
    print('dialogue: ')
    print(example['dialogue'])
    print(dash_line)
    print('Baseline Summary: ')
    print(example['summary'])
    print(dash_line)
    print('Model Summary: ')
    print(output)

---------------------------------------------------------------------------------------------------
Example:  1
---------------------------------------------------------------------------------------------------
dialogue: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Baseline Summary: 
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Model Summary: 
The train is about to leave.
--------------------------------------------------------------------

In [11]:
# Zero-shot again, this time with prompt_fun's default instruction and task
# wording, to show how the prompt text changes the generated summary.
for i, index in enumerate(example_indices):
    example = data['test'][index]
    prompt = prompt_fun([], index)
    # Named `inputs` (not `input`) so the built-in input() is not shadowed
    # for the rest of the kernel session.
    inputs = tokenizer(prompt, return_tensors='pt')
    # Pass the tokenizer's attention mask explicitly rather than relying on
    # generate() to infer it.
    generated_ids = model.generate(inputs['input_ids'],
                                   attention_mask=inputs['attention_mask'],
                                   max_new_tokens=50)[0]
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(dash_line)
    print('Example: ', i + 1)
    print(dash_line)
    print('dialogue: ')
    print(example['dialogue'])
    print(dash_line)
    print('Baseline Summary: ')
    print(example['summary'])
    print(dash_line)
    print('Model Summary: ')
    print(output)

---------------------------------------------------------------------------------------------------
Example:  1
---------------------------------------------------------------------------------------------------
dialogue: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Baseline Summary: 
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Model Summary: 
Tom is late for work. He has to catch the nine-thirty train.
------------------------------------

#### Prompt engineering - one-shot inference

In [12]:
# One-shot: a single solved example precedes the dialogue to summarize.
example_indices_to_pass = [40]
example_indices_to_summarize = 200

prompt = prompt_fun(example_indices_to_pass, example_indices_to_summarize)
# Named `inputs` (not `input`) so the built-in input() is not shadowed for
# the rest of the kernel session.
inputs = tokenizer(prompt, return_tensors='pt')
# Pass the tokenizer's attention mask explicitly rather than relying on
# generate() to infer it.
generated_ids = model.generate(inputs['input_ids'],
                               attention_mask=inputs['attention_mask'],
                               max_new_tokens=50)[0]
output = tokenizer.decode(generated_ids, skip_special_tokens=True)

target = data['test'][example_indices_to_summarize]
print(dash_line)
print('dialogue: ')
print(target['dialogue'])
print(dash_line)
print('Baseline Summary: ')
print(target['summary'])
print(dash_line)
print('Model Summary: ')
print(output)

---------------------------------------------------------------------------------------------------
dialogue: 
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
---------------------------------------------------------------------------------------------------
Baseline Summary: 
#P

#### Prompt engineering - few-shot inference

In [13]:
# Few-shot: several solved examples precede the dialogue to summarize.
# Roughly 3-5 examples is the usual range to try.
example_indices_to_pass = [18, 40, 180]
example_indices_to_summarize = 200

prompt = prompt_fun(example_indices_to_pass, example_indices_to_summarize)
# NOTE: with three examples this prompt tokenizes to more tokens than the
# tokenizer's stated maximum, so the tokenizer emits a length warning.
# The prompt is intentionally not truncated: the dialogue to summarize sits
# at the end of the prompt, so truncation would cut off the actual task.
# Named `inputs` (not `input`) so the built-in input() is not shadowed for
# the rest of the kernel session.
inputs = tokenizer(prompt, return_tensors='pt')
# Pass the tokenizer's attention mask explicitly rather than relying on
# generate() to infer it.
generated_ids = model.generate(inputs['input_ids'],
                               attention_mask=inputs['attention_mask'],
                               max_new_tokens=50)[0]
output = tokenizer.decode(generated_ids, skip_special_tokens=True)

target = data['test'][example_indices_to_summarize]
print(dash_line)
print('dialogue: ')
print(target['dialogue'])
print(dash_line)
print('Baseline Summary: ')
print(target['summary'])
print(dash_line)
print('Model Summary: ')
print(output)

Token indices sequence length is longer than the specified maximum sequence length for this model (850 > 512). Running this sequence through the model will result in indexing errors


---------------------------------------------------------------------------------------------------
dialogue: 
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
---------------------------------------------------------------------------------------------------
Baseline Summary: 
#P

#### Inference with generation configuration parameters

In [14]:
# Sampling settings for generation. The length limit keyword is
# `max_new_tokens`; the previous spelling `new_max_tokens` is not a
# generation parameter, so the limit was never applied through this config.
generation_config = GenerationConfig(max_new_tokens = 50,
                                     do_sample = True, #default = false
                                     temperature = 0.5, #default = 1.0
                                     top_k = 50, #default = 50
                                     top_p = 1.0)#default = 1.0

In [15]:
# One-shot prompt again, decoded with the sampling settings held in
# `generation_config` instead of the model defaults.
example_indices_to_pass = [40]
example_indices_to_summarize = 200

prompt = prompt_fun(example_indices_to_pass, example_indices_to_summarize)
# Named `inputs` (not `input`) so the built-in input() is not shadowed for
# the rest of the kernel session.
inputs = tokenizer(prompt, return_tensors='pt')
# max_new_tokens is passed here as well, so the length limit applies on
# this call directly. The attention mask is passed explicitly rather than
# relying on generate() to infer it.
generated_ids = model.generate(inputs['input_ids'],
                               attention_mask=inputs['attention_mask'],
                               max_new_tokens=50,
                               generation_config=generation_config)[0]
output = tokenizer.decode(generated_ids, skip_special_tokens=True)

target = data['test'][example_indices_to_summarize]
print(dash_line)
print('dialogue: ')
print(target['dialogue'])
print(dash_line)
print('Baseline Summary: ')
print(target['summary'])
print(dash_line)
print('Model Summary: ')
print(output)

---------------------------------------------------------------------------------------------------
dialogue: 
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
---------------------------------------------------------------------------------------------------
Baseline Summary: 
#P