In [1]:
# Generative AI

In [2]:
! pip install datasets
! pip install transformers


Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [3]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
from os.path import join

In [4]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)
## here we are loading the Dataset

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
example_indices = [40,200]


for i, index in enumerate(example_indices):
  print('Example', i+1)
  print('INPUT DIALOGUE:')
  print(dataset['test'][index]['dialogue'])
  print('BASELINE HUMAN SUMMARY')
  print(dataset['test'][index]['summary'])
  print()
## Dataset

Example 1
INPUT DIALOGUE:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
BASELINE HUMAN SUMMARY
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.

Example 2
INPUT DIALOGUE:
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, 

In [6]:
model_name = 'google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
## Loading the Flan-t5 Model

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
## Using tokenizer

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [8]:
sentence = "What time is it, Tom?"
sentence_encoded = tokenizer(sentence, return_tensors= 'pt')
sentence_decoded = tokenizer.decode(
    sentence_encoded["input_ids"][0],
    skip_special_tokens = True
)
print('ENCODED SENTENCE:')
print(sentence_encoded["input_ids"][0])
print('\nDECODED SENTENCE:')
print(sentence_decoded)
## Using the Tokenizer

ENCODED SENTENCE:
tensor([ 363,   97,   19,   34,    6, 3059,   58,    1])

DECODED SENTENCE:
What time is it, Tom?


In [9]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

  prompt = f"""
Summarize the above conversation.
{dialogue}

Summary:
    """
    #input construcetd prompt instead of dialogue
  inputs = tokenizer(prompt, return_tensors = 'pt')
  output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens = True
  )

  print('Example ', i+1)
  print(f'INPUT PROMPT:\n{prompt}')
  print(f'BASELINE HUMAN SUMMARY:\n{summary}')
  print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

Example  1
INPUT PROMPT:

Summarize the above conversation.
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

Summary:
    
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
MODEL GENERATION - ZERO SHOT:
The train is about to leave.

Example  2
INPUT PROMPT:

Summarize the above conversation.
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want

In [10]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

  prompt = f"""
Dialogue :
{dialogue}

What was going on?
    """
    #input construcetd prompt instead of dialogue
  inputs = tokenizer(prompt, return_tensors = 'pt')
  output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens = True
  )

  print('Example ', i+1)
  print(f'INPUT PROMPT:\n{prompt}')
  print(f'BASELINE HUMAN SUMMARY:\n{summary}')
  print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

Example  1
INPUT PROMPT:

Dialogue :
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What was going on?
    
BASELINE HUMAN SUMMARY:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
MODEL GENERATION - ZERO SHOT:
Tom is late for the train.

Example  2
INPUT PROMPT:

Dialogue :
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it i

In [11]:
# One shot and few shot inference

In [21]:
def make_prompt(example_indices_full, example_index_to_summarize):
  prompt = ''
  for index in example_indices_full:
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']

    prompt += f"""
Dialogue:
{dialogue}

What was going on?
{summary}
"""
  dialogue = dataset['test'][example_index_to_summarize]['dialogue']
  prompt +=f"""
Dialogue:
{dialogue}
What was going on?
"""
  return prompt

In [22]:
example_indices_full = [40]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)
print(one_shot_prompt)


Dialogue:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What was going on?
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.

Dialogue:
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a

In [23]:
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(one_shot_prompt, return_tensors = 'pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens =50,
    )[0],
    skip_special_tokens = True
)

print(f"BASELINE HUMAN SUMMARY:\n{summary}\n")
print(f"MODEL GENERATION-ONE SHOT:\n{output}\n")

BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

MODEL GENERATION-ONE SHOT:
#Person1 wants to upgrade his system. #Person2 wants to add a painting program to his software. #Person1 wants to add a CD-ROM drive.



In [24]:
#Few-Shot Inference

In [25]:
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)
print(few_shot_prompt)


Dialogue:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What was going on?
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.

Dialogue:
#Person1#: May, do you mind helping me prepare for the picnic?
#Person2#: Sure. Have you checked the weather report?
#Person1#: Yes. It says it will be sunny all day. No sign of rain at all. This is your father's favorite sausage. Sandwiches for you and Daniel.
#Person2#: No, thanks Mom. I'd like some toast and chicken wings.
#Person1#: Okay. Please take some fruit salad and crackers for me.
#Person2#: Done. Oh, don't forget to take napkins disposable plates, cups and picnic blanket.
#Person1#: All set. May,

In [26]:
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors = 'pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens =50,
    )[0],
    skip_special_tokens = True
)

print(f"BASELINE HUMAN SUMMARY:\n{summary}\n")
print(f"MODEL GENERATION-FEW SHOT:\n{output}\n")

Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors


BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

MODEL GENERATION-ONE SHOT:
#Person1 wants to upgrade his system. #Person2 wants to add a painting program to his software. #Person1 wants to upgrade his hardware.



In [27]:
# As we can see here there is not much difference between outputs of One shot and Few shot inference.

In [29]:
# For using Generation configurations -
generation_config = GenerationConfig(max_new_tokens = 50, do_sample = True, temperature = 0.1)

summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors = 'pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config = generation_config,
    )[0],
    skip_special_tokens = True
)

print(f"BASELINE HUMAN SUMMARY:\n{summary}\n")
print(f"MODEL GENERATION-ONE SHOT:\n{output}\n")

BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

MODEL GENERATION-ONE SHOT:
#Person1 wants to upgrade his computer. #Person2 wants to add a painting program to his software. #Person1 wants to upgrade his hardware.

