In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

In [3]:
hugginface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(hugginface_dataset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [5]:
dataset['train'][5]['dialogue']

"#Person1#: Happy birthday, Aims!\n#Person2#: Thank you, Lisa.\n#Person1#: Here is a present for you. I hope you like it.\n#Person2#: Oh, great! I love it! You know I've been expecting this for a long time.\n#Person1#: I'm very glad to hear that.\n#Person2#: Come here ; let me introduce some friends to you."

In [6]:
print(dataset['train'][5]['dialogue'])

#Person1#: Happy birthday, Aims!
#Person2#: Thank you, Lisa.
#Person1#: Here is a present for you. I hope you like it.
#Person2#: Oh, great! I love it! You know I've been expecting this for a long time.
#Person1#: I'm very glad to hear that.
#Person2#: Come here ; let me introduce some friends to you.


In [7]:
print(f"Human summary: {dataset['train'][5]['summary']}")

Human summary: Lisa gives Aims a birthday present and Aims loves it.


In [8]:
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [10]:
sentence ="What time is it, Tom?"

sentence_encoded = tokenizer(sentence, return_tensors='pt')

print(sentence_encoded)

{'input_ids': tensor([[ 363,   97,   19,   34,    6, 3059,   58,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [11]:
sentence_decoded = tokenizer.decode(
    sentence_encoded["input_ids"][0],
    skip_special_tokens=True
)

print(sentence_decoded)

What time is it, Tom?


In [12]:
print("Encoded:")
print(sentence_encoded["input_ids"][0])
print("---------------------------------------------------")
print("Decoded:")
print(sentence_decoded)

Encoded:
tensor([ 363,   97,   19,   34,    6, 3059,   58,    1])
---------------------------------------------------
Decoded:
What time is it, Tom?


# sample code for ZERO SHOT

In [13]:
 model_name = 'facebook/mbart-large-50-many-to-many-mmt'
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
 tokenizer.tgt_lang = "fa_IR"

 prompt = f"""
  Summarize the following Persian text in Persian:
آموزش رانندگی فرآیندی مرحله‌به‌مرحله است که با آشنایی با قوانین راهنمایی و رانندگی و تابلوهای راه آغاز می‌شود و سپس به یادگیری مهارت‌های عملی نظیر کنترل فرمان، استفاده صحیح از پدال‌ها، رعایت فاصله ایمن، استفاده از آینه‌ها، دنده‌کشی و پارک کردن در شرایط مختلف می‌پردازد؛ در این مسیر، هنرجو باید توانایی تشخیص موقعیت‌های خطرناک، تصمیم‌گیری سریع و رعایت ادب و احترام در رانندگی را کسب کند و با تمرین مستمر تحت نظر مربی مجرب، آمادگی لازم برای شرکت در آزمون‌های آیین‌نامه و عملی را پیدا کند تا در نهایت بتواند به‌عنوان یک راننده مسئول، ایمن و قانون‌مدار در جاده‌ها تردد کند.


 """

model.config.forced_bos_token_id = tokenizer.lang_code_to_id["fa_IR"]

inputs = tokenizer(prompt, return_tensors = 'pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0],
    skip_special_tokens=True
)

print(f'MODEL GENERATION - ZERO SHOT:{output}')


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]



MODEL GENERATION - ZERO SHOT:در این مسیر ، هنرجو باید توانایی تشخیص موقعیت های خطرناک ، تصمیم گیری سریع و رعایت ادب و احترام در رانندگی را کسب کند و با تمرین مستمر تحت نظر مربی مجرب ، آمادگی لازم برای شرکت در آزمون های آیین نامه و عملی را


# sample code for ZERO SHOT

In [18]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

example_indices = [20,40]

for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

  # Sample Prompt
  prompt = f"""

Summurize the following conversation

  {dialogue}

Summary:

  """

  inputs = tokenizer(prompt, return_tensors='pt')

  output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0],
    skip_special_tokens=True
  )

  print("---------------------------------------------")
  print('Example', i + 1)
  print(f'INPUT PROMPT: {prompt}')
  print("---------------------------------------------")
  print(f'Human Base Summray: {summary}')
  print("---------------------------------------------")
  print(f'*** Model Generation - Zero - Shot: {output}')


---------------------------------------------
Example 1
INPUT PROMPT: 

Summurize the following conversation

  #Person1#: What's wrong with you? Why are you scratching so much?
#Person2#: I feel itchy! I can't stand it anymore! I think I may be coming down with something. I feel lightheaded and weak.
#Person1#: Let me have a look. Whoa! Get away from me!
#Person2#: What's wrong?
#Person1#: I think you have chicken pox! You are contagious! Get away! Don't breathe on me!
#Person2#: Maybe it's just a rash or an allergy! We can't be sure until I see a doctor.
#Person1#: Well in the meantime you are a biohazard! I didn't get it when I was a kid and I've heard that you can even die if you get it as an adult!
#Person2#: Are you serious? You always blow things out of proportion. In any case, I think I'll go take an oatmeal bath.

Summary:

  
---------------------------------------------
Human Base Summray: #Person1# thinks #Person2# has chicken pox and warns #Person2# about the possible haza

# sample code for ZERO SHOT

In [41]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

# Sample Prompt
  prompt = f"""

Dialogue :

  {dialogue}

What was going on?

  """
  inputs = tokenizer(prompt, return_tensors='pt')

  output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )[0],
    skip_special_tokens=True
  )

  print("---------------------------------------------")
  print('Example', i + 1)
  print(f'INPUT PROMPT: {prompt}')
  print("---------------------------------------------")
  print(f'Human Base Summray: {summary}')
  print("---------------------------------------------")
  print(f'*** Model Generation - Zero - Shot: {output}')

---------------------------------------------
Example 1
INPUT PROMPT: 

Dialogue :

  #Person1#: What's wrong with you? Why are you scratching so much?
#Person2#: I feel itchy! I can't stand it anymore! I think I may be coming down with something. I feel lightheaded and weak.
#Person1#: Let me have a look. Whoa! Get away from me!
#Person2#: What's wrong?
#Person1#: I think you have chicken pox! You are contagious! Get away! Don't breathe on me!
#Person2#: Maybe it's just a rash or an allergy! We can't be sure until I see a doctor.
#Person1#: Well in the meantime you are a biohazard! I didn't get it when I was a kid and I've heard that you can even die if you get it as an adult!
#Person2#: Are you serious? You always blow things out of proportion. In any case, I think I'll go take an oatmeal bath.

What was going on?

  
---------------------------------------------
Human Base Summray: #Person1# thinks #Person2# has chicken pox and warns #Person2# about the possible hazards but #Person2

# Defind Prompt Function

In [42]:
def sample_prompt(sample_indices, sample_index_to_summarize):
  prompt = ''
  for index in sample_indices:
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']

    prompt = f"""

    Dialogue :

      {dialogue}

    What was going on?
    {summary}
      """
  dialogue = dataset['test'][sample_index_to_summarize]['dialogue']

  prompt += f"""

  Dialogue :

    {dialogue}

  What was going on?

    """

  return prompt


# sample code for ONE SHOT

In [43]:
sample_indices=[22]
sample_index_to_summarize = 200

one_shot_prompt = sample_prompt(sample_indices, sample_index_to_summarize)

print(one_shot_prompt)



    Dialogue :

      #Person1#: Good coming. What can I do for you?
#Person2#: I'm in Room 309. I'm checking out today. Can I have my bill now?
#Person1#: Certainly. Please wait a moment. Here you are.
#Person2#: Thanks. Wait... What's this? The 30 dollar for?
#Person1#: Excuse me... The charge for your laundry service on Nov. 20th.
#Person2#: But I did't take any laundry service during my stay here. I think you have added someone else's.
#Person1#: Ummmm...Sorry, would you mind waiting a moment? We check it with the department concerned.
#Person2#: No. As long as we get this straightened out.
#Person1#: I'm very sorry. There has been a mistake. We'll correct the bill. Please take a look.
#Person2#: Okay, here you are.
#Person1#: Goodbye.

    What was going on?
    #Person1# helps #Person2# correct a mischarged bill on laundry service and helps #Person2# check out.
      

  Dialogue :

    #Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what 

In [34]:
sample_indices=[22]
sample_index_to_summarize = 200

summary = dataset['test'][sample_index_to_summarize]['summary']

one_shot_prompt = sample_prompt(sample_indices, sample_index_to_summarize)

inputs = tokenizer(one_shot_prompt, return_tensors='pt')

output = tokenizer.decode(
  model.generate(
      inputs["input_ids"],
      max_new_tokens=50,
  )[0],
  skip_special_tokens=True
)


print("---------------------------------------------")
print(f'Human Base Summray: {summary}')
print("---------------------------------------------")
print(f'*** Model Generation - ONE - Shot: {output}')

---------------------------------------------
Human Base Summray: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------
*** Model Generation - ONE - Shot: #Person1 wants to upgrade his computer. #Person2 wants to upgrade his hardware.


# Sample code for FEW SHOT

In [35]:
sample_indices=[200, 60 ,150]
sample_index_to_summarize = 200

few_shot_prompt = sample_prompt(sample_indices, sample_index_to_summarize)

print(few_shot_prompt)



    Dialogue :

      #Person1#: Taxi!
#Person2#: Where will you go, sir?
#Person1#: Friendship Hotel.
#Person2#: OK, it's not far from here.
#Person1#: I have something important to do, can you fast the speed?
#Person2#: Sure, I'll try my best. Here we are.
#Person1#: It's fast! How much should I pay you?
#Person2#: The reading on the meter is 15 yuan.
#Person1#: Here's 20 yuan, keep the change.
#Person2#: Thank you very much.

    What was going on?
    #Person1# takes a taxi to the Friendship Hotel for something important.
      

  Dialogue :

    #Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: Y

In [36]:
sample_indices=[200, 60,150]
sample_index_to_summarize = 200

summary = dataset['test'][sample_index_to_summarize]['summary']

few_shot_prompt = sample_prompt(sample_indices, sample_index_to_summarize)

inputs = tokenizer(few_shot_prompt, return_tensors='pt')

output = tokenizer.decode(
  model.generate(
      inputs["input_ids"],
      max_new_tokens=50,
  )[0],
  skip_special_tokens=True
)


print("---------------------------------------------")
print(f'Human Base Summray: {summary}')
print("---------------------------------------------")
print(f'*** Model Generation - FEW - Shot: {output}')

---------------------------------------------
Human Base Summray: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------
*** Model Generation - FEW - Shot: #Person1 wants to upgrade his system. #Person2 wants to add a painting program to his software. #Person1 wants to add a CD-ROM drive.
