In [2]:
## Objective:

# Este notebook tem o objetivo de rodar um algoritmo que realiza o resumo de um texto que lhe é apresentado.
# Exemplo 1: 
# Texto (entrada): Hoje o dia está ensolarado e é necessário, dado que o calor pode ressecar as plantas, molhá-las.
# Resumo (output): O dia está ensolarado, é necessário molhar as plantas.
# Exemplo 2: 
# Texto (entrada): Ontem tivemos que sair mais tarde, porque havia muito trabalho a ser feito no escritório. Com isso, levando em consideração que pegamos trânsito no caminho de volta para casa, 
# chegamos após o horário do jantar.
# Resumo (output): Ontem, devido ao volume de trabalho, chegamos após o horário do jantar.

In [3]:
# Sanity check: confirm the notebook is running on the expected instance type
# before doing any heavy work.
import os

instance_type_expected = 'ml-m5-2xlarge'
# HOSTNAME may be unset; default to '' so the membership test below fails with
# the assertion message instead of raising TypeError on None.
instance_type_current = os.environ.get('HOSTNAME', '')

print('expected instance:', instance_type_expected)
print('current instance:', instance_type_current)

# The expected type only needs to appear somewhere inside the host name.
assert instance_type_expected in instance_type_current, 'Error: you selected the wrong instance, please select the correct before start'
print('Instance type has been choose correctly')

expected instance: ml-m5-2xlarge
current instance: instance-datascience-ml-m5-2xlarge
Instance type has been choose correctly


In [4]:
# Installing librarys
! pip install -U datasets==2.17.0

! pip install --upgrade pip
! pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

!pip install \
    transformers==4.27.2 --quiet

[0m

In [5]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

In [6]:
# 1. Summarizer model

In [7]:
# Load the dialogue-summarization dataset by its Hugging Face dataset name.
hugingface_dataset_name = 'knkarthick/dialogsum'
dataset = load_dataset(
    hugingface_dataset_name,
)

In [8]:
# Print some examples of dialogues in the dataset

In [9]:
# Show two training examples: the dialogue and its human-written summary.
train_split = dataset['train']
for i in [0, 50]:
    # Index the row first (as the later cells do with dataset['test'][0])
    # instead of fetching the whole column and then picking one element.
    example = train_split[i]
    print('-'*100)
    print('example', i)
    print('input dialogue')
    print(example['dialogue'])
    print('baseline human summary')
    print(example['summary'])

----------------------------------------------------------------------------------------------------
example 0
input dialogue
#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medications that might help. I'll give you more in

In [10]:
# Instantiate the pretrained seq2seq model from the FLAN-T5 base checkpoint.
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
# Build the tokenizer that matches `model_name`, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [13]:
# Round-trip check: encode one dialogue to token ids, then decode it back to text.
# Index the row first rather than fetching the whole 'dialogue' column.
test_phrase = dataset['train'][0]['dialogue']

sentence_encoded = tokenizer(test_phrase, return_tensors='pt')
# The keyword is `skip_special_tokens`; the previous spelling
# (`skep_special_tokens`) is not a decode option, so the intended stripping of
# special tokens did not take effect.
sentence_decoded = tokenizer.decode(sentence_encoded['input_ids'][0], skip_special_tokens=True)

print('Encoded sentence', sentence_encoded['input_ids'][0])
print('Decoded sentence', sentence_decoded)


Encoded sentence tensor([ 1713,   345, 13515,   536,  4663,    10,  2018,     6,  1363,     5,
         3931,     5,    27,    31,    51,  7582, 12833,    77,     7,     5,
         1615,    33,    25,   270,   469,    58,  1713,   345, 13515,   357,
         4663,    10,    27,   435,    34,   133,    36,     3,     9,   207,
          800,    12,   129,     3,     9,   691,    18,   413,     5,  1713,
          345, 13515,   536,  4663,    10,  2163,     6,   168,     6,    25,
           43,    29,    31,    17,   141,    80,    21,   305,   203,     5,
          148,   225,    43,    80,   334,   215,     5,  1713,   345, 13515,
          357,  4663,    10,    27,   214,     5,    27,  2320,    38,   307,
           38,   132,    19,  1327,  1786,     6,   572,   281,   217,     8,
         2472,    58,  1713,   345, 13515,   536,  4663,    10,  1548,     6,
            8,   200,   194,    12,  1792,  2261, 21154,    19,    12,   253,
           91,    81,   135,   778,     5,   26

In [16]:
# Baseline: feed the raw dialogue to the model with no instruction around it
# (no prompt engineering) and see what it generates.
phrase = dataset['test'][0]['dialogue']
summary = dataset['test'][0]['summary']

# Tokenize to PyTorch tensors, generate at most 50 new tokens, and decode the
# first generated sequence back to text.
inputs = tokenizer(phrase, return_tensors='pt')
outputs = tokenizer.decode(
    model.generate(
        inputs['input_ids'], max_new_tokens=50
    )[0],
    skip_special_tokens=True
    )

print('Example:')
print(phrase)
# This is the human reference summary, so label it 'Summary:' like the
# zero-shot and one-shot cells do (it was previously labelled 'Inputs:').
print('Summary:')
print(summary)
print('Outputs:')
print(outputs)

Example:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue with the

In [None]:
# Parece que o resumo gerado pelo modelo não corresponde ao diálogo.

In [19]:
# 2. Summarize dialogue with prompt engineering

In [20]:
# 2.2 Zero Shot Learning

In [22]:
# Zero-shot inference: wrap the dialogue in an explicit instruction and check
# how the model responds to the prompt.
phrase = dataset['test'][0]['dialogue']
summary = dataset['test'][0]['summary']

# The prompt lines are kept flush-left on purpose: any indentation written
# inside the triple-quoted f-string becomes part of the text sent to the model.
prompt = f"""Summarize the following conversation.

{phrase}

Summary:
"""

# Tokenize to PyTorch tensors, generate at most 50 new tokens, and decode the
# first generated sequence back to text.
inputs = tokenizer(prompt, return_tensors='pt')
outputs = tokenizer.decode(
    model.generate(
        inputs['input_ids'], max_new_tokens=50
    )[0],
    skip_special_tokens=True
    )

print('Example:')
print(phrase)
print('Summary:')
print(summary)
print('Outputs:')
print(outputs)

Example:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue with the

In [23]:
# Este resumo está muito melhor do que o último!

In [24]:
# Prompt engineering with the FLAN-T5 templates

In [25]:
# The FLAN-T5 model has many prompt-engineering templates [here](https://github.com/google-research/FLAN/tree/main/flan/v2) that can be passed to the model

In [31]:
# Zero-shot inference using a FLAN-style prompt template instead of a
# free-form instruction.
phrase = dataset['test'][0]['dialogue']
summary = dataset['test'][0]['summary']

# Prompt possibility 1 (question mark added so the wording matches the
# "What was going on?" prompt used by the one-shot and few-shot cells):
prompt = f"Dialogue:\n{phrase}\nWhat was going on?"
# Prompt possibility 2:
# prompt = f"Here is a dialogue:\n{phrase}\n\nWrite a short summary!"

# Tokenize to PyTorch tensors, generate at most 50 new tokens, and decode the
# first generated sequence back to text.
inputs = tokenizer(prompt, return_tensors='pt')
outputs = tokenizer.decode(
    model.generate(
        inputs['input_ids'], max_new_tokens=50
    )[0],
    skip_special_tokens=True
    )

print('Example:')
print(phrase)
print('Summary:')
print(summary)
print('Outputs:')
print(outputs)

Example:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue with the

In [32]:
# Prompt engineering with one-shot and few-shot inference.

In [None]:
# One-shot inference

In [36]:
# One-shot inference: give the model one solved example (dialogue + summary)
# before the dialogue we actually want summarized.
# NOTE(review): the in-context example is taken from the same split, right next
# to the evaluated row - confirm test[1] is not the same dialogue as test[0],
# otherwise the prompt leaks the answer.
base_phrase = dataset['test'][1]['dialogue']
base_summary = dataset['test'][1]['summary']
phrase = dataset['test'][0]['dialogue']
summary = dataset['test'][0]['summary']

# Shared layout for every dialogue in the prompt; the solved example is
# followed by its summary and three newlines, the target is left open.
shot_template = "\nDialogue:\n\n{dialogue}\n\nWhat was going on?\n"
prompt = (
    shot_template.format(dialogue=base_phrase)
    + f"{base_summary}\n\n\n"
    + shot_template.format(dialogue=phrase)
)

# Encode the prompt as PyTorch tensors, let the model produce up to 50 new
# tokens, and turn the first generated sequence back into text.
inputs = tokenizer(prompt, return_tensors='pt')
generated_ids = model.generate(inputs['input_ids'], max_new_tokens=50)[0]
outputs = tokenizer.decode(generated_ids, skip_special_tokens=True)

for label, text in (('Example:', phrase), ('Summary:', summary), ('Outputs:', outputs)):
    print(label)
    print(text)

Example:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue with the

In [40]:
# Few-shot inference

# Give the model two solved examples (dialogue + summary) before the dialogue
# we actually want summarized.
# NOTE(review): the in-context examples are taken from the same split, right
# next to the evaluated row - confirm test[1] / test[2] are not the same
# dialogue as test[0], otherwise the prompt leaks the answer.
base_phrase = dataset['test'][1]['dialogue']
base_summary = dataset['test'][1]['summary']
base_phrase_1 = dataset['test'][2]['dialogue']
base_summary_1 = dataset['test'][2]['summary']
phrase = dataset['test'][0]['dialogue']
summary = dataset['test'][0]['summary']

# Build the prompt from the solved examples in order, then append the target
# dialogue with the answer left open. The resulting text is the same as the
# previous hand-written blocks produced.
few_shot_examples = [
    (base_phrase, base_summary),
    (base_phrase_1, base_summary_1),
]
prompt = "".join(
    f"\nDialogue:\n\n{shot_dialogue}\n\nWhat was going on?\n{shot_summary}\n\n\n"
    for shot_dialogue, shot_summary in few_shot_examples
)
prompt += f"\nDialogue:\n\n{phrase}\n\nWhat was going on?\n"

# Tokenize to PyTorch tensors, generate at most 50 new tokens, and decode the
# first generated sequence back to text.
inputs = tokenizer(prompt, return_tensors='pt')
outputs = tokenizer.decode(
    model.generate(
        inputs['input_ids'], max_new_tokens=50
    )[0],
    skip_special_tokens=True
    )

print('Example:')
print(phrase)
# This is the human reference summary, so label it 'Summary:' like the
# zero-shot and one-shot cells do (it was previously labelled 'Inputs:').
print('Summary:')
print(summary)
print('Outputs:')
print(outputs)

Example:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue with the