In [None]:
# Upgrade pip itself before installing the notebook's dependencies.
%pip install --upgrade pip
# PyTorch and torchdata are pinned to exact versions; --quiet reduces installer output.
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# NOTE(review): transformers is pinned but `datasets` is not, so -U installs
# whatever release is newest at install time — consider pinning it as well.
%pip install -U \
    transformers==4.27.2 \
    datasets --quiet

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
from transformers import set_seed

In [None]:
# Hub identifier of the dialogue-summarization dataset used throughout the notebook.
huggingface_dataset_name = "knkarthick/dialogsum"

# Later cells index the result as dataset['test'][i]['dialogue' | 'summary'].
dataset = load_dataset(huggingface_dataset_name)

In [None]:
# Test-split rows used as running examples in the cells below.
example_indices = [10, 102]

# Horizontal rule printed between sections of output (99 dashes).
dash_line = '-' * 99

# Show each selected dialogue next to its human-written reference summary.
for example_number, test_index in enumerate(example_indices, start=1):
    record = dataset['test'][test_index]

    print(dash_line)
    print('Exemplo ', example_number)
    print(dash_line)
    print('DIÁLOGO DE ENTRADA:')
    print(record['dialogue'])
    print(dash_line)
    print('RESUMO HUMANO:')
    print(record['summary'])
    print(dash_line)
    print()

In [None]:
# Checkpoint shared by the model and (in the next cell) the tokenizer.
model_name = 'google/flan-t5-base'

# Load the pretrained sequence-to-sequence model for that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
)

In [None]:
# Tokenizer matching the checkpoint above; use_fast=True requests the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

2

In [None]:
# Round-trip a sample sentence through the tokenizer: text -> token ids -> text.
sentence = "A Ana Cunha desenha melhor do que o Palla!"

sentence_encoded = tokenizer(sentence, return_tensors='pt')
# The tokenizer returns a batch; take the ids of its first (only) sequence.
sentence_token_ids = sentence_encoded["input_ids"][0]
sentence_decoded = tokenizer.decode(sentence_token_ids, skip_special_tokens=True)

print('SENTENÇA CODIFICADA:')
print(sentence_token_ids)
print('\nSENTENÇA DECODIFICADA:')
print(sentence_decoded)

# Exemplo 1 - Sem nenhuma engenharia de prompt.

In [None]:
# Baseline: the raw dialogue is the whole model input, with no instruction added.
for example_number, test_index in enumerate(example_indices, start=1):
    record = dataset['test'][test_index]
    dialogue = record['dialogue']
    summary = record['summary']

    encoded = tokenizer(dialogue, return_tensors='pt')
    generated_ids = model.generate(encoded["input_ids"], max_new_tokens=50)
    # generate() returns a batch; decode its first (only) sequence.
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print('Exemplo ', example_number)
    print(dash_line)
    print(f'PROMPT DE ENTRADA:\n{dialogue}')
    print(dash_line)
    print(f'RESUMO HUMANO:\n{summary}')
    print(dash_line)
    print(f'GERADO PELO MODELO - SEM ENGENHARIA DE PROMPT:\n{output}\n')

# Exemplo 2 - Engenharia de prompt zero shot com uma instrução.

In [None]:
# Zero-shot: wrap each dialogue in an explicit "Summarize ..." instruction.
for example_number, test_index in enumerate(example_indices, start=1):
    record = dataset['test'][test_index]
    dialogue = record['dialogue']
    summary = record['summary']

    # The template ends with a newline plus four spaces after "Summary:";
    # that tail is kept exactly so the model input does not change.
    prompt = (
        "\nSummarize the following conversation.\n"
        "\n"
        f"{dialogue}\n"
        "\n"
        "Summary:\n"
        "    "
    )

    encoded = tokenizer(prompt, return_tensors='pt')
    generated_ids = model.generate(encoded["input_ids"], max_new_tokens=50)
    # generate() returns a batch; decode its first (only) sequence.
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print('Exemplo ', example_number)
    print(dash_line)
    print(f'PROMPT DE ENTRADA:\n{prompt}')
    print(dash_line)
    print(f'RESUMO HUMANO:\n{summary}')
    print(dash_line)
    print(f'GERADO PELO MODELO - ZERO SHOT:\n{output}\n')

# Exemplo 3 - Inferência Zero Shot com modelo de prompt do FLAN-T5 (template de prompt)
https://github.com/google-research/FLAN/blob/main/flan/v2/templates.py

In [None]:
# Zero-shot again, this time phrased as a "Dialogue: ... What was going on?" template.
for example_number, test_index in enumerate(example_indices, start=1):
    record = dataset['test'][test_index]
    dialogue = record['dialogue']
    summary = record['summary']

    prompt = (
        "\nDialogue:\n"
        "\n"
        f"{dialogue}\n"
        "\n"
        "What was going on?\n"
    )

    encoded = tokenizer(prompt, return_tensors='pt')
    generated_ids = model.generate(encoded["input_ids"], max_new_tokens=50)
    # generate() returns a batch; decode its first (only) sequence.
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print('Exemplo ', example_number)
    print(dash_line)
    print(f'PROMPT DE ENTRADA:\n{prompt}')
    print(dash_line)
    print(f'RESUMO HUMANO:\n{summary}\n')
    print(dash_line)
    print(f'GERADO PELO MODELO - ZERO SHOT:\n{output}\n')

# Exemplo com one shot

## Definindo o prompt

In [None]:
def make_prompt(example_indices_full, example_index_to_summarize):
    """Assemble a one-shot / few-shot prompt from rows of ``dataset['test']``.

    Reads the module-level ``dataset``. Every index in ``example_indices_full``
    contributes a solved example (dialogue, the question "What was going on?",
    and that row's summary). The row at ``example_index_to_summarize`` is
    appended last with the same question but no summary, leaving the answer
    for the model to produce.

    Args:
        example_indices_full: Iterable of test-split indices used as solved examples.
        example_index_to_summarize: Test-split index of the dialogue to be summarized.

    Returns:
        The full prompt as a single string.
    """
    test_split = dataset['test']

    # One block per solved example; each ends with three newlines as a separator.
    solved_examples = [
        f"\nDialogue:\n\n{test_split[shot_index]['dialogue']}\n\n"
        f"What was going on?\n{test_split[shot_index]['summary']}\n\n\n"
        for shot_index in example_indices_full
    ]

    # Final block: same layout, but it stops right after the question.
    open_question = (
        f"\nDialogue:\n\n{test_split[example_index_to_summarize]['dialogue']}\n\n"
        "What was going on?\n"
    )

    return ''.join(solved_examples) + open_question

## Construindo o prompt

In [None]:
# One-shot: a single solved example (row 10) precedes the dialogue to summarize (row 102).
example_indices_full = [10]
example_index_to_summarize = 102

one_shot_prompt = make_prompt(
    example_indices_full=example_indices_full,
    example_index_to_summarize=example_index_to_summarize,
)

print(one_shot_prompt)

## Utilizando o prompt

In [None]:
# Human reference summary for the dialogue being summarized.
summary = dataset['test'][example_index_to_summarize]['summary']

# Run the one-shot prompt through the model and decode the first returned sequence.
encoded = tokenizer(one_shot_prompt, return_tensors='pt')
generated_ids = model.generate(encoded["input_ids"], max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(dash_line)
print(f'RESUMO HUMANO:\n{summary}\n')
print(dash_line)
print(f'GERADO PELO MODELO - ONE SHOT:\n{output}')

# Exemplo few shots

## Construindo o prompt

In [None]:
# Few-shot: three solved examples precede the dialogue to summarize (row 102).
example_indices_full = [10, 80, 120]
example_index_to_summarize = 102

few_shot_prompt = make_prompt(
    example_indices_full=example_indices_full,
    example_index_to_summarize=example_index_to_summarize,
)

print(few_shot_prompt)

In [None]:
# Human reference summary for the dialogue being summarized.
summary = dataset['test'][example_index_to_summarize]['summary']

# Run the few-shot prompt through the model and decode the first returned sequence.
encoded = tokenizer(few_shot_prompt, return_tensors='pt')
generated_ids = model.generate(encoded["input_ids"], max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(dash_line)
print(f'RESUMO HUMANO:\n{summary}\n')
print(dash_line)
print(f'GERADO PELO MODELO - FEW SHOT:\n{output}')

# Ajustando parâmetros

In [None]:
# Decoding settings to experiment with. Exactly one configuration is active;
# swap it for one of the commented alternatives to compare the outputs.
# generation_config = GenerationConfig(max_new_tokens=50)
# generation_config = GenerationConfig(max_new_tokens=10)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.1)
generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.5)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=1.0)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=4.0)

# With do_sample=True the tokens are drawn at random. Seed the RNGs right
# before generating so re-running this cell reproduces the same text.
SEED = 42
set_seed(SEED)

# Look the reference summary up here rather than relying on whatever value an
# earlier cell happened to leave in `summary`.
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f'GERADO PELO MODELO - FEW SHOT:\n{output}')
print(dash_line)
print(f'RESUMO HUMANO:\n{summary}\n')