# Instruction fine-tune the model with the Amazon Customer Reviews Dataset and a set of prompts

![Pipeline](./img/generative_ai_pipeline_rlhf_plus.png)

In [32]:
import psutil

notebook_memory = psutil.virtual_memory()
print(notebook_memory)

if notebook_memory.total < 32 * 1000 * 1000 * 1000:
    print('*******************************************')    
    print('YOU ARE NOT USING THE CORRECT INSTANCE TYPE')
    print('PLEASE CHANGE INSTANCE TYPE TO  m5.2xlarge ')
    print('*******************************************')
else:
    correct_instance_type=True

svmem(total=33229979648, available=15679565824, percent=52.8, used=17133387776, free=2664263680, active=18511855616, inactive=10585133056, buffers=2768896, cached=13429559296, shared=1064960, slab=875925504)


In [33]:
%store -r pretrained_model_checkpoint

In [34]:
try:
    pretrained_model_checkpoint
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the notebooks in the PREPARE section before you continue.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

In [35]:
print(pretrained_model_checkpoint)

t5-base


In [36]:
%store -r dataset_templates_name

In [37]:
try:
    dataset_templates_name
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the notebooks in the PREPARE section before you continue.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

In [38]:
print(dataset_templates_name)

amazon_us_reviews/Wireless_v1_00


In [39]:
%store -r prompt_template_name

In [40]:
try:
    prompt_template_name
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the notebooks in the PREPARE section before you continue.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

In [41]:
print(prompt_template_name)

Generate review headline based on review body


In [63]:
import pandas as pd
import csv
file = './data-tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz'

# Read the file
df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")

df.isna().values.any()
df = df.dropna()
df = df.reset_index(drop=True)    

print("Shape of dataframe {}".format(df.shape))

# Convert Pandas dataframes into Datasets
import datasets
from datasets import Dataset

# Create Dataset objects (Arrow PyTables) from Pandas dataframes
dataset = Dataset.from_pandas(df)

# Apply prompt    
from promptsource.templates import DatasetTemplates
prompt_templates = DatasetTemplates(dataset_templates_name) 

for template in prompt_templates.templates.values():
    print(template.get_name())

prompt = prompt_templates[prompt_template_name]
print(prompt.answer_choices)    
print(prompt.__dict__)

dataset = dataset.filter(lambda row: len(row['review_headline']) > 50) \
                              .select(range(1000)) \
                              .map(lambda row : {'prompt': prompt.apply(row)[0], 'label': prompt.apply(row)[1]})
print("Shape of dataset {}".format(dataset.shape))

Shape of dataframe (145427, 15)
Generate review headline based on review body
Generate review based on rating and category
Given the review headline return a categorical rating
Generate review headline based on rating
Given the review body return a categorical rating
None
{'answer_choices': None, 'id': '5feaa0d7-e4e0-46cc-8517-e00bfa7fd00e', 'jinja': 'Give a short sentence describing the following product review:\n{{review_body}} \n|||\n{{review_headline}}', 'metadata': <promptsource.templates.Template.Metadata object at 0x7f0781fb9b90>, 'name': 'Generate review headline based on review body', 'reference': 'Generate review headline based on review body'}


  0%|          | 0/146 [00:00<?, ?ba/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

Shape of dataset (1000, 17)


In [64]:
prompt = dataset[0]['prompt']
print(prompt)

Give a short sentence describing the following product review:
As has been written by so many others, I quickly lost interest in this game. I am still playing Civ 4 and love it. It's a shame because I'm ready for an expanded version of Civ 4 and have waited for about a decade for a better version of it. Civ 5 was not an evolution but a total rewrite and it lost all that was good in Civ 4. I really hope that when Civ 6 comes out they use Civ 4 as the starting point and forget Civ 5 ever happened. Failing that there is a place in the market for a strategy game that involves building a civilisation.


In [65]:
label = dataset[0]['label']
print(label)

I am still playing Civ 4 and love it. It's a shame because I'm ready for ...


# Perform zero-shot inference BEFORE fine-tuning

To tokenize all our texts with the same vocabulary that was used when training the model, we have to download a pretrained tokenizer. This is all done by the `AutoTokenizer` class:

In [66]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Instruction fine-tune the model with the Amazon Customer Reviews Data

In [67]:
from datasets import Dataset

dataset_train = Dataset.from_parquet('./data/train/*.parquet')
print(dataset_train.shape)

Using custom data configuration default-18914fab36d418e5
Found cached dataset parquet (/root/.cache/huggingface/datasets/parquet/default-18914fab36d418e5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


(2944, 3)


In [68]:
from transformers import TrainingArguments
import torch

training_args = TrainingArguments(
    output_dir="{}-finetuned-amazon-customer-reviews".format(model_checkpoint.replace("/", "-")),
    learning_rate=1e-5,
    weight_decay=0.01, 
    max_steps=1000,
    num_train_epochs=1.0,
#    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
#    per_device_eval_batch_size=4,
    no_cuda=not torch.cuda.is_available(),
)

We pass along all of those to the `Trainer` class:

In [69]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train
)

In [None]:
train_results = trainer.train()
train_results



Step,Training Loss
500,0.202
1000,0.0905


TrainOutput(global_step=1000, training_loss=0.14627676773071288, metrics={'train_runtime': 3571.7115, 'train_samples_per_second': 1.12, 'train_steps_per_second': 0.28, 'total_flos': 684757352448000.0, 'train_loss': 0.14627676773071288, 'epoch': 1.36})

# Save fine-tuned model

In [None]:
supervised_fine_tuned_model_checkpoint = './tmp_models/{}-supervised-fine-tuned/'.format(model_checkpoint)

model.save_pretrained(supervised_fine_tuned_model_checkpoint)
tokenizer.save_pretrained(supervised_fine_tuned_model_checkpoint)

('./tmp_models/google/flan-t5-base-supervised-fine-tuned/tokenizer_config.json',
 './tmp_models/google/flan-t5-base-supervised-fine-tuned/special_tokens_map.json',
 './tmp_models/google/flan-t5-base-supervised-fine-tuned/tokenizer.json')

In [None]:
%store supervised_fine_tuned_model_checkpoint

Stored 'supervised_fine_tuned_model_checkpoint' (str)


In [35]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(supervised_fine_tuned_model_checkpoint, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(supervised_fine_tuned_model_checkpoint)

# Qualitative 

In [36]:
prompts_and_labels = dataset \
    .filter(lambda row: len(row['review_headline']) > 50) \
    .select(range(100)) \
    .map(lambda row : {'prompt': prompt.apply(row)[0], 'label': prompt.apply(row)[1]})

  0%|          | 0/100 [00:00<?, ?ex/s]

In [37]:
prompts = prompts_and_labels['prompt']
human_baseline_summaries = prompts_and_labels['review_headline']

In [38]:
# for prompt_label in filtered_prompts_and_labels:
#     prompt = prompt_label['prompt']
#     label = prompt_label['label']
#     inputs = tokenizer(prompt, return_tensors='pt')

#     response = tokenizer.decode(model.generate(input_ids=inputs["input_ids"], 
#                        max_new_tokens=200,
#                        do_sample=True, 
#                        top_k=50, 
#                        top_p=0.9
#                       )[0],
#                      skip_special_tokens=True)

#     print('PROMPT: {}'.format(prompt))
#     print('RESPONSE: {}'.format(response))
#     print('EXPECTED RESPONSE: {}'.format(label))
#     print('----')

# Quantitative Results with ROGUE Metric

The [ROUGE metric](https://en.wikipedia.org/wiki/ROUGE_(metric)) helps quantify the validity of summarizations produced by models. It compares summarizations to a "baseline" summary which is usually created by a human. While not perfect, it does give an indication to the overall increase in summarization effectiveness that we have accomplished by fine-tuning.

In [45]:
import evaluate

rouge = evaluate.load('rouge')

In [46]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

supervised_fine_tuned_model_tokenizer = AutoTokenizer.from_pretrained(supervised_fine_tuned_model_checkpoint, use_fast=True)
supervised_fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(supervised_fine_tuned_model_checkpoint)

In [47]:
from transformers import GenerationConfig

supervised_fine_tuned_model_summaries = []

for idx, prompt in enumerate(prompts):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    
    supervised_fine_tuned_model_outputs = supervised_fine_tuned_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    supervised_fine_tuned_model_text_output = supervised_fine_tuned_model_tokenizer.decode(supervised_fine_tuned_model_outputs[0], skip_special_tokens=True)
    supervised_fine_tuned_model_summaries.append(supervised_fine_tuned_model_text_output)    

In [49]:
supervised_fine_tuned_model_results = rouge.compute(
    predictions=supervised_fine_tuned_model_summaries,
    references=human_baseline_summaries[0:len(supervised_fine_tuned_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
supervised_fine_tuned_model_results

{'rouge1': 0.09743180818304786,
 'rouge2': 0.06786948217970731,
 'rougeL': 0.09604160480425282,
 'rougeLsum': 0.09828317265802661}

In [26]:
# import transformers
# from transformers import AutoModelForSeq2SeqLM
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(supervised_fine_tuned_model_checkpoint, use_fast=True)
# reloaded_model = AutoModelForSeq2SeqLM.from_pretrained(supervised_fine_tuned_model_checkpoint)

In [27]:
# inputs = tokenizer(zero_shot_prompt, return_tensors='pt')

# response = tokenizer.decode(reloaded_model.generate(input_ids=inputs["input_ids"], 
#                    max_new_tokens=200,
#                    do_sample=True, 
#                    top_k=50, 
#                    top_p=0.9
#                   )[0],
#                  skip_special_tokens=True)

# print('PROMPT: {}'.format(prompt))
# print('RESPONSE: {}'.format(response))
# print('EXPECTED RESPONSE: {}'.format(['label']))
# print('----')