In [59]:
import numpy as np
import nltk
import os
import torch

import evaluate
from nltk.tokenize import sent_tokenize
from torch.optim import AdamW
from tqdm.auto import tqdm
from accelerate import Accelerator
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    get_scheduler,
    pipeline
)
from datasets import load_dataset, concatenate_datasets, DatasetDict
from torch.utils.data import DataLoader

# Dataset

In [2]:
dataset_checkpoint = "amazon_reviews_multi"
dataset_commit_id = "f256e74ee2353b7c7854f86f86200f220531caa4"

In [3]:
spanish_dataset = load_dataset(dataset_checkpoint, revision=dataset_commit_id, name="es")
english_dataset = load_dataset(dataset_checkpoint, revision=dataset_commit_id, name="en")

In [4]:
english_dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

In [5]:
spanish_dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

## Explore

In [6]:
def show_samples(dataset, num_samples=3, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n>> Title: {example['review_title']}")
        print(f">> Review: '{example['review_body']}'")

In [7]:
show_samples(english_dataset)


>> Title: Worked in front position, not rear
>> Review: '3 stars because these are not rear brakes as stated in the item description. At least the mount adapter only worked on the front fork of the bike that I got it for.'

>> Title: meh
>> Review: 'Does it’s job and it’s gorgeous but mine is falling apart, I had to basically put it together again with hot glue'

>> Title: Can't beat these for the money
>> Review: 'Bought this for handling miscellaneous aircraft parts and hanger "stuff" that I needed to organize; it really fit the bill. The unit arrived quickly, was well packaged and arrived intact (always a good sign). There are five wall mounts-- three on the top and two on the bottom. I wanted to mount it on the wall, so all I had to do was to remove the top two layers of plastic drawers, as well as the bottom corner drawers, place it when I wanted and mark it; I then used some of the new plastic screw in wall anchors (the 50 pound variety) and it easily mounted to the wall. Some h

In [8]:
show_samples(spanish_dataset)


>> Title: .
>> Review: 'La montarlo se rompió una rueda debido a materiales débiles, pero al arreglarla funciona correctamente.'

>> Title: Primeras impresiones
>> Review: 'El servicio ha sido muy bueno, me ha llegado 2 días antes de lo previsto. En cuanto al producto no es que me haya dado muy buenas primeras impresiones. El borde del protector es de plástico y lo único que hay de cristal es la pantalla. Además el plástico es muy fino. A nivel estético queda muy bien y se ajusta perfectamente, la única queja que tengo es eso, que no sea todo de cristal y que para mi gusto es demasiado fino. De la resistencia no tengo ni idea ya que es el primer día que lo llevo. No creo que sea mal producto del todo si no que depende del gusto y el cuidado que tenga cada uno de su móvil. Personalmente creo que por el mismo precio hay otros productos que si que son enteros de cristal y más gordos que por lo menos a mí me generan más confianza.'

>> Title: .
>> Review: 'Funciona genial y la llevo conmi

In [9]:
english_dataset.set_format("pandas")
english_df = english_dataset["train"][:]

In [10]:
english_df["product_category"].value_counts()

product_category
home                        17679
apparel                     15951
wireless                    15717
other                       13418
beauty                      12091
drugstore                   11730
kitchen                     10382
toy                          8745
sports                       8277
automotive                   7506
lawn_and_garden              7327
home_improvement             7136
pet_products                 7082
digital_ebook_purchase       6749
pc                           6401
electronics                  6186
office_product               5521
shoes                        5197
grocery                      4730
book                         3756
baby_product                 3150
furniture                    2984
jewelry                      2747
camera                       2139
industrial_supplies          1994
digital_video_download       1364
luggage                      1328
musical_instruments          1102
video_games                   7

## Prepare

In [11]:
english_dataset.reset_format()

In [12]:
def filter_books(example):
    return (
        example["product_category"] == "book"
        or example["product_category"] == "digital_ebook_purchase"
    )

In [13]:
english_books = english_dataset.filter(filter_books)
spanish_books = spanish_dataset.filter(filter_books)

In [14]:
show_samples(english_books)


>> Title: I'm dissapointed.
>> Review: 'I guess I had higher expectations for this book from the reviews. I really thought I'd at least like it. The plot idea was great. I loved Ash but, it just didnt go anywhere. Most of the book was about their radio show and talking to callers. I wanted the author to dig deeper so we could really get to know the characters. All we know about Grace is that she is attractive looking, Latino and is kind of a brat. I'm dissapointed.'

>> Title: Good art, good price, poor design
>> Review: 'I had gotten the DC Vintage calendar the past two years, but it was on backorder forever this year and I saw they had shrunk the dimensions for no good reason. This one has good art choices but the design has the fold going through the picture, so it's less aesthetically pleasing, especially if you want to keep a picture to hang. For the price, a good calendar'

>> Title: Helpful
>> Review: 'Nearly all the tips useful and. I consider myself an intermediate to advance

In [15]:
books_dataset = DatasetDict()

for split in english_books.keys():
    books_dataset[split] = concatenate_datasets(
        [english_books[split], spanish_books[split]]
    )
    books_dataset[split] = books_dataset[split].shuffle(seed=42)

In [16]:
show_samples(books_dataset)


>> Title: Easy to follow!!!!
>> Review: 'I loved The dash diet weight loss Solution. Never hungry. I would recommend this diet. Also the menus are well rounded. Try it. Has lots of the information need thanks.'

>> Title: PARCIALMENTE DAÑADO
>> Review: 'Me llegó el día que tocaba, junto a otros libros que pedí, pero la caja llegó en mal estado lo cual dañó las esquinas de los libros porque venían sin protección (forro).'

>> Title: no lo he podido descargar
>> Review: 'igual que el anterior'


## TODO: make word count distribution plots

## Filter low-wordcount review titles

In [17]:
# Use white space heuristic for "word count"
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)

## Tokenization

In [18]:
model_checkpoint = "google/mt5-small"
model_commit_id = "38f23af8ec210eb6c376d40e9c56bd25a80f195d"

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, revision_id=model_commit_id)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [20]:
inputs = tokenizer("I think Catch 22 is the best book ever.")

In [21]:
inputs

{'input_ids': [336, 5231, 259, 139068, 1024, 339, 287, 1920, 3435, 14049, 260, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁I',
 '▁think',
 '▁',
 'Catch',
 '▁22',
 '▁is',
 '▁the',
 '▁best',
 '▁book',
 '▁ever',
 '.',
 '</s>']

In [23]:
max_input_length = 512
max_target_length = 30

In [24]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True
    )
    labels = tokenizer(
        examples["review_title"],
        max_length=max_target_length,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [25]:
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

# Evaluation metrics

In [26]:
generated_summary = "I absolutely loved reading Catch 22"
reference_summary = "I loved reading Catch 22"

In [27]:
rouge_score = evaluate.load("rouge")

In [28]:
scores = rouge_score.compute(
    predictions=[generated_summary],
    references=[reference_summary],
)

In [29]:
scores

{'rouge1': 0.9090909090909091,
 'rouge2': 0.6666666666666665,
 'rougeL': 0.9090909090909091,
 'rougeLsum': 0.9090909090909091}

## Baseline

In [30]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/carcook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])


def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["review_body"]]
    return metric.compute(predictions=summaries, references=dataset["review_title"])

In [32]:
three_sentence_summary(books_dataset["train"][1]["review_body"])

'I grew up reading Koontz, and years ago, I stopped,convinced i had "outgrown" him.\nStill,when a friend was looking for something suspenseful too read, I suggested Koontz.\nShe found Strangers.'

In [33]:
score = evaluate_baseline(books_dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict(
    (rn, round(score[rn] * 100, 2)) for rn in rouge_names
)

In [34]:
rouge_dict

{'rouge1': 16.76, 'rouge2': 8.9, 'rougeL': 15.52, 'rougeLsum': 15.97}

# Fine-tuning

In [35]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, revision=model_commit_id)

In [36]:
batch_size = 8
num_train_epochs = 8
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]
output_dir = f"../temp/07/{model_name}-finetuned-amazon-en-es"

In [37]:
args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=False
)

In [38]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100s, which can't be decoded
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Prep for ROUGE
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )
    # Extract the median scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [39]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [40]:
tokenized_datasets = tokenized_datasets.remove_columns(
    books_dataset["train"].column_names
)

In [41]:
features = [tokenized_datasets["train"][i] for i in range(2)]

In [42]:
data_collator(features)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[   653,   1957,   1314,    261,   2757,   1280,    435,    259,  29166,
            263,    269,    774,   5547,      1,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              

## Subsample to ensure manageable computation time

In [43]:
n_train_samp = int(0.4 * tokenized_datasets["train"].num_rows)
n_eval_samp = int(0.4 * tokenized_datasets["validation"].num_rows)

In [44]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(n_train_samp)),
    eval_dataset=tokenized_datasets["validation"].shuffle(seed=42).select(range(n_eval_samp)),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [45]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,3.668645,7.1205,1.691,7.0171,6.9591
2,No log,3.386395,14.6202,6.7335,13.9003,13.8646
3,6.821700,3.35786,15.3182,6.3923,14.2015,14.1578
4,6.821700,3.328821,15.8385,7.9066,15.2506,15.3713
5,3.887500,3.307375,14.7763,6.6983,14.5092,14.4131
6,3.887500,3.291033,17.3872,9.1905,16.8081,16.8042
7,3.887500,3.281902,17.0531,8.69,16.4796,16.6126
8,3.629200,3.285086,17.3198,8.9727,16.8522,16.8705


TrainOutput(global_step=3872, training_loss=4.701760552146218, metrics={'train_runtime': 1234.0482, 'train_samples_per_second': 25.075, 'train_steps_per_second': 3.138, 'total_flos': 4861981464207360.0, 'train_loss': 4.701760552146218, 'epoch': 8.0})

In [46]:
trainer.evaluate()

{'eval_loss': 3.285085678100586,
 'eval_rouge1': 17.3198,
 'eval_rouge2': 8.9727,
 'eval_rougeL': 16.8522,
 'eval_rougeLsum': 16.8705,
 'eval_runtime': 4.8239,
 'eval_samples_per_second': 19.694,
 'eval_steps_per_second': 2.488,
 'epoch': 8.0}

# Train and evaluate model with Accelerate & custom loop

## DataLoaders and model

In [47]:
tokenized_datasets.set_format("torch")

In [48]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
batch_size = 8

In [49]:
train_dataloader = DataLoader(
    tokenized_datasets["train"].shuffle(seed=42).select(range(n_train_samp)),
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"].shuffle(seed=42).select(range(n_eval_samp)),
    collate_fn=data_collator,
    batch_size=batch_size
)

## Other components

In [50]:
optimizer = AdamW(model.parameters(), lr=2e-5)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [51]:
num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

In [52]:
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [53]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expect newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

## Training loop

In [54]:
output_dir_accel = f"{output_dir}-accelerate"

In [55]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Train
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    
    # Evaluate
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = batch["labels"]

            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    result = rouge_score.compute()
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir_accel, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir_accel)

  0%|          | 0/4840 [00:00<?, ?it/s]



Epoch 0: {'rouge1': 0.7866, 'rouge2': 0.3828, 'rougeL': 0.7866, 'rougeLsum': 0.7866}
Epoch 1: {'rouge1': 1.1077, 'rouge2': 0.0, 'rougeL': 1.0424, 'rougeLsum': 1.0577}
Epoch 2: {'rouge1': 1.4812, 'rouge2': 0.4921, 'rougeL': 1.4719, 'rougeLsum': 1.4709}
Epoch 3: {'rouge1': 1.9814, 'rouge2': 0.4721, 'rougeL': 1.9208, 'rougeLsum': 1.9258}
Epoch 4: {'rouge1': 2.1683, 'rouge2': 0.1914, 'rougeL': 1.9338, 'rougeLsum': 1.9285}
Epoch 5: {'rouge1': 3.6431, 'rouge2': 0.8038, 'rougeL': 3.4985, 'rougeLsum': 3.4505}
Epoch 6: {'rouge1': 3.8504, 'rouge2': 0.6325, 'rougeL': 3.6319, 'rougeLsum': 3.5949}
Epoch 7: {'rouge1': 4.3953, 'rouge2': 0.8038, 'rougeL': 4.336, 'rougeLsum': 4.3013}
Epoch 8: {'rouge1': 4.5617, 'rouge2': 0.9467, 'rougeL': 4.5015, 'rougeLsum': 4.4963}
Epoch 9: {'rouge1': 4.4103, 'rouge2': 0.9467, 'rougeL': 4.2996, 'rougeLsum': 4.2964}


# Predictions with fine-tuned model

In [65]:
last_model = sorted(os.listdir(output_dir))[-1]

In [69]:
summarizer = pipeline("summarization", model=f"{output_dir}/{last_model}")

In [74]:
def print_summary(idx):
    review = books_dataset["test"][idx]["review_body"]
    title = books_dataset["test"][idx]["review_title"]
    summary = summarizer(books_dataset["test"][idx]["review_body"])[0]["summary_text"]
    print(f">>> Title: '{title}'")
    print(f"\n>>> Summary: '{summary}'")
    print(f"\n>>> Review: '{review}'")

In [78]:
print_summary(45)

>>> Title: 'Opinion on fangirl'

>>> Summary: 'It was a whirlwind'

>>> Review: 'This novel took my heart. If you’ve read Eleanor and Park you know your in for a whirlwind. It was everything I expected.'
