In [23]:
from transformers import AutoTokenizer
from datasets import load_dataset, concatenate_datasets, DatasetDict

# Dataset

In [2]:
# Hub identifier of the reviews dataset, and the repository commit that is
# passed as `revision` when loading so reruns read the same snapshot.
dataset_checkpoint = "amazon_reviews_multi"
dataset_commit_id = "f256e74ee2353b7c7854f86f86200f220531caa4"

In [4]:
# Load the Spanish and English configurations (in that order) from the same
# pinned commit of the dataset repository.
spanish_dataset, english_dataset = (
    load_dataset(dataset_checkpoint, name=language_code, revision=dataset_commit_id)
    for language_code in ("es", "en")
)

Downloading builder script:   0%|          | 0.00/7.11k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/72.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.81M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/200000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/78.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.97M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/200000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [5]:
# Inspect the English DatasetDict: split names, columns and row counts.
english_dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

In [6]:
# Inspect the Spanish DatasetDict: split names, columns and row counts.
spanish_dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

## Explore

In [9]:
def show_samples(dataset, num_samples=3, seed=42, split="train"):
    """Print the title and body of a few randomly chosen reviews.

    Args:
        dataset: Mapping from split name to a dataset that supports
            ``shuffle(seed=...)``, ``select(indices)`` and ``len()``
            (for example a ``DatasetDict``).
        num_samples: Maximum number of reviews to print.
        seed: Shuffle seed, so repeated calls print the same reviews.
        split: Name of the split to sample from. Defaults to ``"train"``,
            which matches the previous hard-coded behaviour.
    """
    shuffled = dataset[split].shuffle(seed=seed)
    # Clamp the request so a split with fewer than `num_samples` rows prints
    # what it has instead of failing on an out-of-range index.
    count = min(num_samples, len(shuffled))
    for example in shuffled.select(range(count)):
        print(f"\n>> Title: {example['review_title']}")
        print(f">> Review: '{example['review_body']}'")

In [10]:
# Print a few shuffled English training reviews (title and body).
show_samples(english_dataset)


>> Title: Worked in front position, not rear
>> Review: '3 stars because these are not rear brakes as stated in the item description. At least the mount adapter only worked on the front fork of the bike that I got it for.'

>> Title: meh
>> Review: 'Does it’s job and it’s gorgeous but mine is falling apart, I had to basically put it together again with hot glue'

>> Title: Can't beat these for the money
>> Review: 'Bought this for handling miscellaneous aircraft parts and hanger "stuff" that I needed to organize; it really fit the bill. The unit arrived quickly, was well packaged and arrived intact (always a good sign). There are five wall mounts-- three on the top and two on the bottom. I wanted to mount it on the wall, so all I had to do was to remove the top two layers of plastic drawers, as well as the bottom corner drawers, place it when I wanted and mark it; I then used some of the new plastic screw in wall anchors (the 50 pound variety) and it easily mounted to the wall. Some h

In [11]:
# Print a few shuffled Spanish training reviews (title and body).
show_samples(spanish_dataset)


>> Title: .
>> Review: 'La montarlo se rompió una rueda debido a materiales débiles, pero al arreglarla funciona correctamente.'

>> Title: Primeras impresiones
>> Review: 'El servicio ha sido muy bueno, me ha llegado 2 días antes de lo previsto. En cuanto al producto no es que me haya dado muy buenas primeras impresiones. El borde del protector es de plástico y lo único que hay de cristal es la pantalla. Además el plástico es muy fino. A nivel estético queda muy bien y se ajusta perfectamente, la única queja que tengo es eso, que no sea todo de cristal y que para mi gusto es demasiado fino. De la resistencia no tengo ni idea ya que es el primer día que lo llevo. No creo que sea mal producto del todo si no que depende del gusto y el cuidado que tenga cada uno de su móvil. Personalmente creo que por el mismo precio hay otros productos que si que son enteros de cristal y más gordos que por lo menos a mí me generan más confianza.'

>> Title: .
>> Review: 'Funciona genial y la llevo conmi

In [12]:
# Materialise the English train split as a pandas DataFrame. `to_pandas()`
# leaves the output format of `english_dataset` untouched, so this cell has no
# side effects on later cells and can be re-run safely.
english_df = english_dataset["train"].to_pandas()

In [13]:
# Count English training reviews per product category.
english_df["product_category"].value_counts()

product_category
home                        17679
apparel                     15951
wireless                    15717
other                       13418
beauty                      12091
drugstore                   11730
kitchen                     10382
toy                          8745
sports                       8277
automotive                   7506
lawn_and_garden              7327
home_improvement             7136
pet_products                 7082
digital_ebook_purchase       6749
pc                           6401
electronics                  6186
office_product               5521
shoes                        5197
grocery                      4730
book                         3756
baby_product                 3150
furniture                    2984
jewelry                      2747
camera                       2139
industrial_supplies          1994
digital_video_download       1364
luggage                      1328
musical_instruments          1102
video_games                   7

## Prepare

In [14]:
# Make sure the dataset returns plain Python rows (clears any output format
# such as "pandas") before the filtering steps that follow.
english_dataset.reset_format()

In [15]:
def filter_books(example):
    """Return True when the review's product category is a book or an e-book purchase."""
    book_categories = ("book", "digital_ebook_purchase")
    return example["product_category"] in book_categories

In [16]:
# Keep only book / e-book reviews in every split, English first, then Spanish.
english_books, spanish_books = (
    reviews.filter(filter_books) for reviews in (english_dataset, spanish_dataset)
)

Filter:   0%|          | 0/200000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
# Print a few samples from the filtered English book reviews.
show_samples(english_books)


>> Title: I'm dissapointed.
>> Review: 'I guess I had higher expectations for this book from the reviews. I really thought I'd at least like it. The plot idea was great. I loved Ash but, it just didnt go anywhere. Most of the book was about their radio show and talking to callers. I wanted the author to dig deeper so we could really get to know the characters. All we know about Grace is that she is attractive looking, Latino and is kind of a brat. I'm dissapointed.'

>> Title: Good art, good price, poor design
>> Review: 'I had gotten the DC Vintage calendar the past two years, but it was on backorder forever this year and I saw they had shrunk the dimensions for no good reason. This one has good art choices but the design has the fold going through the picture, so it's less aesthetically pleasing, especially if you want to keep a picture to hang. For the price, a good calendar'

>> Title: Helpful
>> Review: 'Nearly all the tips useful and. I consider myself an intermediate to advance

In [19]:
# Build one bilingual DatasetDict: for each split, append the Spanish book
# reviews to the English ones and shuffle with a fixed seed so the two
# languages are interleaved reproducibly.
books_dataset = DatasetDict()

for split, english_split in english_books.items():
    bilingual_split = concatenate_datasets([english_split, spanish_books[split]])
    books_dataset[split] = bilingual_split.shuffle(seed=42)

In [20]:
# Print a few samples from the combined, shuffled English + Spanish book reviews.
show_samples(books_dataset)


>> Title: Easy to follow!!!!
>> Review: 'I loved The dash diet weight loss Solution. Never hungry. I would recommend this diet. Also the menus are well rounded. Try it. Has lots of the information need thanks.'

>> Title: PARCIALMENTE DAÑADO
>> Review: 'Me llegó el día que tocaba, junto a otros libros que pedí, pero la caja llegó en mal estado lo cual dañó las esquinas de los libros porque venían sin protección (forro).'

>> Title: no lo he podido descargar
>> Review: 'igual que el anterior'


## TODO: make word count distribution plots

## Filter low-wordcount review titles

In [22]:
def _has_multiword_title(example):
    """Return True when the title has more than two whitespace-separated words."""
    return len(example["review_title"].split()) > 2


# "Word count" here is only a whitespace-split heuristic; reviews whose title
# has one or two words are dropped from every split.
books_dataset = books_dataset.filter(_has_multiword_title)

Filter:   0%|          | 0/9672 [00:00<?, ? examples/s]

Filter:   0%|          | 0/238 [00:00<?, ? examples/s]

Filter:   0%|          | 0/245 [00:00<?, ? examples/s]

## Tokenization

In [24]:
# Hub identifier of the pretrained model whose tokenizer is loaded in the next
# cell, and the repository commit intended to pin it.
model_checkpoint = "google/mt5-small"
model_commit_id = "38f23af8ec210eb6c376d40e9c56bd25a80f195d"

In [25]:
# Load the tokenizer from the pinned Hub commit. The pinning keyword of
# `from_pretrained` is `revision` (not `revision_id`); with the wrong keyword
# the pin is not applied and files are resolved from the default `main` branch.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, revision=model_commit_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [26]:
# Tokenize one example sentence to inspect what the tokenizer produces.
inputs = tokenizer("I think Catch 22 is the best book ever.")

In [27]:
# Show the encoding: token ids plus the matching attention mask.
inputs

{'input_ids': [336, 5231, 259, 139068, 1024, 339, 287, 1920, 3435, 14049, 260, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [30]:
# Map the ids back to subword tokens to see how the sentence was split
# (note the '▁' word-boundary marker and the final '</s>' token in the output).
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁I',
 '▁think',
 '▁',
 'Catch',
 '▁22',
 '▁is',
 '▁the',
 '▁best',
 '▁book',
 '▁ever',
 '.',
 '</s>']

In [31]:
# Truncation limits (in tokens) used by preprocess_function: review bodies are
# the model inputs, review titles are the targets.
max_input_length = 512
max_target_length = 30

In [32]:
def preprocess_function(examples):
    """Tokenize a batch of reviews into model inputs with title labels.

    Review bodies are encoded (truncated to ``max_input_length``) and form the
    returned encoding; review titles are encoded separately (truncated to
    ``max_target_length``) and their token ids are attached under ``"labels"``.
    """
    body_encoding = tokenizer(
        examples["review_body"], truncation=True, max_length=max_input_length
    )
    title_encoding = tokenizer(
        examples["review_title"], truncation=True, max_length=max_target_length
    )
    # Only the title ids are kept; the title attention mask is not needed.
    body_encoding["labels"] = title_encoding["input_ids"]
    return body_encoding

In [33]:
# Tokenize every split; batched=True hands preprocess_function a batch of
# examples (one list per column) per call instead of a single row.
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/9672 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Map:   0%|          | 0/245 [00:00<?, ? examples/s]