# Amazon Reviews Classification Modelling using Transformers
```
@inproceedings{marc_reviews,
    title={The Multilingual Amazon Reviews Corpus},
    author={Keung, Phillip and Lu, Yichao and Szarvas, György and Smith, Noah A.},
    booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
    year={2020}
}
```

In [52]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModel
from transformers_interpret import SequenceClassificationExplainer
import evaluate
import numpy as np

In [2]:
# `sys` has to be imported in this cell: nothing earlier in the notebook brings
# it into scope, so on a fresh kernel the call below would raise NameError.
import sys

# Put the directory two levels up on the import path so that the `src.*`
# imports below can resolve (presumably the project root - confirm).
sys.path.append("../../")

from src.data.dataset import load_amazon_dataset
from src.data.dataset import split_dataset

from src.data.torch_datasets import AmazonTokensDataset

In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Name of the pretrained checkpoint handed to `from_pretrained` for the
# tokenizer and for the models built from scratch in this notebook.
MODEL_NAME = "distilbert-base-multilingual-cased"

Loading data

In [5]:
# Load the English-only reviews as a pandas DataFrame (n_sample=5000)
dataset_df = load_amazon_dataset(
    return_pandas=True,
    languages=["en"],
    use_stars=False,
    n_sample=5000,
)

Found cached dataset amazon_reviews_multi (/home/dqmis/.cache/huggingface/datasets/amazon_reviews_multi/default-18df3f9c3df27db5/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


In [11]:
# Partition the reviews into train / validation / test frames
train_df, val_df, test_df = split_dataset(dataset_df)

# Review texts (x_*) and their labels (y_*) of each split as plain lists
x_train = list(train_df["review_body"].values)
y_train = list(train_df["label"].values)
x_val = list(val_df["review_body"].values)
y_val = list(val_df["label"].values)
x_test = list(test_df["review_body"].values)
y_test = list(test_df["label"].values)

In [13]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [14]:
# Tokenize each split in one batch call; over-long reviews are truncated and
# the sequences are padded (padding=True)
train_tokens, val_tokens, test_tokens = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (x_train, x_val, x_test)
)

In [15]:
# Pair every split's token encodings with its labels in an AmazonTokensDataset
train_dataset, val_dataset, test_dataset = map(
    AmazonTokensDataset,
    (train_tokens, val_tokens, test_tokens),
    (y_train, y_val, y_test),
)

In [18]:
# Metrics from the `evaluate` library that `compute_metrics` reports
accuracy_metric, f1_score, precision_metric, recall_metric = [
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
]

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one row
            of per-class scores per example; ``labels`` holds the reference
            class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each taken from the matching metric's ``compute`` result.
    """
    class_scores, references = eval_pred
    # The highest-scoring column of each row is the predicted class id.
    predicted_ids = np.argmax(class_scores, axis=1)

    scorers = (
        ("accuracy", accuracy_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_score),
    )
    return {
        name: scorer.compute(predictions=predicted_ids, references=references)[name]
        for name, scorer in scorers
    }

In [19]:
# Training configuration shared by the Trainer instances in this notebook
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./output",
    logging_dir="./logs",
    # optimisation schedule
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # evaluate every 100 steps, log every 10
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=32,
    logging_steps=10,
)

In [20]:
# Class-id <-> label-name mappings passed to the model
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label: class_id for class_id, label in id2label.items()}

# Pretrained checkpoint loaded for two-class sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'classif

In [21]:
# Wire the model, the training configuration and the data splits into a
# Trainer; `compute_metrics` scores the validation split during evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [22]:
# Fine-tune the model with the arguments and datasets given to the Trainer

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvinted_dqmis[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


TrainOutput(global_step=900, training_loss=0.4198397819201152, metrics={'train_runtime': 280.6294, 'train_samples_per_second': 25.657, 'train_steps_per_second': 3.207, 'total_flos': 953765270323200.0, 'train_loss': 0.4198397819201152, 'epoch': 2.0})

In [23]:
# Score the fine-tuned model on the held-out test split
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.32985612750053406,
 'eval_accuracy': 0.883,
 'eval_precision': 0.9098712446351931,
 'eval_recall': 0.8496993987975952,
 'eval_f1': 0.8787564766839377,
 'eval_runtime': 8.0077,
 'eval_samples_per_second': 124.88,
 'eval_steps_per_second': 3.996,
 'epoch': 2.0}

In [24]:
# Persist the fine-tuned model.
trainer.save_model("./trained_model")

# The Trainer above was created without a tokenizer, so `save_model` writes
# only the model files. Save the tokenizer into the same directory so that
# "./trained_model" can be reloaded on its own.
# The trailing `;` suppresses the tuple of written file paths in the cell output.
tokenizer.save_pretrained("./trained_model");

In [26]:
# To upload the model to the Hugging Face Hub, uncomment the line below:

# trainer.push_to_hub()

### Feature importance

In [39]:
# Attribute the model's prediction for the review at position 1 of the loaded
# dataset to its tokens, and write the visualisation to an HTML file
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
word_attributions = cls_explainer(dataset_df["review_body"].values[1])
cls_explainer.visualize("distilbert_viz.html");

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,NEGATIVE (1.00),NEGATIVE,2.71,"[CLS] The so ##cks are cut ##e but not usa ##ble for barre class , the fa ##bric is too think and the small was way to loose fit ##ting on my size 7 . 5 foot , I kept sl ##ip ##ping during plan ##ks and the so ##cks were falling off of my foot . I so wanted to find a more af ##ford ##able alternative to the Pure Barre brand ##ed so ##cks . But these just didn [UNK] t do it . [SEP]"
,,,,


In [40]:
# Same attribution run for the review at position 110 of the loaded dataset
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
word_attributions = cls_explainer(dataset_df["review_body"].values[110])
cls_explainer.visualize("distilbert_viz.html");

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,POSITIVE (1.00),POSITIVE,2.84,"[CLS] really sat ##isfied with it ! good price for it , good quality also [SEP]"
,,,,


In [28]:
# Attribution run on a hand-written sentence that mixes positive ("amazing")
# and negative ("awful") wording
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
word_attributions = cls_explainer("This laptop is amazing an awful product :(!")
cls_explainer.visualize("distilbert_viz.html");

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,POSITIVE (0.83),POSITIVE,1.71,[CLS] This lap ##top is ama ##zing an aw ##ful product : ( ! [SEP]
,,,,


### Inference

In [29]:
# Example text that the inference pipeline in the next cell classifies
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

In [30]:
from transformers import pipeline

# Reload the fine-tuned weights from disk and wrap them, together with the
# in-memory tokenizer, in a sentiment-analysis pipeline
reloaded_model = AutoModelForSequenceClassification.from_pretrained("./trained_model/")
classifier = pipeline("sentiment-analysis", model=reloaded_model, tokenizer=tokenizer)
classifier(text)

[{'label': 'POSITIVE', 'score': 0.9568187594413757}]

## Test on multiple languages

In [32]:
# Reload the corpus, this time with English, German and Spanish reviews
dataset_df = load_amazon_dataset(
    return_pandas=True,
    languages=["en", "de", "es"],
    use_stars=False,
    n_sample=5000,
)

# Partition into train / validation / test frames
train_df, val_df, test_df = split_dataset(dataset_df)

# Review texts (x_*) and their labels (y_*) of each split as plain lists
x_train = list(train_df["review_body"].values)
y_train = list(train_df["label"].values)
x_val = list(val_df["review_body"].values)
y_val = list(val_df["label"].values)
x_test = list(test_df["review_body"].values)
y_test = list(test_df["label"].values)

# Tokenize each split with truncation and padding
train_tokens, val_tokens, test_tokens = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (x_train, x_val, x_test)
)

# Pair every split's token encodings with its labels
train_dataset, val_dataset, test_dataset = map(
    AmazonTokensDataset,
    (train_tokens, val_tokens, test_tokens),
    (y_train, y_val, y_test),
)

Found cached dataset amazon_reviews_multi (/home/dqmis/.cache/huggingface/datasets/amazon_reviews_multi/default-900fce4a1c2f2d48/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


In [33]:
# Start again from the pretrained checkpoint so the multilingual run does not
# continue from the English-only fine-tuned weights
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Fresh Trainer on the multilingual splits. `training_args` is the object built
# for the first run, so both runs share the same output and logging directories.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'classif

In [34]:
# Fine-tune on the multilingual training split
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=900, training_loss=0.4594777258237203, metrics={'train_runtime': 236.017, 'train_samples_per_second': 30.506, 'train_steps_per_second': 3.813, 'total_flos': 875526712992000.0, 'train_loss': 0.4594777258237203, 'epoch': 2.0})

In [36]:
# Score the multilingual model on the held-out multilingual test split
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.46012255549430847,
 'eval_accuracy': 0.838,
 'eval_precision': 0.9384615384615385,
 'eval_recall': 0.7261904761904762,
 'eval_f1': 0.8187919463087249,
 'eval_runtime': 7.4265,
 'eval_samples_per_second': 134.653,
 'eval_steps_per_second': 4.309,
 'epoch': 2.0}

In [37]:
# Attribution run on an English sentence with the model fine-tuned on the
# multilingual data
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
word_attributions = cls_explainer("This was a really awesome laptop. Can recommend to anyone")
cls_explainer.visualize("distilbert_viz.html");

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,POSITIVE (0.96),POSITIVE,1.62,[CLS] This was a really aw ##eso ##me lap ##top . Can re ##com ##mend to anyone [SEP]
,,,,


In [39]:
# Same attribution run on the German counterpart of the English sentence
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
word_attributions = cls_explainer("Das war ein wirklich toller Laptop. Kann es jedem empfehlen")
cls_explainer.visualize("distilbert_viz.html");

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,POSITIVE (0.98),POSITIVE,2.02,[CLS] Das war ein wir ##klich toll ##er La ##pt ##op . Kan ##n es jedem em ##pf ##eh ##len [SEP]
,,,,


## Extracting embeddings

In [53]:
import torch

# Tokenize a single example sentence and return PyTorch tensors
input_to_model = tokenizer(["Hello, this is something"], return_tensors="pt")

# Load the checkpoint through AutoModel instead of
# AutoModelForSequenceClassification. This rebinds `model`, so the fine-tuned
# classifier is no longer reachable under that name, and the weights come from
# the pretrained MODEL_NAME checkpoint rather than from "./trained_model".
# NOTE(review): num_labels / id2label / label2id describe a classification
# head; presumably leftovers from the cells above - confirm they can be dropped.
model = AutoModel.from_pretrained(MODEL_NAME, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [72]:
# Embedding extraction is inference only, so run the forward pass under
# `torch.no_grad()` to avoid building an autograd graph for the result.
with torch.no_grad():
    # Element 0 of the model output is the per-token hidden-state tensor.
    token_embeddings = model(**input_to_model)[0]

# Bare last expression so the notebook still displays the tensor.
token_embeddings

tensor([[[ 0.1642, -0.0755,  0.1278,  ...,  0.4243, -0.0163, -0.0839],
         [-0.0090, -0.0340,  0.3609,  ...,  0.4106, -0.3255, -0.2726],
         [ 0.0798, -0.1538,  0.1515,  ...,  0.5291, -0.0641,  0.1844],
         ...,
         [ 0.2311, -0.2098,  0.0543,  ...,  0.5497, -0.1932,  0.1050],
         [ 0.2450, -0.2828, -0.1367,  ...,  0.4614, -0.0438, -0.1428],
         [ 0.2126, -0.0780,  0.3494,  ...,  0.2476, -0.0485,  0.0375]]],
       grad_fn=<NativeLayerNormBackward0>)