# Amazon Reviews Classification Modelling using Transformers
```
@inproceedings{marc_reviews,
    title={The Multilingual Amazon Reviews Corpus},
    author={Keung, Phillip and Lu, Yichao and Szarvas, György and Smith, Noah A.},
    booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
    year={2020}
}
```

In [2]:
import sys

import evaluate
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline
from transformers_interpret import SequenceClassificationExplainer

In [3]:
# Make the project's `src` package importable. "../../" is resolved against the
# notebook's working directory (presumably two levels below the repository root — confirm).
sys.path.append("../../")

# Project helpers: `load_dataset` fetches the review corpus, `split_dataset`
# partitions it into train / validation / test frames.
from src.review.data.dataset import load_dataset
from src.review.data.dataset import split_dataset

# Project dataset wrapper built from tokenizer output plus labels.
from src.review.data.torch_datasets import AmazonTokensDataset

In [4]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the project code under `src`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Hugging Face checkpoint name; used to load both the tokenizer and the classifier below.
MODEL_NAME = "distilbert-base-multilingual-cased"

### Loading data

In [6]:
# Load the review sample used throughout the notebook.
# NOTE(review): judging by the argument names this requests 10,000 English reviews as a
# pandas DataFrame with non-star labels — confirm against `load_dataset`.
dataset_df = load_dataset(
    return_pandas=True,
    languages=["en"],
    use_stars=False,
    n_sample=10000,
)

Found cached dataset amazon_reviews_multi (/home/dqmis/.cache/huggingface/datasets/amazon_reviews_multi/default-18df3f9c3df27db5/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


In [7]:
# Partition the sample into train / validation / test frames.
train_df, val_df, test_df = split_dataset(dataset_df)

# Pull the review texts (model inputs) and the labels (targets) out of every split
# as plain Python lists, in train / val / test order.
x_train, x_val, x_test = (
    list(split_df["review_body"].values) for split_df in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    list(split_df["label"].values) for split_df in (train_df, val_df, test_df)
)

In [8]:
# Load the tokenizer that matches the MODEL_NAME checkpoint, so the text is split
# into the vocabulary the classifier loaded from the same checkpoint expects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [9]:
# Tokenize every split with the same settings: over-long reviews are truncated and
# each split is padded as a single batch (one tokenizer call per split).
train_tokens, val_tokens, test_tokens = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (x_train, x_val, x_test)
)

In [10]:
# Wrap each split's tokenizer output and labels in the project's dataset class
# so they can be handed to the Trainer.
train_dataset, val_dataset, test_dataset = (
    AmazonTokensDataset(tokens, labels)
    for tokens, labels in (
        (train_tokens, y_train),
        (val_tokens, y_val),
        (test_tokens, y_test),
    )
)

In [11]:
# Load the four `evaluate` metrics that `compute_metrics` reports.
# They are loaded once here so that evaluation passes reuse the same objects.
accuracy_metric, f1_score, precision_metric, recall_metric = map(
    evaluate.load, ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy / precision / recall / F1.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is reduced with
            an argmax over axis 1, so it is presumably a 2-D array of per-class
            scores (samples x classes) — confirm against the Trainer output.

    Returns:
        dict: Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"`` mapped
        to the values reported by the corresponding module-level metric objects.
    """
    scores, labels = eval_pred
    # Highest-scoring class per sample becomes the predicted label.
    predicted_ids = np.argmax(scores, axis=1)

    # Metric name -> metric object; each metric stores its result under its own name.
    scorers = {
        "accuracy": accuracy_metric,
        "precision": precision_metric,
        "recall": recall_metric,
        "f1": f1_score,
    }
    return {
        name: scorer.compute(predictions=predicted_ids, references=labels)[name]
        for name, scorer in scorers.items()
    }

In [12]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./output",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # Evaluate on the validation set every 100 steps; log every 10 steps.
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=32,
    logging_steps=10,
)

In [35]:
# Binary sentiment head on top of the pretrained checkpoint.
# The label maps are stored in the model config so predictions are reported by name.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'pre_classifie

In [36]:
# Wire the model, training configuration, data and metric callback together.
# The validation split is what the periodic in-training evaluation runs on.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [37]:
# Fine-tune the model. This is the expensive cell of the notebook
# (the recorded run reports a train_runtime of roughly 576 seconds).

trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=1800, training_loss=0.3631111474831899, metrics={'train_runtime': 575.8673, 'train_samples_per_second': 25.006, 'train_steps_per_second': 3.126, 'total_flos': 1907530540646400.0, 'train_loss': 0.3631111474831899, 'epoch': 2.0})

In [38]:
# Score the fine-tuned model on the test split, which was not passed to the Trainer
# for training or periodic evaluation; the reported metrics come from `compute_metrics`.
trainer.evaluate(test_dataset)

{'eval_loss': 0.4391825795173645,
 'eval_accuracy': 0.88,
 'eval_precision': 0.8808080808080808,
 'eval_recall': 0.8772635814889336,
 'eval_f1': 0.8790322580645161,
 'eval_runtime': 14.1452,
 'eval_samples_per_second': 141.391,
 'eval_steps_per_second': 4.454,
 'epoch': 2.0}

In [47]:
# Persist the fine-tuned weights and config.
trainer.save_model("./trained_model")
# The Trainer above was created without a tokenizer, so `save_model` writes no
# tokenizer files. Save the tokenizer into the same directory so the artefact can be
# reloaded on its own (e.g. from a fresh kernel) without re-downloading MODEL_NAME.
tokenizer.save_pretrained("./trained_model");

### Feature importance

In [39]:
# Explain the prediction for one review (positional index 1 of the loaded sample).
# `visualize` fills the "True Label" column with the predicted class when `true_class`
# is omitted, which makes every prediction look correct; pass the real label instead.
# NOTE(review): assumes `dataset_df` has the same `label` column as the split frames — confirm.
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
sample = dataset_df.iloc[1]
word_attributions = cls_explainer(sample["review_body"])
cls_explainer.visualize(
    "distilbert_viz.html",
    true_class=model.config.id2label[int(sample["label"])],
);

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,NEGATIVE (1.00),NEGATIVE,2.71,"[CLS] The so ##cks are cut ##e but not usa ##ble for barre class , the fa ##bric is too think and the small was way to loose fit ##ting on my size 7 . 5 foot , I kept sl ##ip ##ping during plan ##ks and the so ##cks were falling off of my foot . I so wanted to find a more af ##ford ##able alternative to the Pure Barre brand ##ed so ##cks . But these just didn [UNK] t do it . [SEP]"
,,,,


In [40]:
# Explain the prediction for a second review (positional index 110 of the loaded sample).
# Two fixes compared with a plain `visualize("distilbert_viz.html")` call:
#   * the real label is passed as `true_class`; otherwise the "True Label" column
#     just repeats the predicted class;
#   * the HTML goes to its own file so it does not overwrite the visualisation
#     written for the other explained review.
# NOTE(review): assumes `dataset_df` has the same `label` column as the split frames — confirm.
cls_explainer = SequenceClassificationExplainer(model, tokenizer)
sample = dataset_df.iloc[110]
word_attributions = cls_explainer(sample["review_body"])
cls_explainer.visualize(
    "distilbert_viz_110.html",
    true_class=model.config.id2label[int(sample["label"])],
);

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,POSITIVE (1.00),POSITIVE,2.84,"[CLS] really sat ##isfied with it ! good price for it , good quality also [SEP]"
,,,,


### Inference

In [42]:
# Free-form example review (not drawn from `dataset_df`) used to smoke-test inference.
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

In [48]:
# Reload the fine-tuned weights from disk (rather than reusing the in-memory `model`)
# to check that the saved artefact works end to end, then classify the example text.
# `pipeline` is imported in the notebook's import cell with the other transformers
# names, so this cell no longer carries its own mid-notebook import.
classifier = pipeline(
    "sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained("./trained_model/"),
    tokenizer=tokenizer,
)
classifier(text)

[{'label': 'POSITIVE', 'score': 0.996570348739624}]