<a href="https://colab.research.google.com/github/davidandw190/faas-dl-inference/blob/main/notebooks/multi_label_sentiment_anlaysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Multi-Label Sentiment Analysis

This is an improved version of the sentiment analysis model prototype. The main modifications are:
 - it uses the `go_emotions` dataset with 28 emotions, instead of the `emotion` dataset used previously, which had only 6 emotions.
 - it uses `microsoft/xtremedistil-l6-h384-uncased` (6 layers, hidden size 384) instead of `microsoft/xtremedistil-l6-h256-uncased` (6 layers, hidden size 256)
 - it switches from the prior single-label classification to multi-label classification (a text can express several emotions at once, which is more realistic)


*dataset adjustments were inspired by: https://colab.research.google.com/drive/1aue7x525rKy6yYLqqt-5Ll96qjQvpqS7#scrollTo=Dcw8-k4lO5Yk

In [None]:
# Install the third-party packages used below for data loading, training, and ONNX export/quantization.
%pip install datasets transformers onnx onnxruntime

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path
from onnxruntime.quantization import quantize_dynamic, QuantType
from google.colab import files

In [None]:
# Pick the compute device: the first CUDA GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Hugging Face checkpoint that the notebook fine-tunes.
model_name = 'microsoft/xtremedistil-l6-h384-uncased'

# Label names. Their order matters: `prepare_dataset` builds each example's
# label vector by reading these columns in exactly this sequence.
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

In [None]:
# Tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# "raw" configuration of the go_emotions dataset; `prepare_dataset` reads one
# column per emotion name from it.
ds = load_dataset(path="go_emotions", name="raw")

def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of ``examples`` with the notebook's tokenizer.

    The tokenizer is asked to pad to, and truncate at, a maximum length of 64.

    Args:
        examples: Mapping with a ``"text"`` key. ``prepare_dataset`` passes this
            function to ``dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=64,
    )

In [None]:
def prepare_dataset(dataset):
    """Turn the raw dataset into tokenized examples with float label vectors.

    Steps, in order:
      1. gather the per-emotion columns (one per name in ``emotions``) into a
         single ``labels`` list per example;
      2. tokenize with ``tokenize_function`` and drop every pre-existing column
         except ``labels``;
      3. switch the output format to ``"torch"``;
      4. cast ``labels`` to ``torch.float``.

    Args:
        dataset: Object exposing ``map``, ``set_format``, ``rename_column`` and a
            ``"train"`` entry with ``column_names`` (presumably a
            ``datasets.DatasetDict`` -- confirm against the caller).

    Returns:
        The processed dataset; only the tokenizer outputs and ``labels`` remain
        as columns.
    """

    def _gather_labels(example):
        # One entry per emotion, in the order of the `emotions` list.
        return {"labels": [example[emotion] for emotion in emotions]}

    def _cast_labels(example):
        return {"float_labels": example["labels"].to(torch.float)}

    labelled = dataset.map(_gather_labels)

    # Everything that existed before tokenization goes away, except the labels.
    obsolete_columns = [
        column for column in labelled["train"].column_names if column != "labels"
    ]
    tokenized = labelled.map(
        tokenize_function, batched=True, remove_columns=obsolete_columns
    )
    tokenized.set_format("torch")

    # Cast into a temporary column, drop the original one, then restore the name.
    with_float_labels = tokenized.map(_cast_labels, remove_columns=["labels"])
    return with_float_labels.rename_column("float_labels", "labels")

In [None]:
# Run the preprocessing defined by `prepare_dataset` on the loaded dataset.
ds_enc = prepare_dataset(ds)
# Print the feature schema of the train split as a quick sanity check.
print(ds_enc['train'].features)

In [None]:
# Load the pretrained checkpoint for sequence classification with one output
# per entry in `emotions`; `problem_type` marks the task as multi-label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(emotions),
)
# Move the model to the device selected earlier.
model = model.to(device)

In [None]:
# Fine-tuning configuration. No evaluation runs during training: that is the
# library default, so the deprecated `evaluation_strategy` keyword (renamed
# `eval_strategy` in newer transformers releases) is simply not passed.
training_args = TrainingArguments(
    output_dir="improved_sentiment_classifier",
    per_device_train_batch_size=128,
    num_train_epochs=4,
    learning_rate=3e-05,
)

# Only a training set is supplied, matching the "no evaluation" setup above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_enc['train'],
)

# Kept as the cell's last expression so its return value is displayed.
trainer.train()

In [None]:
# Directory that receives the fine-tuned model and the tokenizer files;
# the ONNX export cell loads both back from this path.
model_path = "improved_sentiment_classifier"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Rebuild an inference pipeline from the saved model and tokenizer. The first
# argument of `transformers.pipeline` is a task identifier, so it must be a
# registered task name ("text-classification"), not the model's name.
pipeline = transformers.pipeline("text-classification", model=model_path, tokenizer=model_path)

# Export the pipeline's PyTorch model to ONNX (opset 14).
onnx_convert.convert_pytorch(
    pipeline,
    opset=14,
    output=Path("improved_sentiment_classifier.onnx"),
    use_external_format=False
)

In [None]:
# Dynamically quantize the exported ONNX model, using the unsigned 8-bit
# integer weight type, and write the result next to the original file.
quantize_dynamic(
    model_input="improved_sentiment_classifier.onnx",
    model_output="improved_sentiment_classifier-int8.onnx",
    weight_type=QuantType.QUInt8,
)

In [None]:
# Hand the quantized model file to Colab's `files.download` helper.
files.download("improved_sentiment_classifier-int8.onnx")