# Sentiment training
This notebook shows how to fine-tune a single shared base model on two sentiment datasets (GP-Sentiment and IMDB), using a separate classification head for each dataset's labels.

In [None]:
import sys

# Ensure we can run examples as-is in the package's poetry env by making the
# parent directory importable. The membership check keeps the cell idempotent:
# re-running it does not pile up duplicate ".." entries on sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
import torch
import transformers
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset, load_metric
from transformers import AutoConfig, AutoModel, AutoTokenizer, TrainingArguments

from grouphug import AutoMultiTaskModel, ClassificationHeadConfig, DatasetFormatter, LMHeadConfig, MultiTaskTrainer
from grouphug.config import logger

# Last expression of the cell, so its value is shown as the cell output:
# whether PyTorch reports CUDA as available in this environment.
torch.cuda.is_available()

## Define which model to fine-tune

In [None]:
# transformers.logging.set_verbosity_info()  # uncomment for more logging
# Checkpoint identifier handed to both the tokenizer loader and the multi-task model loader,
# so the two always refer to the same pretrained model.
base_model = "prajjwal1/bert-tiny"

## Load data

In [None]:
# GP-Sentiment: its "content" column is renamed to "text", the key that is
# later passed to `model.predict`.
gp_raw = load_dataset("IsaacBot/GP-Sentiment")
gp_data = gp_raw.rename_column("content", "text")

# IMDB: its "label" column is renamed to "negpos", the name the second
# classification head is built from.
imdb_raw = load_dataset("imdb")
imdb_data = imdb_raw.rename_column("label", "negpos")

## Define tokenizer and preprocess data

In [None]:
# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Preprocessing recipe: tokenize with max_length=512, then encode the "score" column.
fmt = (
    DatasetFormatter()
    .tokenize(max_length=512)
    .encode("score")
)

# Apply the recipe to both datasets, keyed by task name, keeping the train and test splits.
datasets_by_task = {"gp": gp_data, "imdb": imdb_data}
data = fmt.apply(datasets_by_task, tokenizer=tokenizer, splits=["train", "test"])

## Define model

In [None]:
# As labels are different, we create different classifier heads for each task,
# but the base model is shared.
score_head = ClassificationHeadConfig.from_data(data, "score", classifier_hidden_size=50)
negpos_head = ClassificationHeadConfig.from_data(data, "negpos", classifier_hidden_size=20, weight=2)

head_configs = [score_head, negpos_head]

In [None]:
# Build the multi-task model from the base checkpoint and the head configs.
# The formatter and tokenizer are passed along with it.
model = AutoMultiTaskModel.from_pretrained(
    base_model,
    head_configs,
    formatter=fmt,
    tokenizer=tokenizer,
)

## Train the model

In [None]:
# Directory handed to `TrainingArguments` as `output_dir`.
output_dir = "../output/demo"

# 4 samples per device x 8 gradient-accumulation steps = 32 samples per optimizer step on each device.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    save_total_limit=1,
)

# Select the "train" and "test" split of every dataset in the collection.
train_split = data[:, "train"]
eval_split = data[:, "test"]

trainer = MultiTaskTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_data=train_split,
    eval_data=eval_split,
)

In [None]:
# Run training with the arguments configured above (2 epochs, evaluation strategy "epoch")
# and keep whatever `trainer.train()` returns in `train_res`.
train_res = trainer.train()

## The model's `predict` function takes dicts or entire datasets, preprocesses them, runs inference, and maps the outputs back to labels

In [None]:
# A single example as a dict keyed by the "text" column; the prediction is the
# last expression of the cell, so it is displayed as the cell output.
sample = {"text": "This will predict both things at once, giving probabilities, labels, and predicted ids. Awesome!"}
model.predict(sample)