In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import torch
import evaluate
import glob
import numpy as np
import pandas as pd

import warnings 

In [2]:
# Tokenizer for the same BERT checkpoint id that the classifier is loaded from further down.
checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
# Load the dair-ai/emotion dataset.
# Alternative that was tried (presumably needs the `review` column in process_function):
#   dataset = load_dataset("tblard/allocine")
dataset_name = "dair-ai/emotion"
dataset = load_dataset(dataset_name)

In [None]:
# Convert the training split to a pandas DataFrame for quick inspection.
# `Dataset.to_pandas()` converts the split directly instead of letting pandas
# iterate over the dataset one example at a time.
df = dataset["train"].to_pandas()
# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Bare expression: the notebook renders the repr of the training split.
dataset['train']

In [None]:
# Show the split names available in the loaded dataset
# (the cells below use 'train', 'validation' and 'test').
dataset.keys()

In [7]:
def process_function(text):
  """Tokenize the ``'text'`` entry of ``text`` with the notebook-level ``tokenizer``.

  ``text`` is whatever ``dataset.map`` passes in; only its ``'text'`` key is read.
  Truncation is enabled on the tokenizer call. The tokenizer's output is
  returned unchanged.
  """
  # For the allocine dataset the field to read would be text['review'] instead.
  raw_text = text['text']
  encoded = tokenizer(raw_text, truncation=True)
  return encoded

In [None]:
# Tokenize every split. `batched=True` hands `process_function` a batch of
# examples per call, so the tokenizer runs once per batch instead of once per
# row; the resulting columns are the same.
dataset = dataset.map(process_function, batched=True)

In [9]:
# Bind each split of the (tokenized) dataset to its own short name.
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

In [10]:
# Collator built around the tokenizer; handed to the Trainer below so batches
# are assembled (and padded) at batch time.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Number of distinct class ids in the training split. Reading the `label`
# column directly avoids building every full example (tokenized fields
# included) in a Python loop just to look at one value per row.
num_labels = len(set(train_dataset["label"]))
num_labels

In [None]:
# Load the BERT checkpoint as a sequence classifier with `num_labels` output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased", num_labels=num_labels
)

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases (the old keyword is rejected by recent versions). Pick whichever
# name the installed TrainingArguments dataclass defines so this cell runs on
# both older and newer installs.
eval_strategy_key = (
  "eval_strategy"
  if "eval_strategy" in TrainingArguments.__dataclass_fields__
  else "evaluation_strategy"
)

training_args = TrainingArguments(
  output_dir="results",
  # Was 3e-4: roughly 10x above the 2e-5..5e-5 range recommended for
  # fine-tuning BERT, which commonly makes training diverge or collapse to a
  # single class. 3e-5 sits inside the recommended range.
  learning_rate=3e-5,
  per_device_train_batch_size=64,
  per_device_eval_batch_size=8,
  num_train_epochs=10,
  weight_decay=0.01,
  # Checkpoint once per epoch, on the same schedule as evaluation, so the
  # best checkpoint can be restored when training finishes.
  save_strategy="epoch",
  load_best_model_at_end=True,
  save_total_limit=3,
  fp16=False,
  # Evaluate once per epoch (keyword name resolved above).
  **{eval_strategy_key: "epoch"},
)

In [16]:
# Load the "accuracy" metric from `evaluate`.
# NOTE(review): compute_metrics below computes accuracy by hand and has its
# `accuracy.compute(...)` call commented out, so this object is currently unused there.
accuracy = evaluate.load("accuracy")

In [None]:
# Bare expression: the notebook renders the repr of the loaded metric object.
accuracy

In [18]:
def compute_metrics(output):
  """Compute classification accuracy from a ``(scores, labels)`` pair.

  ``output`` is unpacked into two items: per-class scores (arg-maxed along
  axis 1 to get one predicted class per row) and the reference class ids.
  Accuracy is the number of predictions equal to their label divided by the
  number of predictions.

  Returns a dict with a single ``"accuracy"`` entry.
  """
  scores, labels = output
  predicted = np.argmax(scores, axis=1)  # numpy array of predicted class ids
  n_correct = (predicted == labels).sum()
  # Alternative using the `evaluate` metric object:
  #   accuracy.compute(predictions=predicted, references=labels)
  return {"accuracy": n_correct / len(predicted)}

In [None]:
# Sanity check on a toy input: row 0 arg-maxes to class 1 and row 1 to class 0,
# while both labels are 1, so one of the two predictions matches.
sample_scores = np.array([
    [0.8, 0.9, 0.7],
    [0.9, 0.8, 0.7],
])
sample_labels = np.array([1, 1])
compute_metrics((sample_scores, sample_labels))

In [None]:
# Bare expression: only displays the function object's repr; it does not call it.
compute_metrics

In [21]:
# Wire the model, training arguments, train/validation splits, collator and
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training loop configured by `training_args`; the call's return
# value is displayed as the cell output.
trainer.train()