In [None]:
!pip install datasets;
!pip install evaluate;

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/507.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/507.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dil

In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
from tqdm.auto import tqdm
import evaluate

In [None]:
# hyperparameters
lr = 5e-5                       # learning rate handed to AdamW
num_epochs = 8                  # full passes over the training split
batch_size = 8                  # examples per batch for both dataloaders
model_name = "bert-base-cased"  # checkpoint loaded for tokenizer and model

# Seed torch's global RNG before the model is created so that the newly
# initialised classifier head and the shuffled training order repeat across
# "Restart & Run All". (GPU kernels may still introduce small non-determinism.)
seed = 42
torch.manual_seed(seed);


In [None]:
# Load the pretrained tokenizer and encoder. The sequence-classification head
# is created with one output per emotion class (eight in this task).
NUM_LABELS = 8

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the conversation-level JSON files and flatten them into one row per
# utterance, with the emotion strings replaced by integer class ids.
data_files = {
    "train": "MaSaC_train_erc_task1.json",
    "validation": "MaSaC_validation_erc_task1.json"
}

dataset = load_dataset("json", data_files=data_files)
dataset = dataset.remove_columns("episode")

# NOTE: LabelEncoder assigns ids in sorted (alphabetical) order of the class
# names, not in the order they are listed here.
labels = ['anger', 'neutral', 'contempt', 'sadness', 'fear', 'disgust', 'joy', 'surprise']
label_encoder = LabelEncoder()
label_encoder.fit(labels)


combined_dataset = {}
for dataset_type in dataset:
    # Accumulate plain Python lists per column and build the Dataset once per
    # split, instead of creating a one-record Dataset for every conversation
    # and concatenating inside the loop.
    columns = {}
    for record in dataset[dataset_type]:
        record["emotions"] = label_encoder.transform(record["emotions"]).tolist()

        # Every column of a record must describe the same utterances; fail
        # loudly rather than silently misalign rows after flattening.
        lengths = {name: len(values) for name, values in record.items()}
        if len(set(lengths.values())) > 1:
            raise ValueError(f"Column lengths differ within a record: {lengths}")

        for name, values in record.items():
            columns.setdefault(name, []).extend(values)

    # A split without any records is left out of the result.
    if columns:
        combined_dataset[dataset_type] = Dataset.from_dict(columns)

dataset_dict = DatasetDict(combined_dataset)


In [None]:
def tokenize_function(example):
    """Tokenize a batch of utterances.

    Sequences are truncated and padded to a fixed length ("max_length"), so
    every row ends up with tensors of the same size.
    """
    return tokenizer(example["utterances"], truncation=True, padding="max_length")


# Tokenize, expose the targets under the name the model expects ("labels"),
# and drop the raw text columns that cannot be turned into tensors.
tokenized_dataset = (
    dataset_dict
    .map(tokenize_function, batched=True)
    .rename_column("emotions", "labels")
    .remove_columns(["speakers", "utterances"])
)

tokenized_dataset.set_format("torch")
print(tokenized_dataset)



Map:   0%|          | 0/8506 [00:00<?, ? examples/s]

Map:   0%|          | 0/1354 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8506
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1354
    })
})


In [None]:
# Only the training split is shuffled; validation keeps its original order.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=batch_size,
    shuffle=True,
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=batch_size,
)

optimizer = AdamW(model.parameters(), lr=lr)

# The scheduler is stepped once per batch, so its horizon is the total number
# of batches seen over all epochs. No warm-up is used.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

cuda


In [None]:
# The bar counts optimizer steps across all epochs, not epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch carries a "labels" entry, so the forward pass returns the loss.
        loss = model(**batch).loss
        loss.backward()

        # Update the weights, advance the LR schedule, then clear gradients
        # ready for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  0%|          | 0/8512 [00:00<?, ?it/s]

In [None]:
# Accuracy on the validation split.
metric = evaluate.load("accuracy")
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # Predicted class = index of the largest logit.
        predictions = model(**batch).logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.4342688330871492}

In [None]:
# Micro-averaged F1 on the validation split. With exactly one predicted class
# per example, micro-F1 coincides with accuracy.
metric = evaluate.load("f1")
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        class_scores = model(**batch).logits
        predictions = class_scores.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute(average="micro")

{'f1': 0.4342688330871492}

In [None]:
# Single pass over the validation split that feeds both metrics at once.
metric1 = evaluate.load("accuracy")
metric2 = evaluate.load("f1")
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        predictions = model(**batch).logits.argmax(dim=-1)
        references = batch["labels"]
        for running_metric in (metric1, metric2):
            running_metric.add_batch(predictions=predictions, references=references)


m1 = metric1.compute()
m2 = metric2.compute(average="micro")
print(f"accuracy: {m1}")
print(f"f1: {m2}")


accuracy: {'accuracy': 0.4342688330871492}
f1: {'f1': 0.4342688330871492}


In [None]:
# Save the fine-tuned weights and config in Hugging Face format, and store the
# tokenizer files in the same directory so the checkpoint can be reloaded with
# `from_pretrained(save_dir)` on its own.
save_dir = f'hf_t10_t1_{num_epochs}_epoch_1_{model_name}'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir);


In [None]:
# Pickle the whole model object (not just its state_dict) to a .pth file.
# NOTE(review): loading this file requires the same class definitions to be
# importable; confirm that consumers expect a full-model pickle.
checkpoint_path = f'pt_t10_t1_{num_epochs}_epoch_1_{model_name}.pth'
torch.save(model, checkpoint_path)
