In [5]:
import torch

# Ask PyTorch to drop its cached CUDA memory before anything else in the notebook runs.
torch.cuda.empty_cache()

In [6]:
import json
import os

import evaluate
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    LongformerTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

In [9]:
# Run settings read by the later cells: checkpoint name, per-device batch
# size and number of training epochs.
config = dict(
    model_name='allenai/longformer-base-4096',
    batch_size=32,
    num_epochs=1,
)

In [10]:
# Upper bound on tokens per example; handed to the tokenizer as `max_length`
# together with truncation=True in preprocess_function.
max_length = 512

In [40]:
import shutil

# Clear PyTorch's CUDA cache before (re)building everything in this cell.
torch.cuda.empty_cache()

# Pull the three run settings out of the shared config dict in one go.
model_name, BATCH_SIZE, NUM_EPOCHS = (
    config[key] for key in ("model_name", "batch_size", "num_epochs")
)

# Output directory name: the checkpoint's short name (text after the last
# "/", dashes turned into underscores) followed by the task/optimizer suffixes.
model_path = "".join(
    (
        model_name.rsplit("/", 1)[-1].replace("-", "_"),
        "_text_classification_emotion",
        "_adamw",
    )
)

# Remove any existing ./emotion directory so load_dataset starts from an
# empty cache_dir.
if os.path.exists('emotion'):
    shutil.rmtree('emotion')

emotion = load_dataset("dair-ai/emotion", cache_dir='emotion')


def filter_labels(example, keep=(0, 1)):
    """Tell whether a dataset row belongs to one of the classes to keep.

    Intended as the predicate for ``Dataset.filter``.

    Args:
        example: A single dataset row; only its ``'label'`` entry is read.
        keep: Collection of label ids to retain. Defaults to ``(0, 1)``, the
            two ids this notebook names SADNESS and JOY, which preserves the
            previous hard-coded behaviour.

    Returns:
        bool: True when ``example['label']`` is contained in ``keep``.

    Raises:
        KeyError: If the row has no ``'label'`` entry.
    """
    return example['label'] in keep

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Relies on the notebook-level ``tokenizer`` and ``max_length``; inputs
    longer than ``max_length`` tokens are truncated.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=max_length, truncation=True)

# Keep only the rows accepted by filter_labels in each of the three splits.
train, valid, test = (
    emotion[split_name].filter(filter_labels)
    for split_name in ('train', 'validation', 'test')
)

tokenizer = LongformerTokenizer.from_pretrained(model_name)

# Tokenize every split in batches and drop the raw 'text' column afterwards.
tokenized_train, tokenized_valid, tokenized_test = (
    subset.map(preprocess_function, batched=True, remove_columns=['text'])
    for subset in (train, valid, test)
)


# import shutil

# torch.cuda.empty_cache()
# model_name = config["model_name"]
# model_path = (
#     model_name.split("/")[-1].replace("-", "_") + "_text_classification_imdb" + "_adamw"
# )
# BATCH_SIZE = config["batch_size"]
# NUM_EPOCHS = config["num_epochs"]

# if os.path.exists('model_path'):
#     shutil.rmtree('model_path')

# imdb = load_dataset("imdb", cache_dir="model_path")
# imdb_train = imdb['train']
# imdb_test = imdb['test']

# tokenizer = LongformerTokenizer.from_pretrained(model_name)

# def preprocess_function(examples):
#     return tokenizer(examples["text"], truncation=True, max_length=max_length)

# tokenized_imdb_train = imdb_train.map(preprocess_function, batched=True, remove_columns=['text'])
# tokenized_imdb_test = imdb_test.map(preprocess_function, batched=True, remove_columns=['text'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10028 [00:00<?, ? examples/s]

Map:   0%|          | 0/1254 [00:00<?, ? examples/s]

Map:   0%|          | 0/1276 [00:00<?, ? examples/s]

In [43]:
# Metric object loaded by name from the `evaluate` library; compute_metrics
# calls its .compute() method.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the notebook-level ``accuracy`` metric.

    Args:
        eval_pred: A two-item sequence ``(predictions, labels)``. The
            predictions are reduced with ``argmax`` along axis 1 before
            scoring (assumes one row of class scores per example — TODO confirm).

    Returns:
        The result of ``accuracy.compute`` for the arg-maxed predictions
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Class names indexed by label id (0 -> SADNESS, 1 -> JOY) ...
id2label = dict(enumerate(("SADNESS", "JOY")))
# ... and the inverse lookup from class name back to label id.
label2id = dict(zip(id2label.values(), id2label.keys()))

# id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [44]:
from transformers import LongformerConfig

# Load the configuration published for the checkpoint named in `model_name`,
# then override two of its fields before a model is built from it.
cfg = LongformerConfig.from_pretrained(model_name)
# NOTE(review): a single int is assigned here; presumably it is applied as the
# attention window of every layer — confirm against the Longformer docs.
cfg.attention_window = 64
# NOTE(review): 514 presumably corresponds to max_length (512) plus a
# 2-position offset in the position embeddings — confirm.
cfg.max_position_embeddings = 514

In [45]:
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification

# id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Describe the two-class head on the config: class count plus the
# id <-> name mappings defined earlier in the notebook.
cfg.num_labels = 2
cfg.id2label = id2label
cfg.label2id = label2id

# Collator built around the same tokenizer that produced the datasets.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): from_config appears to build the model from `cfg` alone,
# i.e. without loading the pretrained checkpoint weights. Confirm that
# training from random initialisation is intended; the ~0.6 accuracies
# reported further down would be consistent with that.
model = AutoModelForSequenceClassification.from_config(cfg)

In [46]:
from copy import deepcopy
from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    """Callback that additionally evaluates on the training set at epoch end.

    Whenever an epoch finishes and the control object asks for an evaluation,
    the wrapped trainer is evaluated on its own ``train_dataset`` and the
    resulting metrics use the ``"train"`` key prefix.
    """

    def __init__(self, trainer) -> None:
        """Keep a reference to the trainer whose training set will be scored."""
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Run the extra training-set evaluation when an evaluation is due.

        Returns:
            A deep copy of ``control`` taken before the extra evaluation, or
            ``None`` when ``control.should_evaluate`` is falsy.
        """
        if not control.should_evaluate:
            return None

        # Snapshot taken before evaluate() and handed back afterwards,
        # presumably so flag changes made during the extra evaluation are
        # not propagated — confirm.
        control_snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return control_snapshot

In [47]:
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo


def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one GPU, in MB.

    The figure is the ``used`` field of the NVML memory info for the device,
    integer-divided by 1024**2.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device that was previously hard-coded.

    NVML is initialised for the query and shut down again afterwards, so
    repeated calls do not pile up unmatched ``nvmlInit()`` calls.
    """
    # Imported locally because the notebook's pynvml import line only brings
    # in the three query functions.
    from pynvml import nvmlShutdown

    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Always balance nvmlInit(), even if the query above raised.
        nvmlShutdown()
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    """Print runtime, throughput and GPU memory usage for a training run.

    Args:
        result: Object with a ``metrics`` mapping that contains
            ``'train_runtime'`` and ``'train_samples_per_second'`` (here the
            value returned by ``trainer.train()``).
    """
    metrics = result.metrics
    runtime = metrics['train_runtime']
    print(f"Time: {runtime:.2f}")
    throughput = metrics['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")
    print_gpu_utilization()

In [48]:
# GPU memory in use before the training cell runs, for comparison with the
# figure printed after training.
print_gpu_utilization()

GPU memory occupied: 4863 MB.


In [49]:
from transformers import logging

# Restrict the transformers logger to error-level messages for this run.
logging.set_verbosity_error()

# Training configuration: evaluate and checkpoint once per epoch, log every
# 10 steps, mixed precision (fp16) with the 'adamw_torch' optimizer.
training_args = TrainingArguments(
    output_dir=model_path,
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    logging_steps=10,
    weight_decay=0.01,
    fp16=True,
    optim='adamw_torch',
    # NOTE(review): presumably uploads the run to the Hugging Face Hub and
    # needs prior authentication — confirm this is wanted for this experiment.
    push_to_hub=True,
)

# Train on the tokenized training split; validate on the tokenized validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# The callback needs the trainer instance itself, so it can only be attached
# after the Trainer has been constructed. It adds a training-set evaluation
# (metrics prefixed "train") at the end of each epoch.
trainer.add_callback(CustomCallback(trainer))

# `result.metrics` is read later by print_summary.
result = trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'loss': 0.8149, 'grad_norm': 4.497572422027588, 'learning_rate': 1.9363057324840767e-05, 'epoch': 0.03184713375796178}
{'loss': 0.6997, 'grad_norm': 1.9697051048278809, 'learning_rate': 1.872611464968153e-05, 'epoch': 0.06369426751592357}
{'loss': 0.7084, 'grad_norm': 1.5512731075286865, 'learning_rate': 1.8089171974522295e-05, 'epoch': 0.09554140127388536}
{'loss': 0.7106, 'grad_norm': 4.72462797164917, 'learning_rate': 1.7452229299363058e-05, 'epoch': 0.12738853503184713}
{'loss': 0.7157, 'grad_norm': 1.624863862991333, 'learning_rate': 1.6815286624203824e-05, 'epoch': 0.1592356687898089}
{'loss': 0.7065, 'grad_norm': 2.189979314804077, 'learning_rate': 1.617834394904459e-05, 'epoch': 0.1910828025477707}
{'loss': 0.6868, 'grad_norm': 1.9691681861877441, 'learning_rate': 1.5541401273885352e-05, 'epoch': 0.2229299363057325}
{'loss': 0.7118, 'grad_norm': 2.9706578254699707, 'learning_rate': 1.4904458598726114e-05, 'epoch': 0.25477707006369427}
{'loss': 0.6955, 'grad_norm': 2.2028503417

In [50]:
# Report runtime, samples/second and the GPU memory in use after training.
print_summary(result)

Time: 70.59
Samples/second: 142.06
GPU memory occupied: 7795 MB.


In [53]:
# From every logged entry that carries a training-set accuracy, keep just the
# loss, the accuracy and the epoch it belongs to.
train_losses = [
    {
        key: entry[key]
        for key in entry
        if key in ("train_loss", "train_accuracy", "epoch")
    }
    for entry in filter(
        lambda log: "train_accuracy" in log, trainer.state.log_history
    )
]

In [54]:
train_losses

[{'train_loss': 0.6493573188781738,
  'train_accuracy': 0.617271639409653,
  'epoch': 1.0}]

In [55]:
def _epoch_metrics(log_history, prefix):
    """Collect the per-epoch loss/accuracy log entries for one metric prefix.

    Args:
        log_history: Iterable of log dictionaries (here
            ``trainer.state.log_history``).
        prefix: Metric prefix to select, e.g. ``"train"`` or ``"eval"``.

    Returns:
        pandas.DataFrame with the columns ``<prefix>_loss``,
        ``<prefix>_accuracy`` and ``epoch`` (integer), holding one row per
        log entry that contains ``<prefix>_accuracy``. The frame has those
        columns even when no entry matches.
    """
    columns = [f"{prefix}_loss", f"{prefix}_accuracy", "epoch"]
    accuracy_key = columns[1]
    rows = [
        {name: entry[name] for name in columns if name in entry}
        for entry in log_history
        if accuracy_key in entry
    ]
    # Passing `columns` keeps the frame (and the epoch cast below) valid when
    # nothing matched; attribute-style `.epoch` access would fail in that case.
    frame = pd.DataFrame(rows, columns=columns)
    # Round before casting: a bare astype(int) truncates, so an epoch logged
    # as 0.999... would silently become 0.
    frame["epoch"] = frame["epoch"].astype(float).round().astype(int)
    return frame


# One frame per metric family; both are read from the same trainer log.
train_losses = _epoch_metrics(trainer.state.log_history, "train")
val_losses = _epoch_metrics(trainer.state.log_history, "eval")

# train_losses.to_csv(f"checkpoints/{model_path}/train_losses.csv", index=False)
# val_losses.to_csv(f"checkpoints/{model_path}/val_losses.csv", index=False)

In [56]:
train_losses

Unnamed: 0,train_loss,train_accuracy,epoch
0,0.649357,0.617272,1


In [57]:
val_losses

Unnamed: 0,eval_loss,eval_accuracy,epoch
0,0.660294,0.591707,1


In [58]:
# Run the trained model on the tokenized test split; `output.metrics` is
# displayed in the next cell.
output = trainer.predict(tokenized_test)

# with open(f"checkpoints/{model_path}/test_metrics.json", "w") as f:
    # json.dump(output.metrics, f, indent=4)

# with open(f"checkpoints/{model_path}/config.json", "w") as f:
    # json.dump(config, f, indent=4)

In [59]:
output.metrics

{'test_loss': 0.6605867743492126,
 'test_accuracy': 0.5963949843260188,
 'test_runtime': 2.2935,
 'test_samples_per_second': 556.349,
 'test_steps_per_second': 17.44}