In [1]:
import json
from pathlib import Path

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

from mva_snlp_canine.nli.defaults import (
    MODEL_LIST,
    MODEL_POSTFIX,
    N_JOBS,
    NO_PBAR,
    NUM_LABELS,
    TOKEN,
    TOKENIZED_HUB_PATH,
    TRAINING_HUB_PATH,
    TRAINING_KWARGS_PATH,
    TRAINING_OUTPUT_DIR,
)

In [22]:
clf_metrics = evaluate.combine(
    [
        evaluate.load("accuracy"),
        evaluate.load("f1", average="macro"),
        evaluate.load("precision", average="macro"),
        evaluate.load("recall", average="macro"),
    ]
)

In [23]:
clf_metrics.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [13]:
experiment_name = "exp1"
dataset_name_or_path = TOKENIZED_HUB_PATH


args_file = Path(TRAINING_KWARGS_PATH)
if args_file.exists():
    with args_file.open() as f:
        training_kwargs = json.load(f)
else:
    raise ValueError(f"File {args_file} does not exist.")
print(f"--- Loading training arguments from {TRAINING_KWARGS_PATH}")
print(f"--- Training arguments: {training_kwargs}")

for model_name_or_path, postfix in zip(MODEL_LIST, MODEL_POSTFIX):
    print(f"Training for {model_name_or_path}...")

    tokenized_dataset_path = TOKENIZED_HUB_PATH.format(
        experiment_name=experiment_name, postfix=postfix
    )
    experiment_output_dir = TRAINING_OUTPUT_DIR.format(
        experiment_name=experiment_name, postfix=postfix
    )
    experiment_hub_path = TRAINING_HUB_PATH.format(
        experiment_name=experiment_name, postfix=postfix
    )

--- Loading training arguments from mva_snlp_canine/nli/default_training_args.json
--- Training arguments: {'evaluation_strategy': 'epoch', 'overwrite_output_dir': False, 'gradient_checkpointing': True, 'fp16': True, 'do_train': True, 'do_eval': True, 'learning_rate': 5e-05, 'gradient_accumulation_steps': 1, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'num_train_epochs': 3}
Training for bert-base-multilingual-cased...
Training for google/canine-s...
Training for google/canine-c...


In [14]:
tokenized_dataset_path

'Gwatk/exp1_xnli_subset_tokenized_canine_c'

In [22]:
print(f"--- Loading the dataset from {tokenized_dataset_path}...")
dataset = load_dataset(tokenized_dataset_path)

print(f"--- Loading the model {model_name_or_path}...")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path, num_labels=3
)

print(f"--- Loading the tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer, max_length=2048, padding=True, pad_to_multiple_of=4
)

print(f"--- Cuda is available: {torch.cuda.is_available()}")

print("--- Preparing the training...")
if experiment_hub_path:
    exp_name = experiment_hub_path
    push_to_hub = True
else:
    exp_name = experiment_output_dir
    push_to_hub = False

training_args = TrainingArguments(
    output_dir=exp_name,
    push_to_hub=push_to_hub,
    **training_kwargs,
)

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return clf_metrics.compute(predictions=predictions, references=labels)

--- Loading the dataset from Gwatk/exp1_xnli_subset_tokenized_canine_c...


Found cached dataset parquet (/home/gwatk/.cache/huggingface/datasets/Gwatk___parquet/Gwatk--exp1_xnli_subset_tokenized_canine_c-be96dc45d10f252a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

--- Loading the model google/canine-c...


Some weights of CanineForSequenceClassification were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Loading the tokenizer...


Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


--- Cuda is available: True
--- Preparing the training...


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [24]:
print(f"--- Training the model, pushing to {exp_name}...")
trainer.train()

--- Training the model, pushing to Gwatk/exp1_xnli_subset_finetuned_canine_c...




  0%|          | 0/11250 [00:00<?, ?it/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 986.00 MiB (GPU 0; 7.79 GiB total capacity; 4.47 GiB already allocated; 1020.81 MiB free; 4.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.push_to_hub()