In [1]:
# imports
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoConfig, BertTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader
from torch import Tensor

import os
from dotenv import load_dotenv
from tqdm import tqdm
import evaluate
import wandb as wandb

In [2]:
# load env variables
# Reads a .env file into the process environment; the next cell expects
# HF_HOME and TRANSFORMERS_CACHE to be present in os.environ afterwards.
load_dotenv()

True

In [3]:
# Cache locations taken from the environment; a missing variable raises KeyError.
hf_home_dir = os.environ['HF_HOME']
transformer_dir = os.environ['TRANSFORMERS_CACHE']
# One path per line, same text as two separate print calls.
print(hf_home_dir, transformer_dir, sep="\n")

/data/users/dhananjay/.cache/huggingface
/data/users/dhananjay/.cache/huggingface


In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
# Load the SNLI_French dataset, using hf_home_dir as the datasets cache.
# Alternative dataset id kept for reference: 'Brendan/nlp244_french_snli'
french_data = load_dataset(
    'dhananjay1210/SNLI_French',
    cache_dir=hf_home_dir,
)

Found cached dataset parquet (/data/users/dhananjay/.cache/huggingface/dhananjay1210___parquet/dhananjay1210--SNLI_French-89304141788bdf04/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Pull the three named splits out of the loaded dataset in one step.
train, val, test = (
    french_data[split_name] for split_name in ('train', 'validation', 'test')
)

In [7]:
# load model
# NOTE: `config` is loaded here but is not passed to the model constructor below.
config = AutoConfig.from_pretrained("cmarkea/distilcamembert-base")
tokenizer = AutoTokenizer.from_pretrained("cmarkea/distilcamembert-base")

# Sequence-classification model with three labels; each label id maps to itself.
model = AutoModelForSequenceClassification.from_pretrained(
    "cmarkea/distilcamembert-base",
    num_labels=3,
    id2label={0: 0, 1: 1, 2: 2},
)
# Move the model onto the device selected earlier in the notebook.
model = model.to(device)

Some weights of the model checkpoint at cmarkea/distilcamembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classif

In [8]:
# tokenize and batchify data
def _encode_pairs(batch):
    """Tokenize the 'premise' and 'hypothesis' columns of a batch as text pairs.

    Truncation and padding are both enabled, so every example in one mapped
    batch comes back with the same length.
    """
    return tokenizer(batch['premise'], batch['hypothesis'], truncation=True, padding=True)


# Both splits go through the same encoder, 256 examples per mapped batch.
train = train.map(_encode_pairs, batched=True, batch_size=256)
val = val.map(_encode_pairs, batched=True, batch_size=256)

Loading cached processed dataset at /data/users/dhananjay/.cache/huggingface/dhananjay1210___parquet/dhananjay1210--SNLI_French-89304141788bdf04/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-c12e0c244ae1118e.arrow


Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

In [9]:
# Display the training split summary (feature names and row count) after tokenization.
train

Dataset({
    features: ['premise', 'hypothesis', 'label', 'input_ids', 'attention_mask'],
    num_rows: 100000
})

In [10]:
# Return only the token ids, attention mask and label, formatted as "pt" tensors.
for split_ds in (train, val):
    split_ds.set_format(type="pt", columns=['input_ids', 'attention_mask', 'label'])

In [11]:
# Inspect the first training example as returned after set_format.
train[0]

{'label': tensor(1),
 'input_ids': tensor([    5,   180,   314,    15,  2833, 13600,    32,    23,  6439,   570,
          6378,   958,     9,     6,     6,   180,   314,  6431,    58,  2833,
            24,    23,  1477,     9,     6,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [12]:
# Load the F1 metric. The averaging mode is an argument of compute(), not of
# the loader, and `metric` already passes average="micro" on every call, so
# no `average` keyword is given here.
f1_metric = evaluate.load("f1")

In [13]:
# define evaluation
def metric(eval_prediction):
    """Score one evaluation pass with micro-averaged F1.

    Args:
        eval_prediction: object exposing ``predictions`` (per-class scores,
            reduced with argmax over axis 1) and ``label_ids`` (references).

    Returns:
        The result of ``f1_metric.compute`` called with ``average="micro"``.
    """
    predicted_classes = eval_prediction.predictions.argmax(axis=1)
    return f1_metric.compute(
        predictions=predicted_classes,
        references=eval_prediction.label_ids,
        average="micro",
    )

In [14]:
output_checkpoint_dir = "./checkpoints"

# Step-based schedule: evaluate and checkpoint every 128 steps, keep at most
# five checkpoints, and reload the best-"f1" checkpoint when training ends.
training_args: TrainingArguments = TrainingArguments(
    output_dir=output_checkpoint_dir,
    # phases
    do_train=True,
    do_eval=True,
    do_predict=True,
    # optimisation budget
    num_train_epochs=10,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=128,
    dataloader_num_workers=0,
    # evaluation and logging cadence
    evaluation_strategy="steps",
    eval_steps=128,
    logging_steps=50,
    # checkpointing
    save_strategy="steps",
    save_steps=128,
    save_total_limit=5,
    # best-model selection
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [16]:
 trainer: Trainer = Trainer(
        # Fine-tune `model` under `training_args`; `val` is scored with
        # `metric` at every evaluation.
        model=model,
        args=training_args,
        # No explicit collator is supplied; Trainer chooses its default one.
        data_collator=None, 
        train_dataset=train,
        eval_dataset=val,
        # The tokenizer is handed to Trainer as well; the training log shows
        # its files being saved alongside every checkpoint.
        tokenizer=tokenizer,
        compute_metrics=metric,
    )

In [17]:
# train model
trainer.train()
# Rebind `model` to the instance held by the trainer once training is done
# (training_args sets load_best_model_at_end=True).
model = trainer.model

The following columns in the training set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100000
  Num Epochs = 10
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 3910
  Number of trainable parameters = 68097027
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mdsonawan[0m ([33mnlp244_quest[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
128,0.871,0.756315,0.670697
256,0.7411,0.690731,0.704735
384,0.7188,0.666707,0.722211
512,0.6577,0.656913,0.723227
640,0.6547,0.661402,0.723837
768,0.6429,0.638541,0.737553
896,0.5899,0.633729,0.745479
1024,0.5794,0.637754,0.739687
1152,0.5799,0.627898,0.74304
1280,0.5139,0.644374,0.74873


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9842
  Batch size = 128
Saving model checkpoint to ./checkpoints/checkpoint-128
Configuration saved in ./checkpoints/checkpoint-128/config.json
Model weights saved in ./checkpoints/checkpoint-128/pytorch_model.bin
tokenizer config file saved in ./checkpoints/checkpoint-128/tokenizer_config.json
Special tokens file saved in ./checkpoints/checkpoint-128/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely

tokenizer config file saved in ./checkpoints/checkpoint-1408/tokenizer_config.json
Special tokens file saved in ./checkpoints/checkpoint-1408/special_tokens_map.json
Deleting older checkpoint [checkpoints/checkpoint-768] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9842
  Batch size = 128
Saving model checkpoint to ./checkpoints/checkpoint-1536
Configuration saved in ./checkpoints/checkpoint-1536/config.json
Model weights saved in ./checkpoints/checkpoint-1536/pytorch_model.bin
tokenizer config file saved in ./checkpoints/checkpoint-1536/tokenizer_config.json
Special tokens file saved in ./checkpoints/checkpoint-1536/special_tokens_map.json
Deleting older checkp

Special tokens file saved in ./checkpoints/checkpoint-2688/special_tokens_map.json
Deleting older checkpoint [checkpoints/checkpoint-2176] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9842
  Batch size = 128
Saving model checkpoint to ./checkpoints/checkpoint-2816
Configuration saved in ./checkpoints/checkpoint-2816/config.json
Model weights saved in ./checkpoints/checkpoint-2816/pytorch_model.bin
tokenizer config file saved in ./checkpoints/checkpoint-2816/tokenizer_config.json
Special tokens file saved in ./checkpoints/checkpoint-2816/special_tokens_map.json
Deleting older checkpoint [checkpoints/checkpoint-2304] due to args.save_total_limit
The following colu

In [19]:
# tokenize and batchify test data
# Same encoding as the train/validation splits: premise/hypothesis pairs,
# truncated and padded, 256 examples per mapped batch.
test = test.map(
    lambda batch: tokenizer(
        batch['premise'],
        batch['hypothesis'],
        truncation=True,
        padding=True,
    ),
    batched=True,
    batch_size=256,
)

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

In [21]:
# Return only the token ids, attention mask and label, formatted as "pt" tensors.
test.set_format(columns=['input_ids', 'attention_mask', 'label'], type="pt")

In [23]:
# evaluate test data
# Metrics come back keyed with the "test" prefix (e.g. test_loss, test_f1).
trainer.evaluate(eval_dataset=test, metric_key_prefix="test")

The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9824
  Batch size = 128


{'test_loss': 0.6925486922264099,
 'test_f1': 0.7487785016286646,
 'test_runtime': 2.691,
 'test_samples_per_second': 3650.658,
 'test_steps_per_second': 28.614,
 'epoch': 10.0}