## Fine-Tuning a Transformer

In [None]:
!pip install datasets
!pip install huggingface_hub
!pip install transformers
!pip install umap-learn
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 5.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 60.5 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 37.6 MB/s 
Collecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 4.4 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 42.2 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Dow

In [None]:
# Report the total RAM of this runtime and whether it clears the 20 GB
# threshold this notebook treats as "high-RAM".
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

print('You are using a high-RAM runtime!' if ram_gb >= 20
      else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Not connected to a GPU


## Import Dataset from 🤗 Hub

In [None]:
# Authenticate with the Hugging Face Hub. notebook_login() asks for the access
# token interactively, so the token must never be written into the notebook.
# SECURITY: an access token was previously committed on this line; treat it as
# compromised and revoke it at https://huggingface.co/settings/tokens.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
# Download the labelled corpus from the Hugging Face Hub. Later cells read its
# "train" and "test" splits.
from datasets import load_dataset
dataset = load_dataset("cjbarrie/valmasress")



Downloading and preparing dataset csv/cjbarrie--valmasress to /root/.cache/huggingface/datasets/cjbarrie___csv/cjbarrie--valmasress-b5722d98f47799b3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.67M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/cjbarrie___csv/cjbarrie--valmasress-b5722d98f47799b3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Colab-only: switch on Colab's data_table formatter for DataFrame display.
# This import fails outside Google Colab.
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer and the model loaded further down;
# it is also used to build the fine-tuned model's output/repository name.
model_ckpt = "bert-base-multilingual-uncased"
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

In [None]:
# NOTE(review): disabled alternative that loads the bare encoder via AutoModel.
# The following cell loads AutoModelForSequenceClassification instead, so this
# cell is dead code and a candidate for deletion.
# from transformers import AutoModel
# import torch

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Number of target classes for the classification head.
num_labels = 4

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint with a sequence-classification head sized to
# num_labels, then move the model onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [None]:
# Human-readable names of the four target classes.
# NOTE(review): presumably listed in the order of the dataset's integer
# `label` ids (0-3) — confirm against the dataset before using as a mapping.
labels = ['neutral', 'critical', 'uncritical', 'not-applicable']

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            argmax over the last axis of ``predictions``.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``f1`` uses
        ``average="weighted"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch with the notebook's tokenizer.

    Padding and truncation are both enabled, so every encoded example in the
    batch is returned with padding applied by the tokenizer.

    Args:
        batch: Mapping that holds the raw strings under the ``"text"`` key.

    Returns:
        The tokenizer's output for ``batch["text"]``.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
# Tokenize every split. batch_size=None hands each split to tokenize() as a
# single batch (the progress bars below show 1 batch per split).
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Log about once per epoch. The max(1, ...) guard keeps logging_steps valid
# when the training split has fewer than batch_size rows, where the floor
# division alone would yield 0.
logging_steps = max(1, len(dataset_encoded["train"]) // batch_size)
# Output directory for checkpoints; with push_to_hub=True the training output
# below shows a Hub repository of the same name being cloned.
model_name = f"{model_ckpt}-finetuned-masress"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level="error")

In [None]:
# Inspect the column schema of the raw (un-tokenized) test split.
test_ds = dataset["test"]
print(test_ds.features)

{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}


In [None]:
from transformers import Trainer

# Assemble the Trainer from the model, arguments, tokenized splits and metric
# function defined above. The "test" split doubles as the per-epoch evaluation
# set, so the reported scores are not from a separate held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Trailing semicolon suppresses the repr of train()'s return value.
trainer.train();

Cloning https://huggingface.co/cjbarrie/bert-base-multilingual-uncased-finetuned-masress into local empty directory.


Download file pytorch_model.bin:   0%|          | 1.47k/638M [00:00<?, ?B/s]

Download file runs/Oct24_14-58-57_073a6d3fc115/events.out.tfevents.1666623830.073a6d3fc115.146.0:  33%|###3   …

Download file training_args.bin: 100%|##########| 3.36k/3.36k [00:00<?, ?B/s]

Download file runs/Oct24_14-58-57_073a6d3fc115/1666623830.7229273/events.out.tfevents.1666623830.073a6d3fc115.…

Download file runs/Oct23_17-56-35_91d4c8a8507f/events.out.tfevents.1666547804.91d4c8a8507f.2223.0:  64%|######…

Download file runs/Oct23_17-56-35_91d4c8a8507f/1666547804.891664/events.out.tfevents.1666547804.91d4c8a8507f.2…

Clean file runs/Oct24_14-58-57_073a6d3fc115/events.out.tfevents.1666623830.073a6d3fc115.146.0:  18%|#8        …

Clean file training_args.bin:  30%|##9       | 1.00k/3.36k [00:00<?, ?B/s]

Clean file runs/Oct24_14-58-57_073a6d3fc115/1666623830.7229273/events.out.tfevents.1666623830.073a6d3fc115.146…

Clean file runs/Oct23_17-56-35_91d4c8a8507f/events.out.tfevents.1666547804.91d4c8a8507f.2223.0:  18%|#8       …

Clean file runs/Oct23_17-56-35_91d4c8a8507f/1666547804.891664/events.out.tfevents.1666547804.91d4c8a8507f.2223…

Clean file pytorch_model.bin:   0%|          | 1.00k/638M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1646,1.062585,0.558836,0.556641
2,0.9281,0.980019,0.586902,0.579161
3,0.8269,1.013402,0.59106,0.577498
4,0.7335,1.064421,0.586071,0.581589
5,0.6786,1.094562,0.57817,0.576931


In [None]:
# Upload the fine-tuned model and training artifacts to the trainer's Hub
# repository; the returned commit URL is shown as the cell output.
trainer.push_to_hub(commit_message="Test training multilingual bert")

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/638M [00:00<?, ?B/s]

Upload file runs/Oct31_20-17-16_07b4055adb3d/events.out.tfevents.1667247749.07b4055adb3d.81.0:  48%|####7     …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/cjbarrie/bert-base-multilingual-uncased-finetuned-masress
   ed1158a..a8789cc  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/cjbarrie/bert-base-multilingual-uncased-finetuned-masress
   ed1158a..a8789cc  main -> main

To https://huggingface.co/cjbarrie/bert-base-multilingual-uncased-finetuned-masress
   a8789cc..8c20fa9  main -> main

   a8789cc..8c20fa9  main -> main



'https://huggingface.co/cjbarrie/bert-base-multilingual-uncased-finetuned-masress/commit/a8789cc7965fef8d9bae1fff07c69ab0e92f22f3'

In [None]:
# Run the fine-tuned model over the tokenized test split. This is the same
# split used as eval_dataset during training.
preds_output = trainer.predict(dataset_encoded["test"])

In [None]:
# Show the loss, accuracy, weighted F1 and throughput figures for the test split.
preds_output.metrics

{'test_loss': 1.0945619344711304,
 'test_accuracy': 0.5781704781704782,
 'test_f1': 0.5769310469298662,
 'test_runtime': 1355.7918,
 'test_samples_per_second': 3.548,
 'test_steps_per_second': 0.056}