# Fine-tuning Camembert-ner on JuL lyrics

## Installation

In [1]:
! pip install datasets transformers accelerate evaluate seqeval # HuggingFace 🤗
! pip install sentencepiece # Required for Camembert-ner (slow tokenizer)
! apt install git-lfs # To upload fine-tuned model to HuggingFace Hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Libraries

In [3]:
from datasets import DatasetDict, Dataset
import transformers
from transformers import (AutoTokenizer, 
                          AutoModelForTokenClassification, 
                          TrainingArguments, 
                          Trainer, 
                          DataCollatorForTokenClassification,
                          pipeline)
from transformers.integrations import TensorBoardCallback
import numpy as np
import evaluate

## Functions

In [4]:
def iob_to_dataset(lines, split):
  ''' 
  Function to convert each line of a txt file in the IOB format 
  into the format expected by camembert-ner and HuggingFace dataset 
  '''
  # Define tag to ID mapping
  tag2id = {'O': 0, 'LOC': 1, 'PER': 2, 'MISC': 3, 'ORG': 4}

  # Group IOB-formatted lines into sentences
  sentences = []
  sentence = []
  for line in lines:
    line = line.strip()
    if line:
      token, tag = line.split()
      sentence.append((token, tag))
    else:
      sentences.append(sentence)
      sentence = []
  if sentence:
    sentences.append(sentence)

  # Merge tokens and NER tags for each sentence
  tokens = []
  ner_tags = []
  for sentence in sentences:
    sentence_tokens, sentence_tags = zip(*sentence)
    tokens.append(' '.join(sentence_tokens))
    # Remove IOB tag prefixes for camembert-ner
    ner_tags.append([tag2id[tag.replace('B-', '').replace('I-', '')] for tag in sentence_tags])

  # Create a dictionary
  dataset_dict = {"id": list(range(len(tokens))),
                  "tokens": tokens,
                  "ner_tags": ner_tags}

  # Return the dataset as a Hugging Face Dataset object
  return Dataset.from_dict(dataset_dict)



def tokenize_and_align_labels(examples):
  '''
  Function to align labels with token ids
  '''
  label_all_tokens = True
  tokenized_inputs = tokenizer(examples["tokens"], 
                               truncation=True)

  labels = []
  for i, label in enumerate(examples["ner_tags"]):
    word_ids = tokenized_inputs.word_ids(batch_index=i)
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
      # Set labels of special tokens to -100 (index ignored by PyTorch)
      if word_idx is None:
        label_ids.append(-100)
      # Set label for the first token of each word
      elif word_idx != previous_word_idx:
        label_ids.append(label[word_idx])
      # Set the label to either the current label or -100
      else:
        label_ids.append(label[word_idx] if label_all_tokens else -100)
      previous_word_idx = word_idx

    labels.append(label_ids)

  tokenized_inputs["labels"] = labels
  return tokenized_inputs



def compute_metrics(p):
  '''
  Function to compute metrics on predictions
  '''
  predictions, labels = p
  predictions = np.argmax(predictions, axis=2)

  # camembert-ner needs tags without prefixes but seqeval needs it so we add "I-"
  label_list = ['O', 'I-LOC', 'I-PER', 'I-MISC', 'I-ORG']

  # Remove ignored index (special tokens)
  true_predictions = [
      [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, labels)]
  true_labels = [
      [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, labels)]

  results = seqeval.compute(predictions=true_predictions, references=true_labels)
  return results

## Load files

In [5]:
#Import file
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving data.txt to data.txt
User uploaded file "data.txt" with length 737193 bytes


In [6]:
# Load data
with open("/content/data.txt", "r", encoding="utf-8") as f:
    data_lines = f.readlines()

## Preprocessing

In [7]:
# Convert IOB formatted file into the format required
data = iob_to_dataset(data_lines, "data")

# Create a DatasetDict object
dataset = DatasetDict({"data": data})

# Split data into train, valid and test sets
ds_train_devtest = dataset["data"].train_test_split(test_size=0.4, train_size=0.6, seed=7)
ds_devtest = ds_train_devtest["test"].train_test_split(test_size=0.5, seed=7)

datasets = DatasetDict({"train": ds_train_devtest["train"], # 60%
                        "valid": ds_devtest["train"], # 20%
                        "test": ds_devtest["test"]}) # 20%

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")

# Tokenize and align labels of train, validation and test sets
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
tokenized_datasets

Downloading (…)okenizer_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

Map:   0%|          | 0/6518 [00:00<?, ? examples/s]

Map:   0%|          | 0/2173 [00:00<?, ? examples/s]

Map:   0%|          | 0/2173 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6518
    })
    valid: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2173
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2173
    })
})

## Fine-tuning the model

In [8]:
# Label list
label_list = ['O', 'LOC', 'PER', 'MISC', 'ORG'] # {'O': 0, 'LOC': 1, 'PER': 2, 'MISC': 3, 'ORG': 4}

# Load model
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner", num_labels=5)

# Define training arguments
args = TrainingArguments("camembert-ner-finetuned-jul",
                         evaluation_strategy="epoch",
                         learning_rate=2e-5,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=16,
                         num_train_epochs=10,
                         weight_decay=0.01,
                         push_to_hub=True)

# Batch processed examples together while applying padding to make them the same size
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load seqeval metric commonly used to evaluate results on CONLL
seqeval = evaluate.load('seqeval')

# Load trainer
trainer = Trainer(model,
                  args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["valid"],
                  data_collator=data_collator,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics,
                  callbacks=[TensorBoardCallback()])

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

You are adding a <class 'transformers.integrations.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
Cloning https://huggingface.co/fgiauna/camembert-ner-finetuned-jul into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/420M [00:00<?, ?B/s]

Download file runs/May11_11-17-41_e215b90831c6/events.out.tfevents.1683803871.e215b90831c6.13012.2: 100%|#####…

Download file runs/May12_14-47-21_5e0a43f0d62d/events.out.tfevents.1683902866.5e0a43f0d62d.180.5: 100%|#######…

Download file runs/Apr28_22-13-21_1bba788fe700/events.out.tfevents.1682720017.1bba788fe700.545.0: 100%|#######…

Download file sentencepiece.bpe.model:   4%|4         | 32.0k/792k [00:00<?, ?B/s]

Download file runs/May11_10-35-48_e215b90831c6/events.out.tfevents.1683801496.e215b90831c6.13012.0: 100%|#####…

Download file runs/May12_14-47-21_5e0a43f0d62d/events.out.tfevents.1683902866.5e0a43f0d62d.180.7: 100%|#######…

Clean file runs/May11_11-17-41_e215b90831c6/events.out.tfevents.1683803871.e215b90831c6.13012.2:   6%|6       …

Clean file runs/May12_14-47-21_5e0a43f0d62d/events.out.tfevents.1683902866.5e0a43f0d62d.180.5:   9%|9         …

Clean file runs/Apr28_22-13-21_1bba788fe700/events.out.tfevents.1682720017.1bba788fe700.545.0:   8%|7         …

Download file runs/May12_18-40-03_7c6169eea982/events.out.tfevents.1683916908.7c6169eea982.842.2: 100%|#######…

Clean file runs/May11_10-35-48_e215b90831c6/events.out.tfevents.1683801496.e215b90831c6.13012.0:   9%|9       …

Clean file runs/May12_14-47-21_5e0a43f0d62d/events.out.tfevents.1683902866.5e0a43f0d62d.180.7:   9%|9         …

Clean file runs/May12_18-40-03_7c6169eea982/events.out.tfevents.1683916908.7c6169eea982.842.2:   9%|9         …

Download file runs/May12_18-40-03_7c6169eea982/events.out.tfevents.1683916908.7c6169eea982.842.0: 100%|#######…

Clean file runs/May12_18-40-03_7c6169eea982/events.out.tfevents.1683916908.7c6169eea982.842.0:   9%|9         …

Download file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683993500.d960191a09db.165.2: 100%|#######…

Clean file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683993500.d960191a09db.165.2:   9%|9         …

Download file runs/May12_14-25-23_5e0a43f0d62d/events.out.tfevents.1683901532.5e0a43f0d62d.180.3: 100%|#######…

Download file runs/May01_10-58-12_d5bb5466d814/events.out.tfevents.1682938701.d5bb5466d814.222.2: 100%|#######…

Clean file runs/May12_14-25-23_5e0a43f0d62d/events.out.tfevents.1683901532.5e0a43f0d62d.180.3:  11%|#1        …

Clean file runs/May01_10-58-12_d5bb5466d814/events.out.tfevents.1682938701.d5bb5466d814.222.2:  12%|#1        …

Clean file sentencepiece.bpe.model:   0%|          | 1.00k/792k [00:00<?, ?B/s]

Download file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683993500.d960191a09db.165.0: 100%|#######…

Download file runs/Apr29_15-52-32_abe5a82602c7/events.out.tfevents.1682783558.abe5a82602c7.1059.3: 100%|######…

Download file runs/May05_16-28-00_11b82e86c118/events.out.tfevents.1683304211.11b82e86c118.2289.0:  79%|######…

Download file runs/Apr28_22-40-32_1bba788fe700/events.out.tfevents.1682721635.1bba788fe700.545.6: 100%|#######…

Download file runs/Apr29_16-06-15_abe5a82602c7/events.out.tfevents.1682784407.abe5a82602c7.1059.6: 100%|######…

Download file runs/May12_14-13-47_5e0a43f0d62d/1683900928.4358268/events.out.tfevents.1683900928.5e0a43f0d62d.…

Download file runs/May01_10-33-56_d5bb5466d814/events.out.tfevents.1682937356.d5bb5466d814.222.0: 100%|#######…

Download file runs/Apr29_13-51-01_abe5a82602c7/events.out.tfevents.1682776379.abe5a82602c7.1059.0: 100%|######…

Download file runs/May12_14-13-47_5e0a43f0d62d/events.out.tfevents.1683900928.5e0a43f0d62d.180.0: 100%|#######…

Clean file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683993500.d960191a09db.165.0:   9%|9         …

Clean file runs/Apr29_15-52-32_abe5a82602c7/events.out.tfevents.1682783558.abe5a82602c7.1059.3:  12%|#1       …

Clean file runs/May05_16-28-00_11b82e86c118/events.out.tfevents.1683304211.11b82e86c118.2289.0:  13%|#2       …

Download file runs/May13_15-56-31_d960191a09db/1683993500.0533977/events.out.tfevents.1683993500.d960191a09db.…

Download file runs/May11_11-17-41_e215b90831c6/1683803871.1820416/events.out.tfevents.1683803871.e215b90831c6.…

Clean file runs/Apr28_22-40-32_1bba788fe700/events.out.tfevents.1682721635.1bba788fe700.545.6:  15%|#5        …

Download file runs/May12_14-25-23_5e0a43f0d62d/1683901532.4341226/events.out.tfevents.1683901532.5e0a43f0d62d.…

Clean file runs/Apr29_16-06-15_abe5a82602c7/events.out.tfevents.1682784407.abe5a82602c7.1059.6:  15%|#5       …

Clean file runs/May12_14-13-47_5e0a43f0d62d/1683900928.4358268/events.out.tfevents.1683900928.5e0a43f0d62d.180…

Clean file runs/May01_10-33-56_d5bb5466d814/events.out.tfevents.1682937356.d5bb5466d814.222.0:  15%|#5        …

Download file runs/May13_15-56-31_d960191a09db/1683993500.0429025/events.out.tfevents.1683993500.d960191a09db.…

Download file runs/May12_14-47-21_5e0a43f0d62d/1683902866.978648/events.out.tfevents.1683902866.5e0a43f0d62d.1…

Clean file runs/May12_14-13-47_5e0a43f0d62d/events.out.tfevents.1683900928.5e0a43f0d62d.180.0:  16%|#5        …

Clean file runs/Apr29_13-51-01_abe5a82602c7/events.out.tfevents.1682776379.abe5a82602c7.1059.0:  15%|#5       …

Download file runs/May11_10-35-48_e215b90831c6/1683801496.7828324/events.out.tfevents.1683801496.e215b90831c6.…

Clean file runs/May13_15-56-31_d960191a09db/1683993500.0533977/events.out.tfevents.1683993500.d960191a09db.165…

Clean file runs/May11_11-17-41_e215b90831c6/1683803871.1820416/events.out.tfevents.1683803871.e215b90831c6.130…

Download file runs/May12_14-47-21_5e0a43f0d62d/1683902866.985708/events.out.tfevents.1683902866.5e0a43f0d62d.1…

Clean file runs/May12_14-25-23_5e0a43f0d62d/1683901532.4341226/events.out.tfevents.1683901532.5e0a43f0d62d.180…

Download file runs/May12_18-40-03_7c6169eea982/1683916908.590043/events.out.tfevents.1683916908.7c6169eea982.8…

Clean file runs/May13_15-56-31_d960191a09db/1683993500.0429025/events.out.tfevents.1683993500.d960191a09db.165…

Download file runs/May12_18-40-03_7c6169eea982/1683916908.5949407/events.out.tfevents.1683916908.7c6169eea982.…

Clean file runs/May12_14-47-21_5e0a43f0d62d/1683902866.978648/events.out.tfevents.1683902866.5e0a43f0d62d.180.…

Clean file runs/May11_10-35-48_e215b90831c6/1683801496.7828324/events.out.tfevents.1683801496.e215b90831c6.130…

Download file runs/May01_10-58-12_d5bb5466d814/1682938911.3119824/events.out.tfevents.1682938911.d5bb5466d814.…

Download file runs/May05_16-28-00_11b82e86c118/1683304211.304093/events.out.tfevents.1683304211.11b82e86c118.2…

Clean file runs/May12_14-47-21_5e0a43f0d62d/1683902866.985708/events.out.tfevents.1683902866.5e0a43f0d62d.180.…

Clean file runs/May12_18-40-03_7c6169eea982/1683916908.590043/events.out.tfevents.1683916908.7c6169eea982.842.…

Download file runs/May02_09-10-52_293fcc77bf7b/1683018755.0174544/events.out.tfevents.1683018755.293fcc77bf7b.…

Clean file runs/May12_18-40-03_7c6169eea982/1683916908.5949407/events.out.tfevents.1683916908.7c6169eea982.842…

Clean file runs/May01_10-58-12_d5bb5466d814/1682938911.3119824/events.out.tfevents.1682938911.d5bb5466d814.222…

Download file runs/May01_11-05-14_d5bb5466d814/1682939130.4504569/events.out.tfevents.1682939130.d5bb5466d814.…

Clean file runs/May05_16-28-00_11b82e86c118/1683304211.304093/events.out.tfevents.1683304211.11b82e86c118.2289…

Download file runs/May01_10-58-12_d5bb5466d814/1682938701.8057494/events.out.tfevents.1682938701.d5bb5466d814.…

Download file runs/Apr29_16-06-15_abe5a82602c7/1682784407.0589192/events.out.tfevents.1682784407.abe5a82602c7.…

Download file runs/May01_10-33-56_d5bb5466d814/1682937356.9049087/events.out.tfevents.1682937356.d5bb5466d814.…

Clean file runs/May02_09-10-52_293fcc77bf7b/1683018755.0174544/events.out.tfevents.1683018755.293fcc77bf7b.336…

Clean file runs/May01_11-05-14_d5bb5466d814/1682939130.4504569/events.out.tfevents.1682939130.d5bb5466d814.222…

Clean file runs/May01_10-58-12_d5bb5466d814/1682938701.8057494/events.out.tfevents.1682938701.d5bb5466d814.222…

Clean file runs/Apr29_16-06-15_abe5a82602c7/1682784407.0589192/events.out.tfevents.1682784407.abe5a82602c7.105…

Download file runs/Apr29_15-52-32_abe5a82602c7/1682783558.845829/events.out.tfevents.1682783558.abe5a82602c7.1…

Clean file runs/May01_10-33-56_d5bb5466d814/1682937356.9049087/events.out.tfevents.1682937356.d5bb5466d814.222…

Download file runs/Apr29_15-52-32_abe5a82602c7/1682783718.2118711/events.out.tfevents.1682783718.abe5a82602c7.…

Clean file runs/Apr29_15-52-32_abe5a82602c7/1682783558.845829/events.out.tfevents.1682783558.abe5a82602c7.1059…

Clean file runs/Apr29_15-52-32_abe5a82602c7/1682783718.2118711/events.out.tfevents.1682783718.abe5a82602c7.105…

Download file runs/Apr29_13-51-01_abe5a82602c7/1682776379.3457928/events.out.tfevents.1682776379.abe5a82602c7.…

Download file runs/Apr28_22-13-21_1bba788fe700/1682720681.9842465/events.out.tfevents.1682720681.1bba788fe700.…

Clean file runs/Apr29_13-51-01_abe5a82602c7/1682776379.3457928/events.out.tfevents.1682776379.abe5a82602c7.105…

Download file runs/Apr28_22-13-21_1bba788fe700/1682720017.299209/events.out.tfevents.1682720017.1bba788fe700.5…

Download file runs/Apr28_22-13-21_1bba788fe700/1682720574.042182/events.out.tfevents.1682720574.1bba788fe700.5…

Clean file runs/Apr28_22-13-21_1bba788fe700/1682720681.9842465/events.out.tfevents.1682720681.1bba788fe700.545…

Download file runs/Apr28_22-34-33_1bba788fe700/1682721280.619618/events.out.tfevents.1682721280.1bba788fe700.5…

Clean file runs/Apr28_22-13-21_1bba788fe700/1682720574.042182/events.out.tfevents.1682720574.1bba788fe700.545.…

Clean file runs/Apr28_22-13-21_1bba788fe700/1682720017.299209/events.out.tfevents.1682720017.1bba788fe700.545.…

Clean file runs/Apr28_22-34-33_1bba788fe700/1682721280.619618/events.out.tfevents.1682721280.1bba788fe700.545.…

Download file runs/Apr28_22-40-32_1bba788fe700/1682721635.780635/events.out.tfevents.1682721635.1bba788fe700.5…

Clean file runs/Apr28_22-40-32_1bba788fe700/1682721635.780635/events.out.tfevents.1682721635.1bba788fe700.545.…

Download file runs/May02_09-10-52_293fcc77bf7b/events.out.tfevents.1683018755.293fcc77bf7b.336.0: 100%|#######…

Clean file runs/May02_09-10-52_293fcc77bf7b/events.out.tfevents.1683018755.293fcc77bf7b.336.0:  18%|#7        …

Download file runs/May01_11-05-14_d5bb5466d814/events.out.tfevents.1682939130.d5bb5466d814.222.5: 100%|#######…

Clean file runs/May01_11-05-14_d5bb5466d814/events.out.tfevents.1682939130.d5bb5466d814.222.5:  18%|#7        …

Download file runs/May12_14-47-21_5e0a43f0d62d/events.out.tfevents.1683903799.5e0a43f0d62d.180.9: 100%|#######…

Download file runs/May12_14-47-21_5e0a43f0d62d/events.out.tfevents.1683903799.5e0a43f0d62d.180.10: 100%|######…

Clean file runs/May12_14-47-21_5e0a43f0d62d/events.out.tfevents.1683903799.5e0a43f0d62d.180.9:  93%|#########3…

Download file training_args.bin: 100%|##########| 3.81k/3.81k [00:00<?, ?B/s]

Download file runs/Apr28_22-34-33_1bba788fe700/events.out.tfevents.1682721280.1bba788fe700.545.4: 100%|#######…

Clean file runs/May12_14-47-21_5e0a43f0d62d/events.out.tfevents.1683903799.5e0a43f0d62d.180.10:  93%|#########…

Clean file training_args.bin:  26%|##6       | 1.00k/3.81k [00:00<?, ?B/s]

Clean file runs/Apr28_22-34-33_1bba788fe700/events.out.tfevents.1682721280.1bba788fe700.545.4:  23%|##2       …

Download file runs/May11_11-17-41_e215b90831c6/events.out.tfevents.1683807735.e215b90831c6.13012.4: 100%|#####…

Download file runs/May05_16-28-00_11b82e86c118/events.out.tfevents.1683305381.11b82e86c118.2289.2: 100%|######…

Clean file runs/May11_11-17-41_e215b90831c6/events.out.tfevents.1683807735.e215b90831c6.13012.4: 100%|########…

Clean file runs/May05_16-28-00_11b82e86c118/events.out.tfevents.1683305381.11b82e86c118.2289.2: 100%|#########…

Download file runs/May12_18-40-03_7c6169eea982/events.out.tfevents.1683917901.7c6169eea982.842.4: 100%|#######…

Clean file runs/May12_18-40-03_7c6169eea982/events.out.tfevents.1683917901.7c6169eea982.842.4: 100%|##########…

Download file runs/May12_18-40-03_7c6169eea982/events.out.tfevents.1683917901.7c6169eea982.842.5: 100%|#######…

Clean file runs/May12_18-40-03_7c6169eea982/events.out.tfevents.1683917901.7c6169eea982.842.5: 100%|##########…

Download file runs/Apr29_13-51-01_abe5a82602c7/events.out.tfevents.1682776748.abe5a82602c7.1059.2: 100%|######…

Clean file runs/Apr29_13-51-01_abe5a82602c7/events.out.tfevents.1682776748.abe5a82602c7.1059.2: 100%|#########…

Download file runs/Apr28_22-40-32_1bba788fe700/events.out.tfevents.1682722233.1bba788fe700.545.8: 100%|#######…

Clean file runs/Apr28_22-40-32_1bba788fe700/events.out.tfevents.1682722233.1bba788fe700.545.8: 100%|##########…

Download file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683994363.d960191a09db.165.4: 100%|#######…

Clean file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683994363.d960191a09db.165.4: 100%|##########…

Download file runs/May12_14-13-47_5e0a43f0d62d/events.out.tfevents.1683901264.5e0a43f0d62d.180.2: 100%|#######…

Clean file runs/May12_14-13-47_5e0a43f0d62d/events.out.tfevents.1683901264.5e0a43f0d62d.180.2: 100%|##########…

Download file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683994363.d960191a09db.165.5: 100%|#######…

Clean file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683994363.d960191a09db.165.5: 100%|##########…

Download file runs/Apr29_16-06-15_abe5a82602c7/events.out.tfevents.1682784794.abe5a82602c7.1059.8: 100%|######…

Clean file runs/Apr29_16-06-15_abe5a82602c7/events.out.tfevents.1682784794.abe5a82602c7.1059.8: 100%|#########…

Clean file pytorch_model.bin:   0%|          | 1.00k/420M [00:00<?, ?B/s]

In [9]:
# Train 
trainer.train()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Loc,Misc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.060167,"{'precision': 0.6894977168949772, 'recall': 0.7330097087378641, 'f1': 0.7105882352941175, 'number': 206}","{'precision': 0.8461538461538461, 'recall': 0.2972972972972973, 'f1': 0.44000000000000006, 'number': 37}","{'precision': 0.7472527472527473, 'recall': 0.7472527472527473, 'f1': 0.7472527472527473, 'number': 182}","{'precision': 0.8195876288659794, 'recall': 0.7571428571428571, 'f1': 0.787128712871287, 'number': 210}",0.751645,0.719685,0.735318,0.983015
2,0.090300,0.056796,"{'precision': 0.776255707762557, 'recall': 0.8252427184466019, 'f1': 0.7999999999999999, 'number': 206}","{'precision': 0.5217391304347826, 'recall': 0.32432432432432434, 'f1': 0.4, 'number': 37}","{'precision': 0.7731958762886598, 'recall': 0.8241758241758241, 'f1': 0.7978723404255318, 'number': 182}","{'precision': 0.822429906542056, 'recall': 0.8380952380952381, 'f1': 0.830188679245283, 'number': 210}",0.781538,0.8,0.790661,0.984508
3,0.035700,0.063118,"{'precision': 0.7339055793991416, 'recall': 0.8300970873786407, 'f1': 0.7790432801822323, 'number': 206}","{'precision': 0.6363636363636364, 'recall': 0.3783783783783784, 'f1': 0.4745762711864407, 'number': 37}","{'precision': 0.7969543147208121, 'recall': 0.8626373626373627, 'f1': 0.8284960422163589, 'number': 182}","{'precision': 0.8317757009345794, 'recall': 0.8476190476190476, 'f1': 0.839622641509434, 'number': 210}",0.780781,0.818898,0.799385,0.985061
4,0.020100,0.076075,"{'precision': 0.772093023255814, 'recall': 0.8058252427184466, 'f1': 0.7885985748218527, 'number': 206}","{'precision': 0.6, 'recall': 0.32432432432432434, 'f1': 0.4210526315789474, 'number': 37}","{'precision': 0.8263157894736842, 'recall': 0.8626373626373627, 'f1': 0.8440860215053764, 'number': 182}","{'precision': 0.8293838862559242, 'recall': 0.8333333333333334, 'f1': 0.8313539192399049, 'number': 210}",0.801887,0.80315,0.802518,0.984573
5,0.011300,0.074478,"{'precision': 0.7477876106194691, 'recall': 0.8203883495145631, 'f1': 0.7824074074074074, 'number': 206}","{'precision': 0.4666666666666667, 'recall': 0.3783783783783784, 'f1': 0.417910447761194, 'number': 37}","{'precision': 0.8229166666666666, 'recall': 0.8681318681318682, 'f1': 0.8449197860962567, 'number': 182}","{'precision': 0.8388625592417062, 'recall': 0.8428571428571429, 'f1': 0.840855106888361, 'number': 210}",0.786039,0.815748,0.800618,0.984931
6,0.011300,0.081522,"{'precision': 0.7654867256637168, 'recall': 0.8398058252427184, 'f1': 0.8009259259259259, 'number': 206}","{'precision': 0.43333333333333335, 'recall': 0.35135135135135137, 'f1': 0.3880597014925374, 'number': 37}","{'precision': 0.8253968253968254, 'recall': 0.8571428571428571, 'f1': 0.8409703504043127, 'number': 182}","{'precision': 0.8673469387755102, 'recall': 0.8095238095238095, 'f1': 0.8374384236453202, 'number': 210}",0.798752,0.806299,0.802508,0.984444
7,0.008500,0.085035,"{'precision': 0.7579908675799086, 'recall': 0.8058252427184466, 'f1': 0.7811764705882352, 'number': 206}","{'precision': 0.5416666666666666, 'recall': 0.35135135135135137, 'f1': 0.4262295081967213, 'number': 37}","{'precision': 0.828125, 'recall': 0.8736263736263736, 'f1': 0.8502673796791443, 'number': 182}","{'precision': 0.8805970149253731, 'recall': 0.8428571428571429, 'f1': 0.8613138686131387, 'number': 210}",0.809748,0.811024,0.810386,0.985288
8,0.004500,0.084552,"{'precision': 0.7321428571428571, 'recall': 0.7961165048543689, 'f1': 0.7627906976744185, 'number': 206}","{'precision': 0.4642857142857143, 'recall': 0.35135135135135137, 'f1': 0.39999999999999997, 'number': 37}","{'precision': 0.8172043010752689, 'recall': 0.8351648351648352, 'f1': 0.8260869565217392, 'number': 182}","{'precision': 0.8756218905472637, 'recall': 0.8380952380952381, 'f1': 0.856447688564477, 'number': 210}",0.790297,0.795276,0.792779,0.984703
9,0.004400,0.084464,"{'precision': 0.7614678899082569, 'recall': 0.8058252427184466, 'f1': 0.7830188679245284, 'number': 206}","{'precision': 0.48148148148148145, 'recall': 0.35135135135135137, 'f1': 0.40625, 'number': 37}","{'precision': 0.8297872340425532, 'recall': 0.8571428571428571, 'f1': 0.8432432432432433, 'number': 182}","{'precision': 0.8811881188118812, 'recall': 0.8476190476190476, 'f1': 0.8640776699029127, 'number': 210}",0.807874,0.807874,0.807874,0.985353
10,0.003100,0.085475,"{'precision': 0.7568807339449541, 'recall': 0.8009708737864077, 'f1': 0.7783018867924528, 'number': 206}","{'precision': 0.4482758620689655, 'recall': 0.35135135135135137, 'f1': 0.393939393939394, 'number': 37}","{'precision': 0.8297872340425532, 'recall': 0.8571428571428571, 'f1': 0.8432432432432433, 'number': 182}","{'precision': 0.8855721393034826, 'recall': 0.8476190476190476, 'f1': 0.8661800486618005, 'number': 210}",0.805031,0.806299,0.805665,0.98519


Trainer is attempting to log a value of "{'precision': 0.6894977168949772, 'recall': 0.7330097087378641, 'f1': 0.7105882352941175, 'number': 206}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8461538461538461, 'recall': 0.2972972972972973, 'f1': 0.44000000000000006, 'number': 37}" of type <class 'dict'> for key "eval/MISC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7472527472527473, 'recall': 0.7472527472527473, 'f1': 0.7472527472527473, 'number': 182}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8195876288659794, 'recall': 0.7571428571428571, 

TrainOutput(global_step=4080, training_loss=0.021873219683766366, metrics={'train_runtime': 836.0618, 'train_samples_per_second': 77.961, 'train_steps_per_second': 4.88, 'total_flos': 941158264577700.0, 'train_loss': 0.021873219683766366, 'epoch': 10.0})

In [10]:
# Evaluate
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

Trainer is attempting to log a value of "{'precision': 0.7309417040358744, 'recall': 0.7546296296296297, 'f1': 0.7425968109339408, 'number': 216}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.5862068965517241, 'recall': 0.425, 'f1': 0.4927536231884058, 'number': 40}" of type <class 'dict'> for key "eval/MISC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8333333333333334, 'recall': 0.825, 'f1': 0.8291457286432161, 'number': 200}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7823834196891192, 'recall': 0.7704081632653061, 'f1': 0.776349614395887, 'n

{'eval_loss': 0.11765629053115845,
 'eval_LOC': {'precision': 0.7309417040358744,
  'recall': 0.7546296296296297,
  'f1': 0.7425968109339408,
  'number': 216},
 'eval_MISC': {'precision': 0.5862068965517241,
  'recall': 0.425,
  'f1': 0.4927536231884058,
  'number': 40},
 'eval_ORG': {'precision': 0.8333333333333334,
  'recall': 0.825,
  'f1': 0.8291457286432161,
  'number': 200},
 'eval_PER': {'precision': 0.7823834196891192,
  'recall': 0.7704081632653061,
  'f1': 0.776349614395887,
  'number': 196},
 'eval_overall_precision': 0.7713841368584758,
 'eval_overall_recall': 0.7607361963190185,
 'eval_overall_f1': 0.766023166023166,
 'eval_overall_accuracy': 0.9811655432644242,
 'eval_runtime': 4.3604,
 'eval_samples_per_second': 498.352,
 'eval_steps_per_second': 31.19,
 'epoch': 10.0}

In [None]:
# Upload the result of the training to HuggingFace Hub
trainer.push_to_hub(commit_message="Training complete")

Upload file pytorch_model.bin:   0%|          | 1.00/420M [00:00<?, ?B/s]

Upload file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683993500.d960191a09db.165.0:   0%|         …

Upload file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683994363.d960191a09db.165.4:   0%|         …

Upload file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683994363.d960191a09db.165.5:   0%|         …

Upload file runs/May13_15-56-31_d960191a09db/events.out.tfevents.1683993500.d960191a09db.165.2:   0%|         …

To https://huggingface.co/fgiauna/camembert-ner-finetuned-jul
   3622e40..d1f5eba  main -> main

   3622e40..d1f5eba  main -> main

To https://huggingface.co/fgiauna/camembert-ner-finetuned-jul
   d1f5eba..2ed8887  main -> main

   d1f5eba..2ed8887  main -> main



'https://huggingface.co/fgiauna/camembert-ner-finetuned-jul/commit/d1f5ebaa55c2f3117cc958bd4cd4b98ae7ac8982'