# Evaluating Camembert-ner on JuL lyrics

---



## Installation

In [None]:
! pip install datasets transformers accelerate evaluate seqeval # HuggingFace 🤗
! pip install sentencepiece # Required for Camembert-ner (slow tokenizer)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m789.4 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━

## Libraries

In [None]:
import numpy as np
from datasets import DatasetDict, Dataset
import transformers
from transformers import (AutoTokenizer, 
                          AutoModelForTokenClassification, 
                          Trainer, 
                          DataCollatorForTokenClassification,
                          pipeline)
import evaluate

## Functions

In [None]:
def iob_to_dataset(lines, split):
  '''
  Convert the lines of an IOB-formatted text file into a Hugging Face
  Dataset laid out the way the rest of this notebook feeds camembert-ner.

  Every non-blank line must contain exactly two whitespace-separated
  fields, "TOKEN TAG". Blank lines separate sentences; leading, trailing
  or repeated blank lines are ignored so they never yield empty sentences.

  Args:
    lines: iterable of strings, one per line of the file.
    split: not used by this function; kept so existing calls keep working.

  Returns:
    The object produced by Dataset.from_dict with three columns:
      "id":       running sentence index, starting at 0
      "tokens":   the sentence as a single space-joined string
      "ner_tags": one integer label per token (see tag2id below)

  Raises:
    ValueError: if a non-blank line does not have exactly two fields.
    KeyError: if a tag, once "B-" / "I-" is removed, is not in tag2id.
  '''
  # Define tag to ID mapping
  tag2id = {'O': 0, 'LOC': 1, 'PER': 2, 'MISC': 3, 'ORG': 4}

  # Group IOB-formatted lines into sentences
  sentences = []
  sentence = []
  for line_number, line in enumerate(lines, start=1):
    fields = line.split()
    if not fields:
      # Sentence boundary. Flush only when something was collected:
      # an empty sentence would make zip(*sentence) below fail.
      if sentence:
        sentences.append(sentence)
        sentence = []
      continue
    if len(fields) != 2:
      raise ValueError(
          "Line {}: expected 'TOKEN TAG', got {!r}".format(line_number, line))
    sentence.append(tuple(fields))
  # The file may not end with a blank line
  if sentence:
    sentences.append(sentence)

  # Merge tokens and NER tags for each sentence
  tokens = []
  ner_tags = []
  for sentence in sentences:
    sentence_tokens, sentence_tags = zip(*sentence)
    tokens.append(' '.join(sentence_tokens))
    # Remove IOB tag prefixes for camembert-ner
    ner_tags.append([tag2id[tag.replace('B-', '').replace('I-', '')] for tag in sentence_tags])

  # Create a dictionary
  dataset_dict = {"id": list(range(len(tokens))),
                  "tokens": tokens,
                  "ner_tags": ner_tags}

  # Return the dataset as a Hugging Face Dataset object
  return Dataset.from_dict(dataset_dict)



def tokenize_and_align_labels(examples, label_all_tokens=True):
  '''
  Tokenize a batch of sentences and align the word-level NER labels
  with the sub-word tokens produced by the tokenizer.

  Relies on the module-level `tokenizer`, whose output must expose
  `word_ids(batch_index=...)`.

  Args:
    examples: batch with a "tokens" entry (handed unchanged to the
      tokenizer) and a "ner_tags" entry (one sequence of label ids per
      example, indexed by word id).
    label_all_tokens: when True (the default, and the value that used to
      be hard-coded) every sub-word token of a word receives the word's
      label; when False only the first sub-word is labelled and the
      following ones get -100.

  Returns:
    The tokenizer output with an extra "labels" entry holding one list of
    label ids per example.
  '''
  tokenized_inputs = tokenizer(examples["tokens"], 
                               truncation=True)

  labels = []
  for i, label in enumerate(examples["ner_tags"]):
    word_ids = tokenized_inputs.word_ids(batch_index=i)
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
      # Set labels of special tokens to -100 (index ignored by PyTorch)
      if word_idx is None:
        label_ids.append(-100)
      # First token of a word, or any token when every token is labelled
      elif word_idx != previous_word_idx or label_all_tokens:
        label_ids.append(label[word_idx])
      # Continuation token that must be ignored
      else:
        label_ids.append(-100)
      previous_word_idx = word_idx

    labels.append(label_ids)

  tokenized_inputs["labels"] = labels
  return tokenized_inputs



def compute_metrics(p):
  '''
  Score token-level predictions with the module-level `seqeval` metric.

  `p` unpacks into (scores, label ids). The arg-max over axis 2 of the
  scores gives the predicted class id of every token; positions whose
  label id is -100 are dropped before scoring.
  '''
  scores, gold_ids = p
  predicted_ids = np.argmax(scores, axis=2)

  # camembert-ner needs tags without prefixes but seqeval needs it so we add "I-"
  label_list = ['O', 'I-LOC', 'I-PER', 'I-MISC', 'I-ORG']

  # Walk predictions and labels in lockstep, skipping ignored positions (-100)
  true_predictions = []
  true_labels = []
  for predicted_row, gold_row in zip(predicted_ids, gold_ids):
    kept_predictions = []
    kept_labels = []
    for predicted_id, gold_id in zip(predicted_row, gold_row):
      if gold_id == -100:
        continue
      kept_predictions.append(label_list[predicted_id])
      kept_labels.append(label_list[gold_id])
    true_predictions.append(kept_predictions)
    true_labels.append(kept_labels)

  return seqeval.compute(predictions=true_predictions, references=true_labels)

## Load files

In [None]:
# Upload the data file from the local machine (Google Colab only)
from google.colab import files

uploaded = files.upload()

# Report the name and size of every uploaded file
for fn, content in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content)))

Saving data.txt to data.txt
User uploaded file "data.txt" with length 737193 bytes


In [None]:
# Read the uploaded file into a list with one string per line
with open("/content/data.txt", mode="r", encoding="utf-8") as iob_file:
    data_lines = list(iob_file)

## Preprocessing

In [None]:
# Parse the raw IOB lines into a Hugging Face Dataset
data = iob_to_dataset(data_lines, "data")

# Wrap the dataset in a DatasetDict under the key "data"
dataset = DatasetDict({"data": data})

# First cut: 60% train / 40% held out. Second cut: halve the held-out part.
ds_train_devtest = dataset["data"].train_test_split(train_size=0.6, test_size=0.4, seed=7)
ds_devtest = ds_train_devtest["test"].train_test_split(test_size=0.5, seed=7)

datasets = DatasetDict({
    "train": ds_train_devtest["train"],  # 60%
    "valid": ds_devtest["train"],        # 20%
    "test": ds_devtest["test"],          # 20%
})

# Tokenizer shipped with the camembert-ner checkpoint
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")

# Tokenize every split and attach the aligned "labels" column
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
tokenized_datasets

Downloading (…)okenizer_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

Map:   0%|          | 0/6518 [00:00<?, ? examples/s]

Map:   0%|          | 0/2173 [00:00<?, ? examples/s]

Map:   0%|          | 0/2173 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 6518
    })
    valid: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2173
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2173
    })
})

## Evaluating the model

In [None]:
# Class names in id order: {'O': 0, 'LOC': 1, 'PER': 2, 'MISC': 3, 'ORG': 4}
label_list = ['O', 'LOC', 'PER', 'MISC', 'ORG']

# Load the camembert-ner checkpoint with a 5-class token-classification head
model = AutoModelForTokenClassification.from_pretrained(
    "Jean-Baptiste/camembert-ner",
    num_labels=5)

# Collator that batches examples together and pads them to the same size
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# seqeval metric, commonly used to evaluate results on CONLL; read by compute_metrics
seqeval = evaluate.load('seqeval')

# The Trainer is only used here to run evaluation on the test split
trainer = Trainer(model=model,
                  data_collator=data_collator,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["test"],
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

# Evaluate
trainer.evaluate()

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Trainer is attempting to log a value of "{'precision': 0.30845771144278605, 'recall': 0.28703703703703703, 'f1': 0.2973621103117506, 'number': 216}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.027377521613832854, 'recall': 0.475, 'f1': 0.051771117166212535, 'number': 40}" of type <class 'dict'> for key "eval/MISC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.4044943820224719, 'recall': 0.18, 'f1': 0.2491349480968858, 'number': 200}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.3169811320754717, 'recall': 0.42857142857142855, 'f1': 0.3644251626898

{'eval_loss': 0.4447633922100067,
 'eval_LOC': {'precision': 0.30845771144278605,
  'recall': 0.28703703703703703,
  'f1': 0.2973621103117506,
  'number': 216},
 'eval_MISC': {'precision': 0.027377521613832854,
  'recall': 0.475,
  'f1': 0.051771117166212535,
  'number': 40},
 'eval_ORG': {'precision': 0.4044943820224719,
  'recall': 0.18,
  'f1': 0.2491349480968858,
  'number': 200},
 'eval_PER': {'precision': 0.3169811320754717,
  'recall': 0.42857142857142855,
  'f1': 0.3644251626898048,
  'number': 196},
 'eval_overall_precision': 0.16092874299439552,
 'eval_overall_recall': 0.30828220858895705,
 'eval_overall_f1': 0.21146764860599684,
 'eval_overall_accuracy': 0.9064082303995872,
 'eval_runtime': 8.2933,
 'eval_samples_per_second': 262.019,
 'eval_steps_per_second': 32.798}