<a href="https://colab.research.google.com/github/eduardoplima/aed-lener-br/blob/main/aed_lener_br.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuração do ambiente

In [None]:
import pandas as pd

In [None]:
def parse_conll(filepath):
    """Read a CoNLL-style file into a list of sentences.

    Each non-blank line is split on whitespace; the first field is taken as
    the token and the last field as its label. Lines with fewer than two
    fields are ignored. Blank (whitespace-only) lines end the current sentence.

    Args:
        filepath: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of sentences, each a list of ``(token, label)`` tuples.
        Empty sentences are never emitted.
    """
    sentences = []
    current = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for raw_line in file:
            fields = raw_line.split()
            if not fields:
                # Whitespace-only line: sentence boundary.
                if current:
                    sentences.append(current)
                    current = []
                continue
            if len(fields) < 2:
                # Malformed line without a label column; skip without breaking the sentence.
                continue
            current.append((fields[0], fields[-1]))
    # The file may not end with a blank line; flush the trailing sentence.
    if current:
        sentences.append(current)
    return sentences

# Parse the training and test splits; both paths are relative to the current working directory.
train_data = parse_conll('train.conll')
test_data = parse_conll('test.conll')


In [None]:
# Preview only the first two sentences; displaying the whole list floods the notebook output.
train_data[:2]

[[('EMENTA', 'O'),
  (':', 'O'),
  ('APELAÇÃO', 'O'),
  ('CÍVEL', 'O'),
  ('-', 'O'),
  ('AÇÃO', 'O'),
  ('DE', 'O'),
  ('INDENIZAÇÃO', 'O'),
  ('POR', 'O'),
  ('DANOS', 'O'),
  ('MORAIS', 'O'),
  ('-', 'O'),
  ('PRELIMINAR', 'O'),
  ('-', 'O'),
  ('ARGUIDA', 'O'),
  ('PELO', 'O'),
  ('MINISTÉRIO', 'B-ORGANIZACAO'),
  ('PÚBLICO', 'I-ORGANIZACAO'),
  ('EM', 'O'),
  ('GRAU', 'O'),
  ('RECURSAL', 'O'),
  ('-', 'O'),
  ('NULIDADE', 'O'),
  ('-', 'O'),
  ('AUSÊNCIA', 'O'),
  ('DE', 'O'),
  ('INTERVENÇÃO', 'O'),
  ('DO', 'O'),
  ('PARQUET', 'O'),
  ('NA', 'O'),
  ('INSTÂNCIA', 'O'),
  ('A', 'O'),
  ('QUO', 'O'),
  ('-', 'O'),
  ('PRESENÇA', 'O'),
  ('DE', 'O'),
  ('INCAPAZ', 'O'),
  ('-', 'O'),
  ('PREJUÍZO', 'O'),
  ('EXISTENTE', 'O'),
  ('-', 'O'),
  ('PRELIMINAR', 'O'),
  ('ACOLHIDA', 'O'),
  ('-', 'O'),
  ('NULIDADE', 'O'),
  ('RECONHECIDA', 'O'),
  ('.', 'O')],
 [('-', 'O'),
  ('O', 'O'),
  ('art', 'B-LEGISLACAO'),
  ('.', 'I-LEGISLACAO'),
  ('178', 'I-LEGISLACAO'),
  (',', 'I-LEGISLACA

In [None]:
def extract_labels(filepath):
    """Collect the distinct labels that occur in a CoNLL-style file.

    The label is the last whitespace-separated field of every line that has
    at least two fields; blank and single-field lines are ignored.

    Args:
        filepath: Path to a UTF-8 encoded CoNLL file.

    Returns:
        The unique labels as a sorted list of strings.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        columns_per_line = (line.split() for line in file)
        # The set comprehension consumes the generator while the file is still open.
        seen = {columns[-1] for columns in columns_per_line if len(columns) >= 2}
    return sorted(seen)

# The label inventory is collected from the training file only.
label_list = extract_labels('train.conll')


In [None]:
# Bidirectional lookup tables between label strings and integer ids;
# ids follow the position of each label in label_list.
num_labels = len(label_list)
label_to_id = dict(zip(label_list, range(num_labels)))
id_to_label = dict(zip(label_to_id.values(), label_to_id.keys()))


In [None]:
# Show the label inventory together with its size.
label_list, num_labels

(['B-JURISPRUDENCIA',
  'B-LEGISLACAO',
  'B-LOCAL',
  'B-ORGANIZACAO',
  'B-PESSOA',
  'B-TEMPO',
  'I-JURISPRUDENCIA',
  'I-LEGISLACAO',
  'I-LOCAL',
  'I-ORGANIZACAO',
  'I-PESSOA',
  'I-TEMPO',
  'O'],
 13)

In [None]:
from transformers import BertForTokenClassification

# Load the pretrained multilingual BERT encoder with a token-classification head
# sized for our label set. Passing the id/label mappings stores the real tag
# names in the model config instead of generic LABEL_<n> placeholders.
model = BertForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import BertTokenizerFast

# Tokenizer for the same checkpoint as the model. tokenize_and_align_labels
# relies on `word_ids()` of the returned encoding (available with fast tokenizers).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

def tokenize_and_align_labels(sentences, label_all_tokens=False):
    """Tokenize pre-split sentences and align word-level labels to sub-word tokens.

    Uses the module-level ``tokenizer`` and ``label_to_id`` mapping.

    Args:
        sentences: List of sentences, each a list of ``(token, label)`` pairs.
        label_all_tokens: If True, every sub-word piece of a word receives the
            word's label id. If False (default), only the first piece is
            labelled and the remaining pieces get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per sentence, the same length as that sentence's token
        sequence. Positions that map to no word (special tokens, padding)
        are set to -100.

    Raises:
        KeyError: If a sentence contains a label missing from ``label_to_id``.
    """
    words_per_sentence = [[token for token, _ in sentence] for sentence in sentences]
    # NOTE(review): the offset mapping is requested but not used in this function;
    # confirm whether a downstream consumer needs it.
    tokenized_inputs = tokenizer(
        words_per_sentence,
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding=True,
        truncation=True
    )
    labels = []
    for i, sentence in enumerate(sentences):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Token does not belong to any input word.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word, or any piece when labelling all tokens.
                label_ids.append(label_to_id[sentence[word_idx][1]])
            else:
                # Continuation piece of the previous word.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

# Fresh model instance for fine-tuning. The id/label mappings are stored in the
# model config so predictions and saved checkpoints carry the real tag names.
model = BertForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

training_args = TrainingArguments(
    output_dir='./results',
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # rejected with a TypeError by the installed transformers release.
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# NOTE(review): `train_dataset` and `eval_dataset` are not defined in any cell
# visible above. They presumably need to be built from
# tokenize_and_align_labels(train_data) / tokenize_and_align_labels(test_data)
# before this cell can run; confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'