## Preparing the environment

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

You will need to setup git, adapt your email and name in the following cell.

In [None]:
!git config --global user.email "giovanna.andrade@icomp.ufam.edu.br"
!git config --global user.name "gioandrade7"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Preprocessing the data

In [None]:
!git clone https://github.com/peluz/lener-br.git

Cloning into 'lener-br'...
remote: Enumerating objects: 314, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 314 (delta 11), reused 3 (delta 3), pack-reused 291[K
Receiving objects: 100% (314/314), 63.02 MiB | 31.69 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [None]:
dev_path = "/content/lener-br/leNER-Br/dev"
train_path = "/content/lener-br/leNER-Br/train"
test_path = "/content/lener-br/leNER-Br/test"

In [None]:
import os
dev_files = os.listdir(dev_path)
train_files = os.listdir(train_path)
test_files = os.listdir(test_path)

In [None]:
def get_texts(files, path):
  texts = []
  for file in files:
    with open(os.path.join(path, file),'r',encoding='utf8') as f:
        text = f.readlines()
        texts.append(text)
  return texts

In [None]:
dev_texts = get_texts(dev_files, dev_path)
train_texts = get_texts(train_files, train_path)
test_texts = get_texts(test_files, test_path)

In [None]:
train_texts[0]

In [None]:
import pandas as pd
def mount_dataset(texts):
  sentences = []
  words = []
  tags = []
  count_sentence = 1
  count_line = 0
  for text in texts:
    for line in text:
      if(line == "\n"):
        count_sentence += 1
      else:
        split = line.strip().split(" ")
        words.append(split[0])
        tags.append(split[1])
        sentences.append(count_sentence)
  df = pd.DataFrame({
      'Sentence #': sentences,
      'Word': words,
      'Tag': tags
  })

  return df

In [None]:
dev_df = mount_dataset(dev_texts)
train_df = mount_dataset(train_texts)
test_df = mount_dataset(test_texts)

In [None]:
dev_df.head(15)

Unnamed: 0,Sentence #,Word,Tag
0,1,Documento,O
1,1,assinado,O
2,1,eletronicamente,O
3,1,por,O
4,1,José,B-PESSOA
5,1,Barroso,I-PESSOA
6,1,Filho,I-PESSOA
7,1,",",O
8,1,Matricula,O
9,1,1117,O


In [None]:
all_df = pd.concat([dev_df, train_df, test_df], axis = 0)
all_df

Unnamed: 0,Sentence #,Word,Tag
0,1,Documento,O
1,1,assinado,O
2,1,eletronicamente,O
3,1,por,O
4,1,José,B-PESSOA
...,...,...,...
95255,2778,de,O
95256,2778,Chaves,O
95257,2778,Públicas,O
95258,2778,Brasileira,O


In [None]:
tags = list(set(all_df['Tag']))
tags = [i for i in tags if i != "O"]
tags.insert(0, "O")
tags

['O',
 'B-PESSOA',
 'I-PESSOA',
 'B-JURISPRUDENCIA',
 'B-LEGISLACAO',
 'B-TEMPO',
 'B-LOCAL',
 'I-LOCAL',
 'I-LEGISLACAO',
 'B-ORGANIZACAO',
 'I-ORGANIZACAO',
 'I-JURISPRUDENCIA',
 'I-TEMPO']

In [None]:
idx2tag = {k:v for k,v in enumerate(tags)}
idx2tag

{0: 'O',
 1: 'B-PESSOA',
 2: 'I-PESSOA',
 3: 'B-JURISPRUDENCIA',
 4: 'B-LEGISLACAO',
 5: 'B-TEMPO',
 6: 'B-LOCAL',
 7: 'I-LOCAL',
 8: 'I-LEGISLACAO',
 9: 'B-ORGANIZACAO',
 10: 'I-ORGANIZACAO',
 11: 'I-JURISPRUDENCIA',
 12: 'I-TEMPO'}

In [None]:
tag2idx = {v:k for k,v in enumerate(tags)}
tag2idx

{'O': 0,
 'B-PESSOA': 1,
 'I-PESSOA': 2,
 'B-JURISPRUDENCIA': 3,
 'B-LEGISLACAO': 4,
 'B-TEMPO': 5,
 'B-LOCAL': 6,
 'I-LOCAL': 7,
 'I-LEGISLACAO': 8,
 'B-ORGANIZACAO': 9,
 'I-ORGANIZACAO': 10,
 'I-JURISPRUDENCIA': 11,
 'I-TEMPO': 12}

In [None]:
def apply_tag(df):
  df['Tag'] = df['Tag'].apply(lambda x: tag2idx[x])

In [None]:
apply_tag(dev_df)
apply_tag(train_df)
apply_tag(test_df)

In [None]:
dev_df.head(15)

Unnamed: 0,Sentence #,Word,Tag
0,1,Documento,0
1,1,assinado,0
2,1,eletronicamente,0
3,1,por,0
4,1,José,1
5,1,Barroso,2
6,1,Filho,2
7,1,",",0
8,1,Matricula,0
9,1,1117,0


In [None]:
from transformers import AutoTokenizer

model_checkpoint = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [None]:
inputs = tokenizer(list(train_df[train_df["Sentence #"] == 1]["Word"]), is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'P',
 '##O',
 '##DE',
 '##R',
 'J',
 '##U',
 '##DI',
 '##CI',
 '##Á',
 '##RI',
 '##O',
 'DO',
 'ES',
 '##TA',
 '##DO',
 'DO',
 'AC',
 '##RE',
 'Segunda',
 'Câmara',
 'Cí',
 '##vel',
 '1',
 'End',
 '##ere',
 '##ço',
 ':',
 'Rua',
 'Tribunal',
 'de',
 'Justiça',
 ',',
 's',
 '/',
 'n',
 ',',
 'Via',
 'Verde',
 ',',
 'CE',
 '##P',
 '69',
 '.',
 '91',
 '##5',
 '-',
 '63',
 '##1',
 ',',
 'Tel',
 '.',
 '68',
 '33',
 '##0',
 '##2',
 '-',
 '04',
 '##44',
 '/',
 '04',
 '##45',
 ',',
 'Rio',
 'Branco',
 '-',
 'AC',
 '-',
 'Mod',
 '.',
 '500',
 '##25',
 '##8',
 '-',
 'Auto',
 '##s',
 'n',
 '.',
 '[UNK]',
 '100',
 '##21',
 '##99',
 '-',
 '81',
 '.',
 '2017',
 '.',
 '8',
 '.',
 '01',
 '.',
 '000',
 '##0',
 '/',
 '500',
 '##00',
 'Ac',
 '##ór',
 '##dão',
 'n',
 '.',
 ':',
 '5',
 '.',
 '58',
 '##5',
 'Classe',
 ':',
 'Emb',
 '##ar',
 '##gos',
 'de',
 'Declaração',
 'n',
 '.',
 '100',
 '##21',
 '##99',
 '-',
 '81',
 '.',
 '2017',
 '.',
 '8',
 '.',
 '01',
 '.',
 '000',
 '##0',
 '/',
 '500',


In [None]:
word_ids = inputs.word_ids()

In [None]:

word_ids

[None,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 3,
 3,
 4,
 5,
 5,
 6,
 7,
 8,
 8,
 9,
 10,
 10,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 17,
 17,
 18,
 19,
 20,
 21,
 22,
 22,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 24,
 25,
 25,
 26,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 28,
 29,
 30,
 30,
 30,
 31,
 32,
 33,
 34,
 34,
 34,
 35,
 36,
 36,
 37,
 37,
 37,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 38,
 39,
 39,
 39,
 40,
 40,
 41,
 42,
 42,
 42,
 42,
 43,
 44,
 45,
 45,
 45,
 46,
 47,
 48,
 49,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 50,
 51,
 51,
 52,
 53,
 54,
 55,
 55,
 55,
 56,
 57,
 58,
 59,
 59,
 60,
 60,
 61,
 62,
 62,
 62,
 62,
 63,
 64,
 65,
 65,
 65,
 66,
 67,
 67,
 67,
 67,
 67,
 68,
 68,
 68,
 68,
 68,
 68,
 69,
 69,
 69,
 69,
 70,
 70,
 71,
 72,
 72,
 73,
 74,
 75,
 75,
 76,
 77,
 78,
 78,
 79,
 80,
 80,
 80,
 80,
 81,
 82,
 82,
 82,
 83,
 84,
 84,
 84,
 85,
 85,
 86,
 86,
 86,
 

In [None]:
#assings each token to a tag

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [None]:
labels = list(train_df[train_df["Sentence #"] == 1]["Tag"])
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 6, 7, 7, 9, 10, 10, 0, 0, 0, 6, 7, 7, 7, 0, 0, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0, 0, 0, 3, 11, 11, 3, 11, 11, 11, 0, 0, 3, 11, 11, 11, 11, 11, 0, 0, 6, 7, 0, 0, 9, 10, 10, 0, 0, 0, 1, 2, 0, 0, 1, 2, 2, 0, 0, 1, 2, 2, 2, 2, 0, 9, 0, 0, 0, 0, 0, 9, 10, 10, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 7, 7, 8, 9, 10, 10, 10, 0, 0, 0, 0, 0, 6, 7, 7, 7, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, 8, 0, 0, 0, 0, 0, 0, 0, 3, 4, 11, 12, 12, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 3, 4, 4, 11, 12, 11, 11, 12, 12, 12, 0, 0, 3, 4, 4, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 0, 0, 6, 7, 0, 0, 0, 0, 9, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 10, 10, 10, 10, 10, 10, 0, -100]


In [None]:
def tokenize_and_align_labels(df):
  sentences = df["Sentence #"].unique()
  tokens = []
  tags = []
  for i in sentences:
    tokens.append(list(df[df["Sentence #"] == sentences[i-1]]["Word"]))
    tags.append(list(df[df["Sentence #"] == sentences[i-1]]["Tag"]))

  tokenized_inputs = tokenizer(
        tokens, truncation=True, is_split_into_words=True
    )

  all_labels = tags
  new_labels = []

  for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

  tokenized_inputs["labels"] = new_labels
  return tokenized_inputs


In [None]:
train_tokenized_inputs = tokenize_and_align_labels(train_df)
dev_tokenized_inputs = tokenize_and_align_labels(dev_df)
test_tokenized_inputs = tokenize_and_align_labels(test_df)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Fine-tuning the model

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)

In [None]:
from datasets import Dataset
tokenized_datasets_train = Dataset.from_dict(train_tokenized_inputs)
tokenized_datasets_dev = Dataset.from_dict(dev_tokenized_inputs)
tokenized_datasets_test = Dataset.from_dict(test_tokenized_inputs)

In [None]:
tf_train_dataset = tokenized_datasets_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=4,
)

tf_eval_dataset = tokenized_datasets_dev.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=4,
)

### Defining the model

In [None]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=idx2tag,
    label2id=tag2idx,
)

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#number of labels
model.config.num_labels

13

### Fine-tuning the model

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
#tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(hub_model_id ="gioandrade/lener_bert", output_dir="./lener_bert",  tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

/content/lener_bert is already a clone of https://huggingface.co/gioandrade/lener_bert. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/3
Epoch 2/3

Several commits (2) will be pushed upstream.


Epoch 3/3

Several commits (3) will be pushed upstream.




<keras.src.callbacks.History at 0x7b071818f8b0>

In [None]:
!cp -r "/content/lener_bert" "/content/drive/MyDrive/projeto_giovanna_jusbrasil/Modelos"

In [None]:
model.save(filepath="/content/drive/MyDrive/projeto_giovanna_jusbrasil/Modelos/lener_BERT_saved_model_format")

### metrics

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=7dd26077168ce080594842159f6d309980415cd073cb91f26d4129c5d19d4943
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
labels = list(train_df[train_df["Sentence #"] == 1]["Tag"])
labels = list(map(lambda x: idx2tag[x], labels))
labels

['O',
 'O',
 'O',
 'B-LOCAL',
 'I-LOCAL',
 'I-LOCAL',
 'B-ORGANIZACAO',
 'I-ORGANIZACAO',
 'I-ORGANIZACAO',
 'O',
 'O',
 'O',
 'B-LOCAL',
 'I-LOCAL',
 'I-LOCAL',
 'I-LOCAL',
 'O',
 'O',
 'O',
 'B-LOCAL',
 'I-LOCAL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOCAL',
 'I-LOCAL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'B-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'O',
 'O',
 'B-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'I-JURISPRUDENCIA',
 'O',
 'O',
 'B-LOCAL',
 'I-LOCAL',
 'O',
 'O',
 'B-ORGANIZACAO',
 'I-ORGANIZACAO',
 'I-ORGANIZACAO',
 'O',
 'O',
 'O',
 'B-PESSOA',
 'I-PESSOA',
 'O',
 'O',
 'B-PESSOA',
 'I-PESSOA',
 'I-PESSOA',
 'O',
 'O',
 'B-PESSOA',
 'I-PESSOA',
 'I-PESSOA',
 'I-PESSOA',
 'I-PESSOA',
 'O',
 'B-ORGANIZACAO',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORGANIZACAO',
 'I-ORGANIZACAO',
 'I-ORGANIZACAO',
 'O']

In [None]:
predictions = labels.copy()
predictions[3] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'JURISPRUDENCIA': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 3},
 'LOCAL': {'precision': 0.8,
  'recall': 0.8,
  'f1': 0.8000000000000002,
  'number': 5},
 'ORGANIZACAO': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 4},
 'PESSOA': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 3},
 'overall_precision': 0.9333333333333333,
 'overall_recall': 0.9333333333333333,
 'overall_f1': 0.9333333333333333,
 'overall_accuracy': 0.9886363636363636}

In [None]:
import numpy as np

all_predictions = []
all_labels = []
for batch in tf_eval_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(tags[predicted_idx])
            all_labels.append(tags[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

{'JURISPRUDENCIA': {'precision': 0.7628083491461101,
  'recall': 0.7701149425287356,
  'f1': 0.7664442326024786,
  'number': 1044},
 'LEGISLACAO': {'precision': 0.7844588344125809,
  'recall': 0.8,
  'f1': 0.7921531994395142,
  'number': 2120},
 'LOCAL': {'precision': 0.9318734793187348,
  'recall': 0.9720812182741116,
  'f1': 0.9515527950310559,
  'number': 1576},
 'ORGANIZACAO': {'precision': 0.8165289256198347,
  'recall': 0.8805704099821747,
  'f1': 0.8473413379073756,
  'number': 1122},
 'PESSOA': {'precision': 0.9647435897435898,
  'recall': 0.9709677419354839,
  'f1': 0.9678456591639872,
  'number': 620},
 'TEMPO': {'precision': 0.7615131578947368,
  'recall': 0.774247491638796,
  'f1': 0.7678275290215588,
  'number': 1196},
 'overall_precision': 0.8278128950695323,
 'overall_recall': 0.8528262568377182,
 'overall_f1': 0.840133435976392,
 'overall_accuracy': 0.9742788791198744}

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "/content/drive/MyDrive/projeto_giovanna_jusbrasil/Modelos/lener_bert"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("Consignou que o artigo 71, § 1º, da Lei nº 8.666/93 deve ser interpretado em consoância com o disposto no artigo 54 da mesma lei.")

Some layers from the model checkpoint at /content/drive/MyDrive/projeto_giovanna_jusbrasil/Modelos/lener_bert were not used when initializing TFBertForTokenClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at /content/drive/MyDrive/projeto_giovanna_jusbrasil/Modelos/lener_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without furthe

[{'entity_group': 'LEGISLACAO',
  'score': 0.99605024,
  'word': 'artigo 71, § 1º, da Lei nº 8. 666 / 93',
  'start': 16,
  'end': 51},
 {'entity_group': 'LEGISLACAO',
  'score': 0.75976,
  'word': 'artigo 54',
  'start': 106,
  'end': 115}]

## ONNX

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd /content/gdrive/MyDrive/projeto_giovanna_jusbrasil/Modelos

/content/gdrive/.shortcut-targets-by-id/1Dwq1hP1fQP_-7KaGuLiQH1JJL63KZVCG/projeto_giovanna_jusbrasil/Modelos


In [None]:
!ls

bert_base_ner	   lener_bert			  models	    scripts
bilstm_leNer	   lener_BERT.ipynb		  __pycache__	    tokenizer
bi-lstm-unk-words  lener_BERT_saved_model_format  resultados.ipynb


In [None]:
!pip uninstall onnxruntime
!pip uninstall optimum
!pip uninstall onnx
!pip uninstall numpy

!pip install numpy==1.23.5
!pip install onnxruntime==1.17.0
!pip install optimum==1.16.2
!pip install onnx==1.15.0


Found existing installation: onnxruntime 1.17.0
Uninstalling onnxruntime-1.17.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/onnxruntime-1.17.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/onnxruntime/*
Proceed (Y/n)? y
  Successfully uninstalled onnxruntime-1.17.0
Found existing installation: optimum 1.16.2
Uninstalling optimum-1.16.2:
  Would remove:
    /usr/local/bin/optimum-cli
    /usr/local/lib/python3.10/dist-packages/optimum-1.16.2.dist-info/*
    /usr/local/lib/python3.10/dist-packages/optimum/bettertransformer/*
    /usr/local/lib/python3.10/dist-packages/optimum/commands/*
    /usr/local/lib/python3.10/dist-packages/optimum/configuration_utils.py
    /usr/local/lib/python3.10/dist-packages/optimum/conftest.py
    /usr/local/lib/python3.10/dist-packages/optimum/exporters/*
    /usr/local/lib/python3.10/dist-packages/optimum/fx/*
    /usr/local/lib/python3.10/dist-packages/optimum/gptq/*
    /usr/local/lib/python3.10/dist-packages/optimum/modeling_b

Collecting onnxruntime==1.17.0
  Using cached onnxruntime-1.17.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
Installing collected packages: onnxruntime
Successfully installed onnxruntime-1.17.0
Collecting optimum==1.16.2
  Using cached optimum-1.16.2-py3-none-any.whl (402 kB)
Installing collected packages: optimum
Successfully installed optimum-1.16.2
Collecting onnx==1.15.0
  Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx
Successfully installed onnx-1.15.0


In [None]:
# !pip install onnx
# !pip install onnxruntime
# !pip install optimum
!pip install tqdm
import transformers



In [None]:
!pip install tf2onnx

Collecting tf2onnx
  Downloading tf2onnx-1.16.1-py3-none-any.whl (455 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.8/455.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tf2onnx
Successfully installed tf2onnx-1.16.1


In [None]:
!python -m scripts.convert --quantize --model_id lener_bert/ --task token-classification

2024-03-08 02:48:33.134469: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 02:48:33.134528: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 02:48:33.136372: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Framework not specified. Using tf to export to ONNX.
2024-03-08 02:48:47.628404: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
Some layers from the model checkpoint at lener_bert/ were not used when initializing TFBertForTo

In [None]:
!optimum-cli export onnx --model lener_bert/ models_onnx --task token-classification

2024-03-08 02:50:38.013064: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 02:50:38.013119: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 02:50:38.014467: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Framework not specified. Using tf to export to ONNX.
2024-03-08 02:50:41.890444: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
Some layers from the model checkpoint at lener_bert/ were not used when initializing TFBertForTo

## Metrics

In [None]:
import pandas as pd

In [None]:
metrics = {'JURISPRUDENCIA': {'precision': 0.7628083491461101,
  'recall': 0.7701149425287356,
  'f1': 0.7664442326024786,
  'number': 1044},
 'LEGISLACAO': {'precision': 0.7844588344125809,
  'recall': 0.8,
  'f1': 0.7921531994395142,
  'number': 2120},
 'LOCAL': {'precision': 0.9318734793187348,
  'recall': 0.9720812182741116,
  'f1': 0.9515527950310559,
  'number': 1576},
 'ORGANIZACAO': {'precision': 0.8165289256198347,
  'recall': 0.8805704099821747,
  'f1': 0.8473413379073756,
  'number': 1122},
 'PESSOA': {'precision': 0.9647435897435898,
  'recall': 0.9709677419354839,
  'f1': 0.9678456591639872,
  'number': 620},
 'TEMPO': {'precision': 0.7615131578947368,
  'recall': 0.774247491638796,
  'f1': 0.7678275290215588,
  'number': 1196}}

metrics = {
    'entidade': ['JURISPRUDENCIA', 'LEGISLACAO', 'LOCAL', 'ORGANIZACAO', 'PESSOA', 'TEMPO'],
    'precisão': [0.7628083491461101, 0.7844588344125809, 0.9318734793187348, 0.8165289256198347, 0.9647435897435898, 0.7615131578947368],
    'revocação': [0.7701149425287356, 0.7844588344125809, 0.9720812182741116, 0.8805704099821747, 0.9709677419354839, 0.774247491638796],
    'f1_score': [0.7664442326024786, 0.7921531994395142, 0.9515527950310559, 0.8473413379073756, 0.9678456591639872, 0.7678275290215588,]
}

df = pd.DataFrame(metrics)

In [None]:
df

Unnamed: 0,entidade,precisão,revocação,f1_score
0,JURISPRUDENCIA,0.762808,0.770115,0.766444
1,LEGISLACAO,0.784459,0.784459,0.792153
2,LOCAL,0.931873,0.972081,0.951553
3,ORGANIZACAO,0.816529,0.88057,0.847341
4,PESSOA,0.964744,0.970968,0.967846
5,TEMPO,0.761513,0.774247,0.767828


In [None]:
df.f1_score

0    0.766444
1    0.792153
2    0.951553
3    0.847341
4    0.967846
5    0.767828
Name: f1_score, dtype: float64

In [None]:
import plotly.express as px

fig = px.bar(df, x="entidade", y=["precisão", "revocação", "f1_score"], title="Métricas por entidade", barmode='group')
fig.update_layout(font=dict(size=17))
fig.show()