In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip -q install spacy skweak

In [3]:
! python -m spacy download pt_core_news_lg

Collecting pt-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.8.0/pt_core_news_lg-3.8.0-py3-none-any.whl (568.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.2/568.2 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import pandas as pd
import re
from tqdm import tqdm

import spacy
import skweak.heuristics
import skweak.aggregation
import skweak.generative
from spacy.matcher import PhraseMatcher

In [5]:
nlp = spacy.load("pt_core_news_lg")

In [6]:
df_train = pd.read_parquet('/content/drive/MyDrive/DCAI-NER-weak_supervision/data/train.parquet')
df_dev = pd.read_parquet('/content/drive/MyDrive/DCAI-NER-weak_supervision/data/dev.parquet')
df_test = pd.read_parquet('/content/drive/MyDrive/DCAI-NER-weak_supervision/data/test.parquet')

In [7]:
df_train.head(3)

Unnamed: 0,sentences,tokens,ner_tokens
0,"sala das sessões , em de de 2019 .","[sala, das, sessões, ,, em, de, de, 2019, .]","[O, O, O, O, O, O, O, O, O]"
1,da decisão de que trata o § 12 deste artigo ca...,"[da, decisão, de, que, trata, o, §, 12, deste,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"projeto de lei n.º , de 2017 ( do sr .","[projeto, de, lei, n.º, ,, de, 2017, (, do, sr...","[O, O, O, O, O, O, O, O, O, O, O]"


### Processe os textos de cada DataFrame para criar listas de Docs spaCy

In [8]:
docs_train = list(tqdm(nlp.pipe(df_train['sentences'].astype(str)), total=len(df_train)))
docs_dev = list(tqdm(nlp.pipe(df_dev['sentences'].astype(str)), total=len(df_dev)))
docs_test = list(tqdm(nlp.pipe(df_test['sentences'].astype(str)), total=len(df_test)))

100%|██████████| 1760/1760 [00:09<00:00, 194.04it/s]
100%|██████████| 140/140 [00:00<00:00, 144.05it/s]
100%|██████████| 592/592 [00:04<00:00, 147.99it/s]


In [9]:
import unicodedata

def clean_text(text):
    text = unicodedata.normalize('NFKD', text)
    text = ''.join([c for c in text if not unicodedata.combining(c)])
    text = text.lower()

    return text

original_text = "Ação do Deputado João Müller na Câmãrà!"
cleaned_text = clean_text(original_text)
print(cleaned_text)

acao do deputado joao muller na camara!


### Limpando dados

In [10]:
df_train.tokens = df_train.tokens.apply(lambda x: [clean_text(token) for token in x])
df_train.sentences = df_train.sentences.apply(lambda x: [clean_text(sentence) for sentence in x])
df_train.sentences = df_train.sentences.apply(lambda x: ''.join(x))

In [11]:
df_dev.tokens = df_dev.tokens.apply(lambda x: [clean_text(token) for token in x])
df_dev.sentences = df_dev.sentences.apply(lambda x: [clean_text(sentence) for sentence in x])
df_dev.sentences = df_dev.sentences.apply(lambda x: ''.join(x))

In [12]:
df_test.tokens = df_test.tokens.apply(lambda x: [clean_text(token) for token in x])
df_test.sentences = df_test.sentences.apply(lambda x: [clean_text(sentence) for sentence in x])
df_test.sentences = df_test.sentences.apply(lambda x: ''.join(x))

### Adicionando uma coluna que alinhe os rótulos NER e os tokens

In [14]:
df_train['tokens_w_ner'] = df_train.apply(lambda r: list(zip(r.tokens, r.ner_tokens)), axis=1)
df_dev['tokens_w_ner'] = df_dev.apply(lambda r: list(zip(r.tokens, r.ner_tokens)), axis=1)
df_test['tokens_w_ner'] = df_test.apply(lambda r: list(zip(r.tokens, r.ner_tokens)), axis=1)

### Processando as sentenças em Docs

In [13]:
docs_train = list(tqdm(nlp.pipe(df_train['sentences'].astype(str)), total=len(df_train)))
docs_dev = list(tqdm(nlp.pipe(df_dev['sentences'].astype(str)), total=len(df_dev)))
docs_test = list(tqdm(nlp.pipe(df_test['sentences'].astype(str)), total=len(df_test)))

100%|██████████| 1760/1760 [00:11<00:00, 149.09it/s]
100%|██████████| 140/140 [00:01<00:00, 139.33it/s]
100%|██████████| 592/592 [00:04<00:00, 125.03it/s]


In [14]:
'''
spacy_docs_train = nlp_sm.pipe(df_train.sentences.values)
skweak.utils.docbin_writer(spacy_docs_train, "./spacy/spacy_pt_core_news_sm_docs_train.bin")

spacy_docs_dev = nlp_sm.pipe(df_dev.sentences.values)
skweak.utils.docbin_writer(spacy_docs_dev, "./spacy/spacy_pt_core_news_sm_docs_dev.bin")

spacy_docs_test = nlp_sm.pipe(df_test.sentences.values)
skweak.utils.docbin_writer(spacy_docs_test, "./spacy/spacy_pt_core_news_sm_docs_test.bin")
''';

### Carregando os dados para ter um dicionário de nomes


In [15]:
df_names = pd.read_csv('/content/drive/MyDrive/DCAI-NER-weak_supervision/data/nomes.csv')

In [16]:
fst_names = df_names.nome.to_list()
fst_names[:5]

['abdenor', 'abdias', 'abdias', 'abel', 'abi']

In [17]:
df_names.sobrenome = df_names.sobrenome.astype(str)
snd_names = df_names.sobrenome.to_list()
snd_names = (' ').join(snd_names).split(' ')
snd_names = [x for x in snd_names if x not in ['da', 'de', 'do', 'das', 'dos', 'e', 'o', 'a', 'as', 'os']]
snd_names = list(set(snd_names))
snd_names = [x for x in snd_names if len(x) > 1]
snd_names[:5]

['aleffe', 'alyne', 'iracy', 'bias', 'torsello']

In [18]:
people_names = fst_names + snd_names
people_names = list(set(people_names))
people_names[:5]

['etaniel', 'karioláine', 'diosleicy', 'inireide', 'aleffe']

In [19]:
people_names = [clean_text(str(name)) for name in people_names]
people_names[:5]

['etaniel', 'kariolaine', 'diosleicy', 'inireide', 'aleffe']

### Criando label functions

In [20]:
name_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
patterns = [nlp.make_doc(name) for name in people_names]
name_matcher.add("PEOPLE_NAMES", patterns)

In [21]:
def f_dictionary_names(doc):
    for _, start, end in name_matcher(doc):
        yield start, end, "PESSOA"

def f_propn_simple(doc):
    for token in doc:
        if token.pos_ == "PROPN" and token.is_title:
            yield token.i, token.i + 1, "PESSOA"

def f_spacy_model(doc):
    for ent in doc.ents:
        if ent.label_ == "PER":
            yield ent.start, ent.end, "PESSOA"

lf1_annotator = skweak.heuristics.FunctionAnnotator("lf_dictionary_names", f_dictionary_names)
lf2_annotator = skweak.heuristics.FunctionAnnotator("lf_propn_simple", f_propn_simple)
lf3_annotator = skweak.heuristics.FunctionAnnotator("lf_spacy_model", f_spacy_model)

annotators = [lf1_annotator, lf2_annotator, lf3_annotator]

### Aplicando as label functions

In [23]:
docs_train_annotated = docs_train.copy()
docs_dev_annotated = docs_dev.copy()
docs_test_annotated = docs_test.copy()

for annotator in annotators:
    docs_train_annotated = list(annotator.pipe(docs_train_annotated))
    docs_dev_annotated = list(annotator.pipe(docs_dev_annotated))
    docs_test_annotated = list(annotator.pipe(docs_test_annotated))

### Treinando e aplicando o modelo HMM

In [24]:
hmm_model = skweak.generative.HMM("hmm", labels=["PESSOA"])
hmm_model.fit(docs_train_annotated)

Starting iteration 1
Number of processed documents: 1000
Finished E-step with 1704 documents
Starting iteration 2


         1  -40385.85959207             +nan


Number of processed documents: 1000
Finished E-step with 1704 documents
Starting iteration 3


         2  -38084.62019371   +2301.23939836


Number of processed documents: 1000
Finished E-step with 1704 documents
Starting iteration 4


         3  -37567.18756351    +517.43263021


Number of processed documents: 1000
Finished E-step with 1704 documents


         4  -37448.21529329    +118.97227021


In [25]:
docs_train_hmm = list(hmm_model.pipe(docs_train_annotated))
docs_dev_hmm = list(hmm_model.pipe(docs_dev_annotated))
docs_test_hmm = list(hmm_model.pipe(docs_test_annotated))

In [34]:
from spacy import displacy

doc_exemplo = docs_train_hmm[0]
fonte_a_visualizar = "hmm"

doc_para_visualizar = doc_exemplo.copy()
doc_para_visualizar.ents = doc_para_visualizar.spans[fonte_a_visualizar]

displacy.render(doc_para_visualizar, style="ent", jupyter=True)

In [41]:
from spacy.training import Example

def create_list_of_samples(docs, span_key):
    exemplos = []
    for doc in docs:
        doc_referencia = doc.copy()
        spans = doc.spans.get(span_key, [])
        anotacoes = {"entities": [(s.start_char, s.end_char, s.label_) for s in spans]}

        exemplo = Example.from_dict(doc_referencia, anotacoes)
        exemplos.append(exemplo)

    return exemplos

train_examples = create_list_of_samples(docs_train_hmm, "hmm")
dev_examples = create_list_of_samples(docs_dev_hmm, "hmm")

In [42]:
import spacy

nlp = spacy.blank("pt")
ner = nlp.add_pipe("ner")
ner.add_label("PESSOA")

1

In [43]:
import random
from spacy.training.loop import train
from spacy.util import minibatch

nlp.initialize(get_examples=lambda: train_examples)
N_ITER = 10
DROPOUT = 0.5

optimizer = nlp.create_optimizer()

for i in range(N_ITER):
    random.shuffle(train_examples)
    losses = {}

    for batch in minibatch(train_examples, size=8):
        nlp.update(batch, drop=DROPOUT, losses=losses, sgd=optimizer)

    print(f"Época {i+1}/{N_ITER}, Perdas: {losses}")
print("Treinamento concluído!")

Época 1/10, Perdas: {'ner': np.float32(8141.7734)}
Época 2/10, Perdas: {'ner': np.float32(1954.4944)}
Época 3/10, Perdas: {'ner': np.float32(1297.6627)}


KeyboardInterrupt: 

In [44]:
model_path = "./path_scpacy_model"
nlp.to_disk(model_path)

In [52]:
df_test

Unnamed: 0,sentences,tokens,ner_tokens,tokens_w_ner
0,"cremos que estes avisos , afixados em qualquer...","[cremos, que, estes, avisos, ,, afixados, em, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(cremos, O), (que, O), (estes, O), (avisos, O..."
1,iracema portella ) dispoe sobre os fundamentos...,"[iracema, portella, ), dispoe, sobre, os, fund...","[O, I-PESSOA, O, O, O, O, O, O, O, O, O, O, O,...","[(iracema, O), (portella, I-PESSOA), (), O), (..."
2,deputado hildo rocha,"[deputado, hildo, rocha]","[O, O, I-PESSOA]","[(deputado, O), (hildo, O), (rocha, I-PESSOA)]"
3,"por um lado , ha o programa escola aberta do g...","[por, um, lado, ,, ha, o, programa, escola, ab...","[O, O, O, O, O, O, O, O, O, O, O, O, O]","[(por, O), (um, O), (lado, O), (,, O), (ha, O)..."
4,esta lei entra em vigor na data de sua publica...,"[esta, lei, entra, em, vigor, na, data, de, su...","[O, O, O, O, O, O, O, O, O, O, O]","[(esta, O), (lei, O), (entra, O), (em, O), (vi..."
...,...,...,...,...
587,cabe destacar que este e um projeto com susten...,"[cabe, destacar, que, este, e, um, projeto, co...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(cabe, O), (destacar, O), (que, O), (este, O)..."
588,justificacao em grande parte do mundo e tambem...,"[justificacao, em, grande, parte, do, mundo, e...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(justificacao, O), (em, O), (grande, O), (par..."
589,"o reconhecimento da titulacao , por sua vez , ...","[o, reconhecimento, da, titulacao, ,, por, sua...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(o, O), (reconhecimento, O), (da, O), (titula..."
590,a proposicao tem como objetivo aprimorar a sis...,"[a, proposicao, tem, como, objetivo, aprimorar...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(a, O), (proposicao, O), (tem, O), (como, O),..."


In [56]:
def create_ground_truth_examples_from_tags(df_real):
    nlp = spacy.load("pt_core_news_lg")
    examples = []
    for _, row in df_real.iterrows():
        tokens = row['tokens']
        tags = row['ner_tokens']

        doc = nlp.make_doc(" ".join(tokens))

        entities = biluo_to_ents(doc, tags)
        doc.ents = entities

        example = Example(doc, doc)
        examples.append(example)

    return examples

ImportError: cannot import name 'biluo_to_ents' from 'spacy.training' (/usr/local/lib/python3.11/dist-packages/spacy/training/__init__.py)

In [65]:
import spacy
import pandas as pd
# No special spaCy training imports are needed now.

# --- STEP 1: LOAD YOUR TRAINED MODEL ---
model_path = "./path_scpacy_model"
nlp_trained = spacy.load(model_path)
print(f"Trained model loaded from '{model_path}'.")


# --- STEP 2: OUR NEW MANUAL HELPER FUNCTION ---
def bio_tags_to_entity_texts(tokens, tags):
    """
    Manually converts a list of tokens and BIO tags into a set of entity texts.
    This function replaces the need for spaCy's 'biluo_to_ents'.
    """
    entities = set()
    current_entity_tokens = []

    for token, tag in zip(tokens, tags):
        if tag == "I-PESSOA":
            if current_entity_tokens:
                entities.add(" ".join(current_entity_tokens))
                current_entity_tokens = []
            current_entity_tokens.append(token)
        else:
            if current_entity_tokens:
                entities.add(" ".join(current_entity_tokens))
                current_entity_tokens = []

    if current_entity_tokens:
        entities.add(" ".join(current_entity_tokens))

    return entities


true_positives = 0
false_positives = 0
false_negatives = 0

for row in df_test.itertuples():
    sentence_text = row.sentences
    true_tokens = row.tokens
    true_tags = row.ner_tokens

    # --- PREDICTIONS ---
    predicted_doc = nlp_trained(sentence_text)
    predicted_entities = {ent.text for ent in predicted_doc.ents if ent.label_ == "PESSOA"}

    # --- GROUND TRUTH entities ---
    true_entities = bio_tags_to_entity_texts(true_tokens, true_tags)

    # --- TP, FP, FN ---
    true_positives += len(predicted_entities.intersection(true_entities))
    false_positives += len(predicted_entities.difference(true_entities))
    false_negatives += len(true_entities.difference(predicted_entities))

print("Evaluation complete.")
precision = true_positives / (true_positives + false_positives + 1e-10)
recall = true_positives / (true_positives + false_negatives + 1e-10)
f1_score = 2 * (precision * recall) / (precision + recall + 1e-10)


print("\n--- FULLY MANUAL EVALUATION RESULTS ---")
print(f"True Positives (TP): {true_positives}")
print(f"False Positives (FP):    {false_positives}")
print(f"False Negatives (FN):    {false_negatives}")
print("-" * 35)
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1_score:.4f}")

Trained model loaded from './path_scpacy_model'.
Evaluation complete.

--- FULLY MANUAL EVALUATION RESULTS ---
True Positives (TP): 54
False Positives (FP):    2538
False Negatives (FN):    74
-----------------------------------
Precision: 0.0208
Recall:    0.4219
F1-Score:  0.0397
