In [None]:
!pip3 install -U pip setuptools wheel
!pip3 install -U spacy

In [1]:
# https://ner.pythonhumanities.com/03_01_create_ner_training_set.html
# https://ner.pythonhumanities.com/03_02_train_spacy_ner_model.html
import json
import spacy
import json
import warnings
import random
from spacy.tokens import DocBin
from pathlib import Path
from tqdm import tqdm

import utils # local file

In [2]:
# carregar características
caracteristicas = utils.carregar_json("../data/caracteristicas.json")
# corrigir caracteristicas duplicadas
caracteristicas = list(set(caracteristicas))

In [3]:
pessoas = utils.carregar_json("../data/dados_pessoas.json")

In [4]:
nlp_pt = spacy.load("pt_core_news_lg")

In [5]:
docs = [nlp_pt(p["biografia"]) for p in tqdm(pessoas)]

100%|██████████| 1756/1756 [02:42<00:00, 10.80it/s]


In [6]:
# sentencas = [s.text for doc in docs for s in doc.sents] # lista de sentenças de todas as biografias
todas_sentencas = [s.text for doc in docs for s in doc.sents]
boas_sentencas = [
    s
    for s in todas_sentencas
    if any(c in s for c in ["foi", "é uma", "é um", "atuou", "atua", "era"])
]
melhores_sentencas = [next(doc.sents).text for doc in docs]


In [7]:
crcs_para_treinar = caracteristicas.copy()  # nenhuma característica foi treinada ainda
sentencas_para_treinar = []
# adicionar sentenças que contenham características
for sentenca in boas_sentencas:
    # verificar se a sentença contém características
    crcs_na_stc = [c for c in crcs_para_treinar if c in sentenca]

    if len(crcs_na_stc) > 0:  # se a sentença contém características
        sentencas_para_treinar.append(sentenca)  # adicionar a sentença
        # remover as características da lista de características para treinar
        crcs_para_treinar = [c for c in crcs_para_treinar if c not in crcs_na_stc]


In [8]:
sentencas_para_validar = melhores_sentencas.copy()

# remover sentenças para validar do conjunto que estão sendo usadas para treinar
sentencas_para_validar = [s for s in sentencas_para_validar if s not in sentencas_para_treinar]

len(sentencas_para_treinar), len(sentencas_para_validar)

(554, 1493)

In [9]:
patterns = [
    {"label": "CARACTERISTICA", "pattern": caracteristica}
    for caracteristica in caracteristicas
]

lista_treino = sentencas_para_treinar + random.sample(sentencas_para_validar[:400], 300)
lista_validacao = random.sample(sentencas_para_validar, 400)

In [10]:
TRAIN_DATA = utils.formatar(lista_treino, patterns)
VALID_DATA = utils.formatar(lista_validacao, patterns)

100%|██████████| 854/854 [00:01<00:00, 583.24it/s]
100%|██████████| 400/400 [00:00<00:00, 621.34it/s]


In [11]:
utils.salvar_json("../data/train_data.json", TRAIN_DATA)
utils.salvar_json("../data/valid_data.json", VALID_DATA)

In [12]:
utils.converter("pt", TRAIN_DATA, "../data/train.spacy")
utils.converter("pt", VALID_DATA, "../data/valid.spacy")

100%|██████████| 854/854 [00:01<00:00, 552.32it/s]
100%|██████████| 400/400 [00:00<00:00, 630.78it/s]


In [13]:
!python3 -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
!python3 -m spacy train config.cfg --output ../data/models --paths.train ../data/train.spacy --paths.dev ../data/valid.spacy

[38;5;4mℹ Saving to output directory: ../data/models[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-02-04 02:12:38,729] [INFO] Set up nlp object from config
[2023-02-04 02:12:38,795] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-02-04 02:12:38,814] [INFO] Created vocabulary
[2023-02-04 02:12:38,818] [INFO] Finished initializing nlp object
[2023-02-04 02:12:41,173] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     13.67    0.00    0.00    0.00    0.00
  0     200         49.32   1554.85   96.19   94.55   97.89    0.96
  2     400         48.02    337.65   98.01   97.18   98.85    0.98
  3     600         73.69    106.92   98.30   97.28   99.33    0.98
  5     800         43.44     55.34   98.25   97.2