In [2]:
import spacy
import csv
import os
import re
import random
from tqdm import tqdm
from skweak.heuristics import FunctionAnnotator
from skweak.base import CombinedAnnotator
from skweak.aggregation import HMM
from skweak.gazetteers import Trie, GazetteerAnnotator
from skweak.utils import display_entities
from skweak import utils

# Preprocessing

In [3]:
nlp_pt = spacy.load("pt_core_news_sm")

In [4]:
path = './Documentos Coletados/'

def extract_from_docs(path):    
    '''Extrai as sentencas a partir de vários arquivos em um diretório'''
    
    files = os.listdir(path)
    docs = []

    for filename in tqdm(files, position=0):
        if os.path.splitext(filename)[1] == ".txt":
            with open(os.path.join(path, filename)) as f:
                file = f.readlines()
                for line in file:
                    if line != '\n':
                        line = line
                        docs.append(nlp_pt(line))
            # docs.extend([nlp_pt(line) for line in file if line != '\n'])

    print('{} files processed'.format(len(files)))
    print('{} sentences found'.format(len(docs)))

In [5]:
def extract_from_file(filepath):
    '''
    Extrai as sentencas a partir de um único arquivo no qual elas estao separadas por quebra de linha.
    Retorna uma lista de sentencas do tipo Doc do Spacy.
    '''
    
    sentences = []
    with open(filepath) as f:
        lines = f.readlines()
        for l in tqdm(lines, position=0):
            if l != '\n':
                sentences.append(nlp_pt(l))
    
    return sentences
                

In [6]:
filepath = './Coleta de sentencas/text_sentences_extracted.txt'
sentences = extract_from_file(filepath)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41834/41834 [02:01<00:00, 343.71it/s]


# Labelling functions

## Detecting generic laws

In [7]:
def isRoman(token):
    return bool(re.search(r"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$",token))

In [8]:
def law_detector(doc):
    label = 'LEGISLACAO'
    variations_n = ['n.', 'n.º', 'nº', 'n°.', 'n°']
    variations_law = ['lei', 'decreto-lei', 'decreto', 'lc']
    variations_art = ['arts.', 'arts', 'art.', 'artigo', 'art', 'artigos']
    variations_paragraph = ['parágrafo', 'paragrafo', '§']
    varitions_inciso = ['inciso', 'incisos', 'inc.']
    variations_alinea = ['alinea', 'alínea']
    laws_types = ['federal','complementar','municipal', 'estadual']
    
    
    for token in doc: 
        #identificar leis
        if token.text.lower() in variations_law:
            if token.nbor(1).text[0].isdigit():
                yield token.i, token.i+2, label
            elif token.nbor(1).text.lower() in variations_n and token.nbor(2).text[0].isdigit():
                yield token.i, token.i+3, label
        
        
        elif token.text.lower() == 'lei' and token.nbor(1).text.lower() in laws_types:
            if token.nbor(2).text[0].isdigit(): ##Lei Federal 9.093/95
                yield token.i, token.i+3, label
            if token.nbor(2).text.lower() in variations_n and token.nbor(3).text[0].isdigit(): ##Lei Federal nº 9.093/95
                yield token.i, token.i+3, label
        
        #identificar artigos
        elif token.text.lower() in variations_art:
            if token.nbor(1).text[0].isdigit():
                yield token.i, token.i+2, label
            elif token.nbor(1).text == '.' and token.nbor(2).text[0].isdigit():
                yield token.i, token.i+3, label
        elif token.i > 3 and token.text[0].isdigit():
            if token.nbor(-3).text.lower() == 'arts' or token.nbor(-4).text.lower() == 'arts':
                yield token.i, token.i+1, label
                
        #identificar paragrafos
        elif token.text in variations_paragraph:
            if token.nbor(1).text[0].isdigit():
                yield token.i, token.i+2, label
            elif token.nbor(1).text.lower() == 'único': ##parágrafo único
                yield token.i, token.i+2, label
        
        #identificar incisos
        elif token.text.lower() in varitions_inciso:
            if isRoman(token.nbor(1).text):
                yield token.i, token.i+2, label
                
        elif token.i != 0 and token.i < len(doc)-1 and isRoman(token.text):
            if token.nbor(-1).text in [',','e'] and token.nbor(1).text in [',','e']:
                yield token.i, token.i+1, label
                
        elif token.i > 3 and isRoman(token.text):
            if token.nbor(-3).text.lower() == 'incisos':#<--
                yield token.i, token.i+1, label
                
        #identificar alineas
        elif token.text.lower() in variations_alinea and len(token.nbor(1).text) == 1:
            yield token.i, token.i+2, label
        
        #idenficar caput
        elif token.text.lower() == 'caput':
            yield token.i, token.i+1, label

                    

In [9]:
law_detector = FunctionAnnotator("law_detector", law_detector)

## Detecting named laws

In [10]:
file = open('./apelidos_leis.csv')
csv_reader = csv.reader(file, delimiter=';')
next(csv_reader) #skip header
names = []
for row in tqdm(csv_reader, total=1046, position=0):
    name = row[1]
    tokenized_name = [t.text for t in nlp_pt(name)]
    names.append(tokenized_name)

1080it [00:04, 268.09it/s]                                                                                                                                                                            


In [11]:
trie = Trie(names)
law_names_detector = GazetteerAnnotator('apelidos', {'LEGISLACAO':trie}, case_sensitive=False)

## Apply

In [12]:
combined = CombinedAnnotator()
combined.add_annotators(law_detector, law_names_detector)

sentences = list(combined.pipe(sentences))

# Aggregation

In [13]:
#hmm = HMM("hmm", ["LEGISLACAO","APELIDO"])

In [14]:
#hmm.fit_and_aggregate([doc])

# Visualize

In [15]:
#filter only labeled sentences
sentences_labeled = []
n_labels = 0
for s in sentences:
    if s.spans['law_detector'] or s.spans['apelidos']:
        sentences_labeled.append(s)
        
        #count
        n_labels += len(s.spans['law_detector'])
        n_labels += len(s.spans['apelidos'])
        
    
#docs_labeled = [d for d in docs if d.spans['law_detector'] or d.spans['apelidos']]

print('{} sentences labeled'.format(len(sentences_labeled)))
print('{} labels found'.format(n_labels))

2526 sentences labeled
7362 labels found


## Random Sentences

In [16]:
# random.shuffle(sentences_labeled)
for s in sentences_labeled:
    display_entities(s, ['law_detector','apelidos'])
    print('--------------------------------------')

--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


--------------------------------------


## Split train and test datasets

In [17]:
test_size_per = 0.15

In [18]:
split = int(len(sentences_labeled) * (1-test_size_per))
print('Tamanho do conjunto de treinamento: {}'.format(split))
print('Tamanho do conjunto de teste: {}'.format(len(sentences_labeled)-split))

Tamanho do conjunto de treinamento: 2147
Tamanho do conjunto de teste: 379


In [19]:
sentences_train_labeled = sentences_labeled[:split]
sentences_test_labeled = sentences_labeled[split:]

# Save

In [20]:
def see_tokenization(sentence):
    doc = nlp_pt(sentence)
    for t in doc:
        print(t)

def check_labels_skweak(sentence):
    doc = [nlp_pt(sentence)]
    sentence_labeled = list(combined.pipe(doc))[0]
    display_entities(sentence_labeled, ['law_detector','apelidos'])

In [21]:
def get_span_indexes(doc, lst_span_labels):
    start_idx = []
    middle_idx = []
    for span_label in lst_span_labels:
        spans = doc.spans[span_label]
        for s in spans:
            start_idx.append(s.start)
            rangee = list(range(s.start+1,s.end))
            middle_idx.extend(rangee)
    
    return start_idx, middle_idx

In [22]:
def save(docs, output_csv):
    writer = open(output_csv, mode='w')
    
    for doc in docs:
        start_idx, middle_idx = get_span_indexes(doc, ['law_detector', 'apelidos'])
        for token in doc:
            if (token.i in start_idx):
                ent_type = 'B-LEGISLACAO'
            elif (token.i in middle_idx):
                ent_type = 'I-LEGISLACAO'
            elif (token.text == '\n'):
                continue
            else:
                ent_type = 'O'
                
            line = '{} {}\n'.format(token.text, ent_type)
            writer.write(line)
        
        writer.write('\n')
            
    writer.close() 

In [175]:
# save(sentences_labeled, 'dataset_annotated_v3.txt')
# save(sentences_train_labeled, 'train_annotated.txt')
# save(sentences_test_labeled, 'test_annotated.txt')

**Example**

In [23]:
spans_law_detector = sentences_labeled[5].spans['law_detector']
for s in spans_law_detector:
    print(s, s.label_)

artigos 33 LEGISLACAO
Lei nº 11.343/2006 LEGISLACAO


In [25]:
text = "A hipótese do inciso III compreende a "
check_labels_skweak(text)
see_tokenization(text)

A
hipótese
do
inciso
III
compreende
a
