In [1]:
import jsonl_io
import re
import json
from nltk import word_tokenize
import os

In [2]:
# Ensure both output directories exist before any file is written;
# exist_ok=True makes this cell safe to re-run.
for output_dir in ('../dataset/metadata/', '../dataset/all/'):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
def post_process(doc):
    """Normalize line breaks in a document's text without moving label offsets.

    Three clean-ups are applied to ``doc['data']``:

    1. newlines inside every labelled span are replaced by spaces, so an
       entity never straddles two lines;
    2. a roman-numeral marker alone at the end of a line (``"II.\\n"``) is
       joined to the following line;
    3. a numbered-item marker alone at the end of a line (``"1.\\n"``,
       ``"1.2.\\n"``) is joined to the following line.

    Every substitution swaps one newline for one space, so the text length is
    unchanged and the ``(start, end)`` offsets in ``doc['label']`` stay valid.

    Args:
        doc: mapping with ``'data'`` (the text) and ``'label'`` (an iterable of
            ``(start, end, label)`` triples indexing into ``'data'``).

    Returns:
        The same ``doc`` object, with ``doc['data']`` replaced by the
        normalized text (the input is modified in place).
    """
    def multiline_label(doc):
        """Return the text with newlines inside labelled spans turned into spaces."""
        text = doc['data']
        chars = list(text)
        for start, end, _ in doc['label']:
            # Same-length in-place replacement: text outside the spans,
            # including everything after the last label, is kept untouched,
            # and overlapping spans cannot duplicate characters.
            chars[start:end] = text[start:end].replace('\n', ' ')
        return ''.join(chars)

    # Raw strings so that \s and \d reach the regex engine instead of being
    # treated as (invalid) Python string escapes.
    roman_numerals = re.compile(r"(^|\s)([IVX]+\.)\n", re.IGNORECASE)
    number = re.compile(r"(^|\s)(\d\.(?:\d\.*)*)\n")

    # Fix labels that span several lines.
    text = multiline_label(doc)
    # Fix roman-numeral markers split from their line.
    text = roman_numerals.sub(r"\1\2 ", text)
    # Fix numbered-item markers split from their line.
    text = number.sub(r"\1\2 ", text)

    doc['data'] = text

    return doc

In [4]:
# Load the annotated documents through the project-local jsonl_io helper.
# Later cells iterate over the result and read the keys 'n_sei', 'origem',
# 'tipo_documento', 'data' and 'label' from each item.
# NOTE(review): presumably one dict per JSONL line -- confirm against jsonl_io.read_jsonl.
annotated_data = jsonl_io.read_jsonl('../dataset/annotated.jsonl')

In [5]:
def offset_to_conll(text, labels, tokenizer=None):
    """Convert character-offset annotations into CoNLL-style IOB text.

    Each labelled span is tokenized on its own and tagged ``B-<label>`` (first
    token) / ``I-<label>`` (remaining tokens). The span is swapped for a
    placeholder word in the text, the masked text is tokenized line by line,
    and each placeholder is expanded back into its tagged entity tokens. All
    other tokens are tagged ``O``.

    Args:
        text: the document text.
        labels: iterable of ``(start, end, label)`` triples indexing into
            ``text``. They are consumed in the given order, so they are
            expected to be sorted by ``start`` and non-overlapping.
        tokenizer: optional callable ``str -> list[str]``. Defaults to NLTK's
            ``word_tokenize(..., language='portuguese', preserve_line=True)``.

    Returns:
        A string with one ``"<token> <tag>"`` row per token and an empty row
        after every line of the input text.
    """
    if tokenizer is None:
        def tokenizer(chunk):
            return word_tokenize(chunk, language='portuguese', preserve_line=True)

    # The placeholder must not occur anywhere in the text, otherwise a real
    # word would be mistaken for an entity slot. Extending it with letters
    # keeps it a plain word for the tokenizer.
    placeholder = 'ENT'
    while placeholder in text:
        placeholder += 'X'

    entity_rows = []
    masked_parts = []
    cursor = 0
    for start, end, label in labels:
        masked_parts.append(text[cursor:start])
        masked_parts.append(placeholder)
        cursor = end
        entity_rows.append([
            f"{tok} {'B' if idx == 0 else 'I'}-{label}"
            for idx, tok in enumerate(tokenizer(text[start:end]))
        ])
    masked_parts.append(text[cursor:])
    masked_text = ''.join(masked_parts)

    rows = []
    entity_idx = 0
    for line in masked_text.splitlines():
        for token in tokenizer(line):
            rest = token
            # The tokenizer may leave the placeholder glued to neighbouring
            # characters (e.g. "ENT." in the middle of a line); split around
            # it so the entity is still emitted and later entities keep
            # their alignment.
            while placeholder in rest:
                before, _, rest = rest.partition(placeholder)
                if before:
                    rows.append(f'{before} O')
                # An entity that produced no tokens contributes no rows
                # (and, in particular, no spurious blank separator row).
                rows.extend(entity_rows[entity_idx])
                entity_idx += 1
            if rest:
                rows.append(f'{rest} O')
        # Empty row marks the end of the source line.
        rows.append('')

    return ''.join(row + '\n' for row in rows)

In [6]:
from tqdm import tqdm

In [7]:
# Export every annotated document as a metadata JSON file plus a CoNLL file,
# both named after the document's 'n_sei' value.
for doc in tqdm(annotated_data):
    n_sei = doc['n_sei']
    metadata = {
        'numero_sei': n_sei,
        'origem': doc['origem'],
        'tipo_documento': doc['tipo_documento'],
    }
    # Explicit UTF-8 so the output does not depend on the platform's default
    # locale encoding (the documents contain non-ASCII Portuguese text).
    with open(f'../dataset/metadata/{n_sei}_meta.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f)
    # post_process only swaps newlines for spaces, so the label offsets
    # still index correctly into the fixed text.
    fixed = post_process(doc)
    text = fixed['data']
    # offset_to_conll consumes the labels in order of appearance.
    labels = sorted(fixed['label'])
    conll = offset_to_conll(text, labels)
    with open(f'../dataset/all/{n_sei}.conll', 'w', encoding='utf-8') as f:
        f.write(conll)

100%|██████████| 100/100 [00:11<00:00,  8.80it/s]
