In [1]:
tokenization_pattern = r'''al\.|[.,;:"“”'?():_`]|\w*(?:[^;"“”'?():_`\s]*\w+)+[%º°?']?|[\S]'''
#pattern = r'''([.](?=[\s]+[A-Z])|[\s\.]*[\n\r]+[\s]*(?=[A-Z]))'''
pattern = r'''((?<!\d)+[.](?=[\s]*[A-Z])|[\s\.]*[\n\r]+[\s]*(?=[A-Z]))'''

In [67]:
import re
def list_to_conllu_text(document_filename: str, conllu_list: list, phrases: list):
    value_list = []
    for s,(sentence,text) in enumerate(zip(conllu_list, phrases)):
        #text = " ".join([token[1] for token in sentence if '-' not in token[0]])
        sent_id = str(s+1)
        content = "\n".join(["\t".join(token[:]) for token in sentence])
        value_list.append("# sent_id = {}-{}\n# text = {}\n{}".format(document_filename, sent_id, text, content))
    return "\n\n".join(value_list)

def portuguese_sentenciation(text):
    #pattern = r'''([.](?=[\s]+[A-Z])|[\s\.]*[\n\r]+[\s]*(?=[A-Z]))'''
    original_values = re.split(pattern, text)
    values = ['{} {}'.format(f1,f2) for f1,f2 in zip(original_values[::2],original_values[1::2])]
    real_values = ['{}{}'.format(f1,f2) for f1,f2 in zip(original_values[::2],original_values[1::2])]
    for value,rvalue in zip(values,real_values):
        value = recursive_space_removal(value)
        rvalue = recursive_space_removal(rvalue)
        if "\n" in value or "\r" in value:
            continue
        yield value,rvalue


def recursive_space_removal(value):
    try:
        if value[0] == " ":
            value = value[1:]
            return recursive_space_removal(value)
        else:
            return value
    except IndexError:
        return value

def text_preprocessing(text):
    # TEXTBAR PREPROCESSING
    text = text.replace(" / ", "##SPACE-BAR-SPACE##")
    text = text.replace(" \ ", "##SPACE-LBAR-SPACE##")
    text = text.replace("/ ", "/")
    text = text.replace(r"\ ", "\\")
    text = text.replace(" /", "/")
    text = text.replace(r" \\", "\\")
    text = text.replace("##SPACE-BAR-SPACE##", r" / ")
    text = text.replace("##SPACE-LBAR-SPACE##", r" \ ")
    return text


In [68]:
def spans(txt, tokens):
    offset = 0
    all_spans = []
    for token in tokens:
        offset = txt.find(token, offset)
        all_spans.append((token, offset, offset + len(token)))
        offset += len(token)
    return all_spans

In [69]:
import sys
import os
import re
import datetime
import stanza
from stanza import Pipeline
from stanza.utils.conll import CoNLL
from stanza.models.common import doc
import numpy as np
import itertools
stanza.download('pt')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 3.72MB/s]
2020-06-11 00:53:11 INFO: Downloading default packages for language: pt (Portuguese)...
2020-06-11 00:53:11 INFO: File exists: C:\Users\criss\stanza_resources\pt\default.zip.
2020-06-11 00:53:14 INFO: Finished downloading models and saved to C:\Users\criss\stanza_resources.


In [70]:
#input_file='data/6-20140908-MONOGRAFIA_0_intro.txt'

In [71]:
def get_mwts(token_sentence, tokens_by_id):
    mwts = []
    for i in range(len(token_sentence)):
        token = token_sentence[i]
        if '-' in token['id']:
            ids = token['id'].split('-')
            mwts.append((token['text'] , [tokens_by_id[idx] for idx in ids]))
    return mwts

In [72]:
#input_filename = '6-20140908-MONOGRAFIA_0_intro'
input_filename = '10-20150122-MONOGRAFIA_0_intro'
#input_filename = '0-20150121-TESEMSC_0_intro'
input_path='data/{}.txt'.format(input_filename)

In [73]:
def get_stanza_2_tagged(stanza_token_sentence, tagged_sentence):
    filtered_stanza_token_sentence = []
    skip_tokens_id = []
    for token in stanza_token_sentence:
        if token['id'] in skip_tokens_id:
            continue
        if '-' in token['id']:
            skip_tokens_id =[idx for idx in token['id'].split('-')]
        filtered_stanza_token_sentence.append(token)

    for i,token in enumerate(filtered_stanza_token_sentence):
        filtered_stanza_token_sentence[i]['start']=int(token['misc'].split('|')[0][11:])

    for i,token in enumerate(tagged_sentence):
        tagged_sentence[i]['start']=int(token['misc'].split('|')[0][11:])

    stanza_2_tagged = {}
    stanza_2_tagged['0']='0'
    for ftoken in filtered_stanza_token_sentence:
        compare=[]
        for i,ttoken in enumerate(tagged_sentence):
            ftext = set(ftoken['text'])
            ttext =set(ttoken['text'])
            if min(len(ftext-ttext),len(ttext-ftext))==0:
                compare.append((i, abs(ftoken['start'] - ttoken['start'])))
        if len(compare)==0:
            continue
            
        idx = min(compare, key=lambda x:x[1])[0]
        for f_id in ftoken['id'].split('-'):
            stanza_2_tagged[f_id] = tagged_sentence[idx]['id']
    
    return stanza_2_tagged

In [74]:
input_filename

'10-20150122-MONOGRAFIA_0_intro'

In [79]:
def merge_stanza_token_sentences(stanza_token_sentences):
    offset=0
    for i in range(len(stanza_token_sentences)):
        for j in range(len(stanza_token_sentences[i])):
            if 'head' in stanza_token_sentences[i][j]:
                stanza_token_sentences[i][j]['head']=str(int(stanza_token_sentences[i][j]['head'])+offset)

            if '-' in stanza_token_sentences[i][j]['id']:
                ids = stanza_token_sentences[i][j]['id'].split('-')
                stanza_token_sentences[i][j]['id'] = '-'.join([str(int(id)+offset) for id in ids])
            else:
                stanza_token_sentences[i][j]['id'] = str(int(stanza_token_sentences[i][j]['id']) + offset)

        offset=int(stanza_token_sentences[i][-1]['id'])
    stanza_token_sentences = list(itertools.chain.from_iterable(stanza_token_sentences))
    return stanza_token_sentences

In [80]:
def conllu_process_file(input_filename):
    input_path='data/{}.txt'.format(input_filename)
    infile = open(input_path, "rb")
    text = infile.read().decode('utf-8')
    preprocessed_text = text_preprocessing(text)
    #sentences = portuguese_sentenciation(preprocessed_text)
    tok_nlp = Pipeline(lang='pt', tokenize_pretokenized=False)#processors='tokenize,mwt', 
    tag_nlp = Pipeline(lang='pt', tokenize_pretokenized=True)#processors='tokenize,pos,lemma,depparse',

    sentences = portuguese_sentenciation(preprocessed_text)
    new_document=[]
    raw_texts=[]
    phrases = []
    for phrase,rphrase in sentences:
        if not phrase=='':
            phrases.append(rphrase)
            stanza_sentence = tok_nlp(phrase)
            stanza_token_sentences = stanza_sentence.to_dict()
            stanza_token_sentence = merge_stanza_token_sentences(stanza_token_sentences)
            tokens_by_id = {token['id']:token for token in stanza_token_sentence}
            mwt_cases = get_mwts(stanza_token_sentence, tokens_by_id)

            sentence = re.findall(tokenization_pattern, phrase)
            tagged_sentences = tag_nlp([sentence])     
            tagged_sentences = tagged_sentences.to_dict()
            tagged_sentence = tagged_sentences[0]
            assert len(tagged_sentence)==len(sentence)

            sentence = spans(rphrase,sentence)

            stanza_2_tagged = get_stanza_2_tagged(stanza_token_sentence, tagged_sentence)

            max_str = sentence[-1][-1]
            raw_text =" "*max_str
            sent = []
            ant2new = {}
            ant2new['0']=0
            token_id=1
            for (token,start_char,end_char),tagged_token in zip(sentence,tagged_sentence):
                idx=tagged_token['id']
                ant2new[idx]=token_id
                tagged_token.pop('key',None)
                tagged_token.pop('misc',None)
                tagged_token.pop('id',None)
                tagged_token.pop('text',None)

                if not len(mwt_cases)==0:
                    mwt_token_key = mwt_cases[0][0]
                    mwt_tokens = mwt_cases[0][1]

                    if token == mwt_token_key:
                        token_id0 = token_id
                        token_id = token_id + len(mwt_tokens)-1
                        token_information = {doc.ID: '{}-{}'.format(token_id0, token_id), doc.TEXT: token, \
                                     doc.MISC: f'start_char={start_char}|end_char={end_char}'}
                        token_information = {**token_information, **tagged_token}
                        sent.append(token_information)
                        for i,c in enumerate(mwt_tokens):
                            try:
                                c.pop('id')
                            except: 
                                print(c)

                            c['head'] = stanza_2_tagged[str(c['head'])]
                            token_information = {doc.ID: str(token_id0 + i)}
                            sent.append({**token_information, **c})
                        mwt_cases = mwt_cases[1:]
                    else:
                        token_information = {doc.ID: str(token_id), doc.TEXT: token, \
                                 doc.MISC: f'start_char={start_char}|end_char={end_char}'}
                        token_information = {**token_information, **tagged_token}
                        sent.append(token_information)

                else:
                    token_information = {doc.ID: str(token_id), doc.TEXT: token, \
                                 doc.MISC: f'start_char={start_char}|end_char={end_char}'}
                    token_information = {**token_information, **tagged_token}
                    sent.append(token_information)

                token_id+=1

                raw_text = raw_text[:start_char] + token + raw_text[end_char:]
            raw_texts.append(raw_text)
            for i in range(len(sent)):
                new_head = ant2new[str(sent[i]['head'])]
                sent[i]['head'] = int(new_head)

            new_document.append(sent)
    raw_texts='\n\n'.join(raw_texts)
    phrases_str = '\n\n'.join(phrases)
    new_document=doc.Document(new_document, phrases_str)
    segmented_text = new_document.to_dict()
    conllu = CoNLL.convert_dict(segmented_text)
    output = list_to_conllu_text(input_filename, conllu, phrases)  
    return output

In [81]:
output = conllu_process_file(input_filename=input_filename)

2020-06-11 00:55:22 INFO: Loading these models for language: pt (Portuguese):
| Processor | Package |
-----------------------
| tokenize  | bosque  |
| mwt       | bosque  |
| pos       | bosque  |
| lemma     | bosque  |
| depparse  | bosque  |

2020-06-11 00:55:22 INFO: Use device: cpu
2020-06-11 00:55:22 INFO: Loading: tokenize
2020-06-11 00:55:22 INFO: Loading: mwt
2020-06-11 00:55:22 INFO: Loading: pos
2020-06-11 00:55:23 INFO: Loading: lemma
2020-06-11 00:55:23 INFO: Loading: depparse
2020-06-11 00:55:24 INFO: Done loading processors!
2020-06-11 00:55:24 INFO: Loading these models for language: pt (Portuguese):
| Processor | Package |
-----------------------
| tokenize  | bosque  |
| mwt       | bosque  |
| pos       | bosque  |
| lemma     | bosque  |
| depparse  | bosque  |

2020-06-11 00:55:24 INFO: Use device: cpu
2020-06-11 00:55:24 INFO: Loading: tokenize
2020-06-11 00:55:24 INFO: Loading: mwt
2020-06-11 00:55:24 INFO: Loading: pos
2020-06-11 00:55:25 INFO: Loading: lemma
2

In [82]:
print(output)

# sent_id = 10-20150122-MONOGRAFIA_0_intro-1
# text = ﻿1. Introdução O princípio de funcionamento remonta à 1801, quando um gerador com célula a combustível usando uma pilha de zinco e oxigênio produzia o zincato de sódio (Sir Humphrey, Inglaterra).
1	﻿1	﻿1	PROPN	_	Gender=Masc|Number=Sing	8	nsubj	_	start_char=0|end_char=2
2	.	.	PUNCT	_	_	1	punct	_	start_char=2|end_char=3
3	Introdução	introdução	NOUN	_	Gender=Fem|Number=Sing	1	appos	_	start_char=4|end_char=14
4	O	o	DET	_	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	5	det	_	start_char=15|end_char=16
5	princípio	princípio	NOUN	_	Gender=Masc|Number=Sing	8	nsubj	_	start_char=17|end_char=26
6	de	de	ADP	_	_	7	case	_	start_char=27|end_char=29
7	funcionamento	funcionamento	NOUN	_	Gender=Masc|Number=Sing	5	nmod	_	start_char=30|end_char=43
8	remonta	remontar	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	start_char=44|end_char=51
9-10	à	_	_	_	_	_	_	_	start_char=52|end_char=53
9	a	ela	PRON	_	Case=Acc|Gender=Fem|Number=S