In [5]:
import spacy
import json

# **POS and tokens - SPACY**

**"Quais as bolsas de iniciação científica da UFLA?"**

In [6]:
def pre_process(input_question):
    input_dict = json.loads(input_question)
    question = input_dict['question']

    nlp = spacy.load('pt_core_news_sm')
    doc = nlp(question)
    dic = {}
    tokens = []
    pos = []
    for token in doc:
        tokens.append(token.text)
        pos.append(token.pos_)

    dic["question"]=question
    dic['tokens']=tokens
    dic['POS']=pos
    
    output = json.dumps(dic, ensure_ascii=False)
    return output 

print(pre_process('{"question": "Quais as bolsas de iniciação científica da UFLA?"}'))

{"question": "Quais as bolsas de iniciação científica da UFLA?", "tokens": ["Quais", "as", "bolsas", "de", "iniciação", "científica", "da", "UFLA", "?"], "POS": ["SCONJ", "DET", "NOUN", "ADP", "NOUN", "ADJ", "ADP", "PROPN", "PUNCT"]}


# **POS and tokens - NLTK**

In [7]:
import nltk
import json
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [8]:
def pre_process(input_question):
    input_dict = json.loads(input_question)
    question = input_dict['question']
    
    tagged_sents = nltk.corpus.mac_morpho.tagged_sents()

    tokens = nltk.word_tokenize(question)
    unigram_tagger = nltk.tag.UnigramTagger(tagged_sents)
    parsings = unigram_tagger.tag(tokens)

    dic = {}
    tokens = []
    pos = []

    for parsing in parsings:
        tokens.append(parsing[0])
        pos.append(parsing[1])

    dic["question"]=question
    dic['tokens']=tokens
    dic['POS']=pos

    output = json.dumps(dic, ensure_ascii=False)
    return output 

print(pre_process('{"question": "Quais as bolsas de iniciação científica da UFLA?"}'))

{"question": "Quais as bolsas de iniciação científica da UFLA?", "tokens": ["Quais", "as", "bolsas", "de", "iniciação", "científica", "da", "UFLA", "?"], "POS": ["PROSUB", "ART", "N", "PREP", "N", "ADJ", "NPROP", null, "?"]}


# **POS and tokens - NLPNET**

In [9]:
import nlpnet
import json

In [10]:
def pre_process(input_question):
    input_dict = json.loads(input_question)
    question = input_dict['question']

    tagger = nlpnet.POSTagger('pos-pt', language='pt')

    sets = tagger.tag(question)

    dic = {}
    tokens = []
    pos = []

    for parsings in sets:
        for parsing in parsings:
            tokens.append(parsing[0])
            pos.append(parsing[1])

    dic["question"]=question
    dic['tokens']=tokens
    dic['POS']=pos

    output = json.dumps(dic, ensure_ascii=False)
    return output 

print(pre_process('{"question": "Quais as bolsas de iniciação científica da UFLA?"}'))

{"question": "Quais as bolsas de iniciação científica da UFLA?", "tokens": ["Quais", "as", "bolsas", "de", "iniciação", "científica", "da", "UFLA", "?"], "POS": ["PRO-KS", "ART", "N", "PREP", "N", "ADJ", "PREP+ART", "NPROP", "PU"]}
