In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models import KeyedVectors
import nltk
import re
import json
from  pathlib import Path
from unidecode import unidecode

In [2]:
# load model
word2vec_model = KeyedVectors.load_word2vec_format("../resources/wordembeeding/glove_s50.txt", binary=False)

In [3]:
def avg_sentence_vector(words, model, num_features, index2word_set):
    # function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[word])

    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [4]:
def clear_text(text):
    stopwords = nltk.corpus.stopwords.words('portuguese')
    u = unidecode(text)
    text_cleaned = re.sub(r"[^a-zA-Z ]", "", u)
    text_list = text_cleaned.split(" ")

    cleaned_list = [w.lower() for w in text_list if w.lower() not in stopwords and w.lower()]

    return ' '.join(cleaned_list)

In [5]:
path = Path("../resources/ufrpe/qa.txt")
with open(path, 'r') as f:
    qa = []
    for line in f.readlines():
        qa.append(json.loads(line))

In [6]:
with open('../resources/documento_oficial.json') as json_file:
    documento_oficial = json.load(json_file)

In [7]:
def compare(contextos, quesito):
    maxx = 0
    text = ""
    for k,v in contextos.items():
        for kk, vv in v.items():
            for vvv in vv:
                cleanded_1 = clear_text(vvv)
                sentence_1_avg_vector = avg_sentence_vector(cleanded_1, model=word2vec_model, num_features=50, index2word_set= set(word2vec_model.index2word))
                sentence_2_avg_vector = avg_sentence_vector(quesito, model=word2vec_model, num_features=50, index2word_set= set(word2vec_model.index2word))
                sen1_sen2_similarity = cosine_similarity(sentence_1_avg_vector.reshape(1,-1), sentence_2_avg_vector.reshape(1,-1))[0][0]
                if sen1_sen2_similarity > maxx:
                    maxx = sen1_sen2_similarity
                    text = cleanded_1
    return maxx, text

In [8]:
import random
def uniqueid():
    seed = random.getrandbits(32)
    while True:
       yield seed
       seed += 1

In [10]:
dataset = []
unique_sequence = uniqueid()
# https://github.com/ThilinaRajapakse/simpletransformers#question-answering usei essa estrutura
for enum,quesito in enumerate(qa):
    print("{}%".format((enum/len(qa))*100))
    q = clear_text(quesito["pergunta"])
    similaridade, contexto = compare(documento_oficial, q)
    dataset.append(
        {"context":contexto,
            'qas' : [{
                    'id':str(next(unique_sequence)),
                    'is_impossible':True,
                    'question':quesito["pergunta"],
                    'answers':[{
                        'text':quesito['resposta'],
                        'answer_start':0
                    }]
            }]
         }
    )

0.0%
1.0526315789473684%
2.1052631578947367%
3.1578947368421053%
4.2105263157894735%
5.263157894736842%
6.315789473684211%
7.368421052631578%
8.421052631578947%
9.473684210526317%
10.526315789473683%
11.578947368421053%
12.631578947368421%
13.684210526315791%
14.736842105263156%
15.789473684210526%
16.842105263157894%
17.894736842105264%
18.947368421052634%
20.0%
21.052631578947366%
22.105263157894736%
23.157894736842106%
24.210526315789473%
25.263157894736842%
26.31578947368421%
27.368421052631582%
28.421052631578945%
29.47368421052631%
30.526315789473685%
31.57894736842105%
32.631578947368425%
33.68421052631579%
34.73684210526316%
35.78947368421053%
36.84210526315789%
37.89473684210527%
38.94736842105263%
40.0%
41.05263157894737%
42.10526315789473%
43.15789473684211%
44.21052631578947%
45.26315789473684%
46.31578947368421%
47.368421052631575%
48.421052631578945%
49.473684210526315%
50.526315789473685%
51.578947368421055%
52.63157894736842%
53.68421052631579%
54.736842105263165%
55.78

In [11]:
import orjson

with open("../resources/data_final.json", 'wb') as f:
    f.write(orjson.dumps(dataset, option=orjson.OPT_NAIVE_UTC | orjson.OPT_SERIALIZE_NUMPY))

In [17]:
dataset[3]

{'context': 'atividades pedagogicas utilizadas computo carga horaria tambem poderao ser utilizadas instrumentos avaliacao',
 'qas': [{'id': '4133842354',
   'is_impossible': True,
   'question': 'As aulas serão disponibilizadas no AVA, SIGAA ou em alguma outra plataforma?',
   'answers': [{'text': 'Recomenda-se o uso do AVA e do SIGAA, mas outras plataformas como o Google Classroom também podem ser usadas. Caso o professor tenha aulas gravadas, ele poderá disponibilizá-las nestas plataformas. Caso sejam realizados encontros síncronos, estes encontros poderão ser gravados pelo professor e disponibilizados posteriormente, na plataforma adotada.',
     'answer_start': 0}]}]}