In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from gensim.models import KeyedVectors
import nltk
import re
import json
from  pathlib import Path
from unidecode import unidecode

In [3]:
# load model
word2vec_model = KeyedVectors.load_word2vec_format("../resources/wordembeeding/glove_s50.txt", binary=False)

In [4]:
def avg_sentence_vector(words, model, num_features, index2word_set):
    # function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[word])

    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [5]:
def clear_text(text):
    stopwords = nltk.corpus.stopwords.words('portuguese')
    u = unidecode(text)
    text_cleaned = re.sub(r"[^a-zA-Z ]", "", u)
    text_list = text_cleaned.split(" ")

    cleaned_list = [w.lower() for w in text_list if w.lower() not in stopwords and w.lower()]

    return ' '.join(cleaned_list)

In [6]:
path = Path("../resources/ufrpe/qa.txt")
with open(path, 'r') as f:
    qa = []
    for line in f.readlines():
        qa.append(json.loads(line))

In [7]:
with open('../resources/documento_oficial.json') as json_file:
    documento_oficial = json.load(json_file)

In [8]:
def compare(contextos, quesito):
    maxx = 0
    text = ""
    for k,v in contextos.items():
        for kk, vv in v.items():
            for vvv in vv:
                cleanded_1 = clear_text(vvv)
                sentence_1_avg_vector = avg_sentence_vector(cleanded_1, model=word2vec_model, num_features=50, index2word_set= set(word2vec_model.index2word))
                sentence_2_avg_vector = avg_sentence_vector(quesito, model=word2vec_model, num_features=50, index2word_set= set(word2vec_model.index2word))
                sen1_sen2_similarity = cosine_similarity(sentence_1_avg_vector.reshape(1,-1), sentence_2_avg_vector.reshape(1,-1))[0][0]
                # print("Sentenca 1 {} - Sentenca 2 {} - similaridade {} - Maior ? {}".format(cleanded_1, text, sen1_sen2_similarity, sen1_sen2_similarity > maxx))
                if sen1_sen2_similarity > maxx:
                    maxx = sen1_sen2_similarity
                    text = cleanded_1


    return maxx, text

In [None]:
dataset = []
for enum,quesito in enumerate(qa):
    print("{}%".format((enum/len(qa))*100))
    q = clear_text(quesito["pergunta"])
    similaridade, text = compare(documento_oficial, quesito)
    quesito['similaridade'] =  similaridade
    quesito['contexto'] = text
    dataset.append(quesito)


0.0%
1.0526315789473684%
2.1052631578947367%
3.1578947368421053%
4.2105263157894735%
5.263157894736842%
6.315789473684211%
7.368421052631578%
8.421052631578947%
9.473684210526317%
10.526315789473683%
11.578947368421053%
12.631578947368421%
13.684210526315791%
14.736842105263156%
15.789473684210526%
16.842105263157894%
17.894736842105264%
18.947368421052634%
20.0%
21.052631578947366%
22.105263157894736%
23.157894736842106%
24.210526315789473%
25.263157894736842%
26.31578947368421%
27.368421052631582%
28.421052631578945%
29.47368421052631%
30.526315789473685%
31.57894736842105%
32.631578947368425%
33.68421052631579%
34.73684210526316%
35.78947368421053%
36.84210526315789%
37.89473684210527%
38.94736842105263%
40.0%
41.05263157894737%
42.10526315789473%
43.15789473684211%
44.21052631578947%
45.26315789473684%
46.31578947368421%
47.368421052631575%
48.421052631578945%
49.473684210526315%
50.526315789473685%
51.578947368421055%
52.63157894736842%
53.68421052631579%
54.736842105263165%
55.78

In [27]:
import orjson

with open("../resources/data_final.json", 'wb') as f:
    f.write(orjson.dumps(dataset, option=orjson.OPT_NAIVE_UTC | orjson.OPT_SERIALIZE_NUMPY))

In [30]:
with open("../resources/data_final.json", "rb") as f:
    dataset = orjson.loads(f.read())

[{'pergunta': 'Quando começam as aulas do PLE?',
  'resposta': 'O primeiro PLE (2020.3) iniciará no dia 17 de agosto. Confira o Calendário Acadêmico com todas as datas ',
  'link': 'http://www.ufrpe.br/br/content/quando-come%C3%A7am-aulas-do-ple',
  'similaridade': 0.64995855,
  'contexto': 'capitulo ix orientacoes pesquisa extensao cursos degraduacao uaeadtecart processos orientacao atividades pesquisa eextensao serao realizados modo remoto apoio plataformas tecnologias digitais bemcomo auxilio ferramentas ava unidade academica educacao distancia etecnologia uaeadtecart docentes orientadores projetos pesquisa pibicpic eoutros ensino eou extensao sonus bext poderao criar salas virtuais orientacoes noava uaeadtec visando registro atividades orientacao apoiar discentes nasacoes propostas'},
 {'pergunta': 'Nem o SIGAA nem o AVA podem transmitir as aulas online. Temos na UFRPE uma plataforma para isso?',
  'resposta': 'Os emails da UFRPE estão sendo migrados para o Google. Com isso, o prof