Com o modelo já treinado, utiliza-se essa matriz para buscar as similaridades com os textos das buscas, que também passarão por uma etapa de pré-processamento.

In [4]:
import numpy as np
import pandas as pd
import pickle
import gensim
import spacy
from gensim.similarities.docsim import MatrixSimilarity
from nltk.corpus import stopwords
from gensim import corpora

# * spaCy pipeline used by preprocess() to re-tokenize and lemmatize the text
nlp = spacy.load('pt_core_news_lg')

# * roman-numeral-like tokens ("i" .. "xi") that are treated as noise, not as real words
noises_list = ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi"]

# * NLTK portuguese stopwords plus the custom extras, normalized with the same
# * simple_preprocess settings that preprocess() applies to the searched texts
stopWords_list = gensim.utils.simple_preprocess(
    " ".join(stopwords.words("portuguese") + ['referente', 'seguinte']),
    deacc=True,
    min_len=1,
    max_len=40,
)

# * manual overrides applied to the final lemmas (lemma -> replacement)
intervention_dict = dict(
    campar="campo",
    seriar="serie",
    eletromagnetico="eletromagnetismo",
)

def preprocess(text):
    """Turn a raw text into the list of lemmas used by the search models.

    Pipeline, in order:
      1. ``gensim.utils.simple_preprocess`` with ``deacc=True`` and token
         lengths between 1 and 40 (tokenization / normalization).
      2. Tokens listed in ``noises_list`` are dropped.
      3. The remaining tokens are re-joined and run through the spaCy
         pipeline ``nlp``.
      4. Tokens whose text is in ``stopWords_list`` are dropped and the
         ``lemma_`` of every other token is taken.
      5. Lemmas present in ``intervention_dict`` are replaced by their
         manual override.

    Args:
        text: raw text to preprocess.

    Returns:
        list[str]: the final lemmas, in the order the tokens appear.
    """
    # * nlp, noises_list, stopWords_list and intervention_dict are module-level
    # * objects that are only read here, so no `global` declaration is required

    # * normalize and tokenize, then remove the remaining noise tokens
    text_list = gensim.utils.simple_preprocess(text, deacc=True, min_len=1, max_len=40)
    text_str = " ".join(word for word in text_list if word not in noises_list)

    # * spaCy needs a string: it re-tokenizes it and assigns a lemma to each token
    text_doc = nlp(text_str)

    # * stopword removal, lemmatization and manual lemma overrides in a single pass
    return [
        intervention_dict.get(token.lemma_, token.lemma_)
        for token in text_doc
        if token.text not in stopWords_list
    ]

# * corpora previously serialized to disk, read back through gensim's MmCorpus
tfidf_corpus = gensim.corpora.MmCorpus('tfidf_model_mm')
lsi_corpus = gensim.corpora.MmCorpus('lsi_model_mm')

# * NOTE(review): pickle.load can execute arbitrary code contained in the file;
# * only load artefacts produced by the training step of this project
# * dictionary used to convert token lists into bag-of-words vectors (see doc2bow below)
with open('id2wordDict.txt', 'rb') as f:
    id2wordDict = pickle.load(f)

# * trained TF-IDF model, applied to bag-of-words queries
with open("tfidf_model.txt", 'rb') as f:
    tfidf_model = pickle.load(f)

# * trained LSI model, applied to TF-IDF queries
with open("lsi_model.txt", 'rb') as f:
    lsi_model = pickle.load(f)

# * pickled DataFrame; presumably the CREA reference texts — TODO confirm
with open('crea_df_pickle.txt', 'rb') as f:
    crea_df = pickle.load(f)

# * subjects table; its 'codigo' and 'nome' columns are read when building the ranking
subjects_df = pd.read_json("MechatronicsEngeneeringSubjects.json")

# * rows of crea_df whose 'SUB-SETOR' is 'Construção de Edificações'
subsection1_df = crea_df[ crea_df['SUB-SETOR'] == 'Construção de Edificações']

# * similarity index over the LSI corpus; num_best=8 limits each query to the 8 best matches
cosineSimilarity = MatrixSimilarity(lsi_corpus, num_features = lsi_corpus.num_terms, num_best=8)

def search_similarity_query(search_document):
    """Rank the indexed subjects by similarity to a free-text query.

    The query goes through ``preprocess``, is converted to a bag of words with
    ``id2wordDict``, weighted by ``tfidf_model`` and projected by ``lsi_model``;
    the resulting vector is then compared against ``cosineSimilarity``.

    Args:
        search_document: raw text of the query.

    Returns:
        A ``pandas.DataFrame`` with the columns 'Relevancia' (similarity times
        100, rounded to 6 decimals), 'Código da Matéria' and 'Nome da matéria',
        ordered from most to least similar; or a message string when the index
        returns no match at all.
    """
    # * text -> lemmas -> bag of words -> tf-idf -> LSI
    lemmas = preprocess(search_document)
    lsi_vector = lsi_model[tfidf_model[id2wordDict.doc2bow(lemmas)]]

    # * pairs of (id of the compared document, similarity), best match first
    matches = cosineSimilarity[lsi_vector]
    matches.sort(key=lambda pair: pair[1], reverse=True)

    # * nothing similar enough was found: answer with the message instead of a table
    if not matches:
        return "O texto procurado não há similaridade com nenhuma das demais"

    # * the document id is used to look the subject up in subjects_df
    rows = [
        {
            'Relevancia': round((score * 100), 6),
            'Código da Matéria': subjects_df['codigo'][doc_id],
            'Nome da matéria': subjects_df['nome'][doc_id],
        }
        for doc_id, score in matches
    ]
    return pd.DataFrame(rows, columns=['Relevancia', 'Código da Matéria', 'Nome da matéria'])

['serie']


# Single Queries

In [6]:
# * single query: print the preprocessed tokens, then evaluate the ranking as the
# * last expression of the cell so the notebook displays it
query_text = "series"
print("preprocessed:", preprocess(query_text))
search_similarity_query(query_text)

['serie']


Unnamed: 0,Relevancia,Código da Matéria,Nome da matéria
0,77.58075,MAT0059,METODOS MATEMÁTICOS DA FÍSICA 1
1,61.28785,MAT0026,CÁLCULO 2
2,43.210536,MAT0028,VARIAVEL COMPLEXA 1
3,27.108598,ENE0038,LABORATÓRIO DE CIRCUITOS ELÉTRICOS 2
4,24.388134,ENE0037,CIRCUITOS ELÉTRICOS 2
5,12.979302,IFD0181,FISICA 3 EXPERIMENTAL
6,8.841469,ENE0047,CONVERSÃO ELETROMECÂNICA DE ENERGIA
7,7.871081,ENE0042,CIRCUITOS ELÉTRICOS 1


# Multi Queries

In [7]:
# * not the last expression of the cell, so the notebook does not display it
subsection1_df.head(15)

# * run the search for the first 15 texts of the subsection; slicing with
# * iloc[:15, 8] simply yields fewer items when the frame has fewer than 15 rows,
# * instead of raising IndexError as positional access by row number would
# * NOTE(review): column position 8 is assumed to hold the searched text — confirm
for text in subsection1_df.iloc[:15, 8]:
    print(f'texto buscado: {text}')
    print(f'texto buscado preprocessado: {preprocess(text)}')
    print( search_similarity_query(text), end='\n\n')

texto buscado: Cálculo diferencial e integral 
texto buscado preprocessado: ['calcular', 'diferencial', 'integral']
   Relevancia Código da Matéria        Nome da matéria
0   94.680548           MAT0025              CÁLCULO 1
1   29.648611           MAT0027              CÁLCULO 3
2   29.401502           MAT0028    VARIAVEL COMPLEXA 1
3   21.390662           ENE0045             ELETRÔNICA
4   15.841775           MAT0026              CÁLCULO 2
5   13.828565           ENE0037  CIRCUITOS ELÉTRICOS 2
6    9.978841           IFD0179               FISICA 3
7    9.956482           IFD0177  FISICA 2 EXPERIMENTAL

texto buscado: Cálculo numérico 
texto buscado preprocessado: ['calcular', 'numerico']
   Relevancia Código da Matéria                  Nome da matéria
0   83.769476           ENM0027  TECNOLOGIAS DE COMANDO NUMERICO
1   38.392425           MAT0053                 CALCULO NUMERICO
2   23.952121           MAT0027                        CÁLCULO 3
3   20.602839           MAT0025          