Importando as matérias do curso de Mecatrônica, processando-as e adicionando uma coluna de "Documentos" no DataFrame das matérias:

Esse dataframe será salvo no arquivo: "documents_df.txt" no formato binário do Pickle.

In [1]:
import numpy as np
import pandas as pd
import pickle
import gensim
import spacy

from nltk.corpus import stopwords

# * spaCy Portuguese pipeline, loaded once at module level and used by preprocess()
nlp = spacy.load('pt_core_news_lg')

# * custom tokens (roman numerals) that do not represent real words
noises_list = ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi"]

# * NLTK Portuguese stopwords plus custom additions, passed through the same
# * gensim normalization (deacc=True, token length 1..40) that preprocess()
# * applies to the documents, so both sides are compared in the same format
stopWords_list = gensim.utils.simple_preprocess(
    " ".join(stopwords.words("portuguese") + ['referente', 'seguinte']),
    deacc=True,
    min_len=1,
    max_len=40,
)

# * manual intervention: lemma produced by the pipeline -> lemma to use instead
intervention_dict = dict(
    campar="campo",
    seriar="serie",
    eletromagnetico="eletromagnetismo",
)

def preprocess(text):
    """Convert raw text into a list of normalized lemmas.

    Pipeline, in order:
      1. ``gensim.utils.simple_preprocess`` with ``deacc=True`` tokenizes the
         text and strips accent marks (token length bounded to 1..40).
      2. Tokens listed in the module-level ``noises_list`` are dropped.
      3. The remaining tokens are re-joined and run through the module-level
         spaCy pipeline ``nlp``.
      4. Tokens whose text is in ``stopWords_list`` are dropped and the rest
         are replaced by their lemma.
      5. Lemmas that appear as keys in ``intervention_dict`` are replaced by
         the mapped value.

    Args:
        text: the raw string to process.

    Returns:
        A list of lemma strings.
    """
    # * step 1: gensim normalization
    raw_tokens = gensim.utils.simple_preprocess(text, deacc=True, min_len=1, max_len=40)

    # * step 2: discard noise tokens before handing the text to spaCy
    kept_tokens = []
    for word in raw_tokens:
        if word in noises_list:
            continue
        kept_tokens.append(word)

    # * step 3: spaCy works on a single string, so recombine first
    doc = nlp(" ".join(kept_tokens))

    # * steps 4 and 5: stopword filter on the surface form, then lemma with
    # * manual override (lemmas without an override are kept unchanged)
    return [
        intervention_dict.get(token.lemma_, token.lemma_)
        for token in doc
        if token.text not in stopWords_list
    ]

# * loading the subjects table (one row per subject)
subjects_df = pd.read_json("MechatronicsEngeneeringSubjects.json")

# * one preprocessed document (list of lemmas) per subject, in row order
documents_list = []

for _, row in subjects_df.iterrows():

    # * combining name, syllabus and content to create the subject document
    text = row["nome"] + ' ' + row["ementa"] + ' ' + row["conteudo"]

    # * preprocessing
    documents_list.append(preprocess(text))

# * the series is built on subjects_df's own index: pd.concat(axis=1) aligns
# * rows by index label, so a default 0..n-1 index here would misalign the
# * documents (and add NaN rows) whenever subjects_df is not indexed 0..n-1
documents_series = pd.Series(documents_list, index=subjects_df.index, name="documento")

documents_df = pd.concat([subjects_df, documents_series], axis=1)

# * persisting the dataframe in pickle binary format (despite the .txt suffix)
with open('documents_df.txt', 'wb') as f:
    pickle.dump(documents_df, f)

# * crea topics
# * reading the spreadsheet: skip the first 4 rows, keep the first 9 columns,
# * and forward-fill so each empty cell takes the last non-empty value above it
# * (.ffill() replaces the deprecated fillna(method="ffill"))
crea_df = pd.read_excel("Matriz_do_Conhecimento.xls", skiprows=4).iloc[:, :9].ffill()

# * cells holding these header labels are turned into missing values
# * (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
crea_df = crea_df.replace({"TÓPICOS": np.nan, "Nº DE ORDEM DOS TÓPICOS": np.nan})

# * persisting the dataframe in pickle binary format (despite the .txt suffix)
with open('crea_df_pickle.txt', 'wb') as f:
    pickle.dump(crea_df, f)

