## Importando Libs

In [None]:
# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import re
import PyPDF2
from collections import OrderedDict
import numpy as np
import spacy
import stanza
from spacy.lang.pt.stop_words import STOP_WORDS
import os
import json
import nltk
import requests

## Criando a TextRank

In [None]:
#TextRank4Keyword Reference: https://gist.github.com/BrambleXu/3d47bbdbd1ee4e6fc695b0ddb88cbf99
# Two pipelines are created once at module level and shared by every
# TextRank4Keyword instance:
#   nlp  - spaCy Portuguese model, used for stop-word flags, sentence splitting
#          and POS filtering.
#   nlp2 - stanza Portuguese pipeline, used only for lemmatisation.
nlp = spacy.load('pt_core_news_lg')
nlp2 = stanza.Pipeline(lang='pt', processors='tokenize,mwt,pos,lemma') 

class TextRank4Keyword():
    """Extract keywords from text with a TextRank-style ranking.

    Typical use: call :meth:`analyze` on a text, then :meth:`get_keywords`
    to read the best-ranked words. The class relies on the module-level
    ``nlp`` (spaCy) and ``nlp2`` (stanza) pipelines.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # save keywords and its weight

    def set_stopwords(self, stopwords):
        """Mark spaCy's Portuguese stop words plus ``stopwords`` as stop words.

        The flag is set on the vocabulary of the shared module-level ``nlp``
        pipeline, so it stays in effect for later calls as well.

        Args:
            stopwords: iterable of extra stop words; ``None`` or an empty
                iterable adds nothing beyond the built-in list.
        """
        extra = set(stopwords) if stopwords else set()
        for word in STOP_WORDS.union(extra):
            nlp.vocab[word].is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Keep, per sentence, only the non-stop words whose POS is in candidate_pos.

        Args:
            doc: parsed document exposing ``sents``; every token must expose
                ``text``, ``pos_`` and ``is_stop``.
            candidate_pos: container of POS tags to keep.
            lower: when true, the kept words are lower-cased.

        Returns:
            A list with one list of words per sentence (possibly empty lists).
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                # Store words only with candidate POS tag
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index, in order of first appearance.

        Returns:
            An ``OrderedDict`` ``{word: index}`` with indices ``0..n-1``.
        """
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    # The next free index is the current vocabulary size.
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the unique co-occurrence pairs found inside a sliding window.

        Each word is paired with the following ``window_size - 1`` words of
        the same sentence. Pairs are ordered ``(earlier, later)`` and are
        returned in order of first occurrence, without duplicates.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check; the list keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                last = min(i + window_size, len(sentence))
                for j in range(i + 1, last):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Get the column-normalized, symmetric adjacency matrix.

        Args:
            vocab: mapping ``{word: index}`` as built by :meth:`get_vocab`.
            token_pairs: iterable of ``(word1, word2)`` pairs; both words
                must be keys of ``vocab``.

        Returns:
            A ``(len(vocab), len(vocab))`` float array in which every column
            with at least one edge sums to 1 and every other column is 0.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        # `where` skips the columns whose sum is 0. Without an explicit `out`
        # those entries would be left uninitialised, so start from zeros.
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Return at most ``number`` keywords, best-ranked first.

        Words with equal weight keep the order in which they were stored.
        Returns an empty list when :meth:`analyze` has not been run yet or
        when ``number`` is not positive.
        """
        weights = self.node_weight or {}
        ranked = sorted(weights.items(), key=lambda t: t[1], reverse=True)
        return [key for key, _ in ranked[:max(number, 0)]]

    def lemmatize(self, text):
        """Return the lower-cased lemmas of ``text`` joined by single spaces.

        Words whose universal POS tag is in the ignored set, or whose lemma
        is an NLTK Portuguese stop word, are dropped.
        """
        doc = nlp2(text)
        stopwords_set = set(nltk.corpus.stopwords.words('portuguese'))
        ignored_postag = {'DET', 'NUM', 'ADV', 'SCONJ', 'CCONJ', 'AUX', 'ADP', 'PRON', 'VERB'}

        lemmatized_words = []
        for word in doc.iter_words():
            lemma = word.lemma
            if lemma is None:
                # Defensive: a word without a lemma cannot be lower-cased.
                continue
            if word.upos not in ignored_postag and lemma not in stopwords_set:
                lemmatized_words.append(lemma.lower())

        return " ".join(lemmatized_words)

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=None):
        """Main function to analyze text.

        Lemmatizes ``text``, builds the word co-occurrence graph and runs the
        ranking iteration. The result is stored in ``self.node_weight`` as
        ``{word: weight}``; nothing is returned.

        Args:
            text: raw text to analyze.
            candidate_pos: POS tags of the words eligible as keywords.
            window_size: co-occurrence window, see :meth:`get_token_pairs`.
            lower: when true, words are lower-cased before ranking.
            stopwords: optional iterable of extra stop words.
        """
        #Detect compound noun
        #text = self.detect_compound(text) Needs a better compound noun detection (The last version had a low detection rate in portuguese)

        # Lemmatization
        text = self.lemmatize(text)

        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization for weight (pagerank value)
        pr = np.ones(len(vocab))

        # Iteration: stop early once the total weight no longer changes
        previous_pr = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            current_pr = sum(pr)
            if abs(previous_pr - current_pr) < self.min_diff:
                break
            previous_pr = current_pr

        # Get weight for each node
        self.node_weight = {
            word: pr[index] for word, index in vocab.items() if word != '_'
        }

## Gerando keywords dos textos

In [None]:
# Select the corpus to process: the PDFs are read from the relative directory
# '<year>/<text_type>'. Change these two values (and create the matching
# folder structure) to process another year or document type.
year = '2021'
text_type = 'plano'
# Names of all entries in that directory; they are used later both as the
# file to open and as the record name.
files = os.listdir(f'{year}/{text_type}')

In [None]:
def text_cleaning(full_raw_text):
    """Strip noise from raw extracted text and return it lower-cased.

    Removed, in this order: parenthesised spans, spans shaped like
    ``SURNAME, Name.`` (upper-case run, comma, letters, then ``.`` or ``;``),
    the "Disponível em:" / "Available from:" markers, URLs, digits, and
    finally newlines plus a fixed set of punctuation characters. Everything
    is deleted outright (no space is inserted in its place).
    """
    # Patterns are applied one after another, so their order matters.
    removal_patterns = (
        r"\(.*?\)",
        r"([A-Z]+)\s?\,\s?([A-Za-z\s]+)[\.\;]",
        r"([Dd]ispon[ií]vel\s[Ee]m|[Aa]vailable\s[Ff]rom)\s?:",
        r"\<?((https?)?:\/\/|www)\S+",
        r"[0-9]",
    )
    cleaned = full_raw_text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Drop newlines and punctuation in a single pass over the string.
    dropped_chars = '\n' + '!@#$%&*_+=<>,.;:?/|()'
    cleaned = cleaned.translate(str.maketrans('', '', dropped_chars))

    return cleaned.lower()

In [None]:
# Load the records accumulated by previous runs. The file must already exist
# and contain valid JSON; for a first run it should hold an empty list ("[]"),
# because a zero-byte file cannot be parsed by json.load.
with open("keywords_all_year.json") as f:
    all_text_year = list(json.load(f))
print(type(all_text_year))

# The stop-word list is the same for every file, so fetch it only once.
stopwords = nltk.corpus.stopwords.words('portuguese')

for i in files:
    text_dict = {}

    print(i)
    # Keep the PDF open only while its pages are read, then release the handle.
    # NOTE(review): PdfFileReader/getNumPages/getPage/extractText look like the
    # legacy PyPDF2 names - confirm they exist in the installed version.
    with open(f'{year}/{text_type}/{i}', 'rb') as file:
        pdf = PyPDF2.PdfFileReader(file)
        pages_text = []
        for j in range(pdf.getNumPages()):
            page = pdf.getPage(j)
            pages_text.append(page.extractText())

    full_raw_text = "".join(pages_text)
    full_raw_text = text_cleaning(full_raw_text)

    # Rank the words of this document.
    tr4w = TextRank4Keyword()
    tr4w.analyze(full_raw_text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False, stopwords=stopwords)

    # One record per PDF, appended to the list loaded above.
    text_dict['year'] = year
    text_dict['text_type'] = text_type
    text_dict['name'] = i
    text_dict['keywords'] = tr4w.get_keywords(150)

    all_text_year.append(text_dict)

In [None]:
# Serialize all records to a JSON string (4-space indentation). This happens
# before the file is opened for writing, so a serialization error cannot
# leave the existing file truncated.
json_object = json.dumps(all_text_year, indent = 4)
  
# Overwrite keywords_all_year.json with the updated list of records.
with open("keywords_all_year.json", "w") as outfile:
    outfile.write(json_object)

In [None]:
# Show how many records are currently held in all_text_year.
len(all_text_year)