In [7]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import re
import seaborn as sns
import nltk 
import heapq
from nltk import RegexpTokenizer as rpt
from nltk.corpus import stopwords as sw
from string import punctuation 

# Fetch the NLTK resources used by this notebook (a no-op when they are
# already present locally).
nltk.download('punkt')
nltk.download('stopwords')

# Portuguese stopword list consulted by parse() when filtering tokens.
stopwords = sw.words('portuguese')

# Load the corpus; NaN cells are replaced by empty strings so that every
# row's text can be handled as a string.
data_url = "./results.csv"
data = (
    pd.read_csv(data_url)
    .replace(np.nan, '', regex=True)
)

# Corpus size: number of non-missing entries in the "text" column.
N = documents = data.text.count()

def parse(text):
    """Tokenize *text* into the terms that should be indexed.

    The text is scanned twice: once for word tokens (``\\w+``) and once for
    4-digit runs (``\\d{4}``). A token is kept when it is longer than three
    characters and is not a stopword. Tokens are returned with their
    original casing; only the stopword comparison is case-insensitive, so a
    capitalized stopword (e.g. at the start of a sentence) is filtered too.

    Args:
        text: The document text to tokenize.

    Returns:
        A list of tokens: all kept word tokens first, followed by all kept
        4-digit tokens, each group in order of appearance.
    """
    # Build a lowercase set once per call: O(1) membership tests instead of
    # scanning the stopword list for every token, and a comparison that does
    # not depend on the casing used in the text or in the list.
    stopword_set = {word.lower() for word in stopwords}

    word_pattern = rpt(r'\w+')
    year_pattern = rpt(r'\d{4}')

    # NOTE(review): a standalone 4-digit token such as "2019" matches both
    # patterns and is therefore emitted twice. Kept as-is because it may be
    # intentional weighting of years -- confirm.
    words = []
    for pattern in (word_pattern, year_pattern):
        words.extend(
            token
            for token in pattern.tokenize(text)
            if len(token) > 3 and token.lower() not in stopword_set
        )
    return words


def build_index(dataset):
    """Build an inverted index with per-term IDF over ``dataset.text``.

    Documents are numbered from 1 in iteration order. Each entry of
    ``dataset.text`` is tokenized with ``parse``.

    Args:
        dataset: Object exposing a sized, iterable ``text`` attribute.

    Returns:
        A dict mapping each term to a dict of
        ``{document_number: occurrences_in_that_document}`` plus an
        ``"idf"`` key holding ``log10((M + 1) / df)``, where ``M`` is the
        number of documents and ``df`` the number of documents that
        contain the term.
    """
    total_docs = len(dataset.text)
    index = {}

    for doc_id, entry in enumerate(dataset.text, start=1):
        for term in parse(entry):
            postings = index.setdefault(term, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1

    for postings in index.values():
        # Measure the document frequency before the "idf" key is added, so
        # that key is not counted as a document.
        doc_freq = len(postings)
        postings["idf"] = math.log10((total_docs + 1) / doc_freq)
    return index
                        
# Build the inverted index for the whole corpus and dump it for inspection.
index = build_index(data)
print(index)

[nltk_data] Downloading package punkt to /home/gabrielsv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabrielsv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'juíza': {1: 2, 2: 1, 'idf': 2.0969100130080562}, 'federal': {1: 1, 2: 1, 3: 1, 7: 2, 15: 1, 37: 2, 42: 1, 50: 1, 113: 1, 115: 1, 126: 1, 139: 1, 151: 3, 203: 1, 205: 1, 206: 2, 213: 2, 220: 1, 224: 1, 225: 1, 228: 1, 229: 1, 248: 1, 'idf': 1.0362121726544447}, 'Ivani': {1: 1, 2: 1, 'idf': 2.0969100130080562}, 'Silva': {1: 3, 2: 1, 6: 1, 14: 2, 26: 1, 73: 1, 76: 1, 115: 1, 183: 1, 226: 1, 234: 1, 236: 1, 'idf': 1.3187587626244128}, 'Brasília': {1: 1, 8: 1, 33: 1, 35: 1, 44: 1, 48: 1, 50: 1, 53: 1, 62: 1, 72: 1, 80: 1, 85: 1, 101: 1, 138: 1, 146: 1, 161: 1, 162: 1, 203: 1, 205: 1, 207: 2, 213: 1, 214: 1, 217: 4, 222: 1, 229: 1, 235: 1, 238: 1, 246: 1, 'idf': 0.9507819773298184}, 'proibiu': {1: 1, 2: 1, 119: 1, 162: 1, 'idf': 1.7958800173440752}, 'caráter': {1: 1, 15: 1, 36: 1, 60: 1, 89: 1, 97: 1, 120: 1, 149: 1, 239: 1, 247: 1, 'idf': 1.3979400086720377}, 'liminar': {1: 1, 2: 3, 119: 1, 217: 1, 'idf': 1.7958800173440752}, 'nesta': {1: 2, 4: 1, 8: 1, 22: 1, 23: 1, 24: 1, 26: 1, 27: 3, 