In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import heapq as hp

In [2]:
# Load the corpus straight from the course repository (requires network access).
# Only the 'text' column is used by the pre-processing step.
data = pd.read_csv('https://raw.githubusercontent.com/dnrocha1/information_retrieval/master/lab02/data/results.csv')

In [3]:
# Pre-process: normalise case and configure the tokenizer.

# Lower-case every document so the index only ever holds lower-case tokens.
documents = data['text'].str.lower()

# A token starts at a word boundary and is a run of at least three characters
# drawn from: ASCII letters, the accented range À-ú, digits, '-' and "'".
# The ASCII letters are spelled A-Za-z on purpose: the single range A-z would
# also match the punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `).
regex = RegexpTokenizer(r'\b[A-Za-zÀ-ú-\'\d]{3,}')

In [4]:
def build_index(documents=None, tokenizer=None):
  """Build an inverted index mapping each token to its postings.

  Documents are numbered from 1 in iteration order. A posting is a
  ``(doc_id, frequency)`` tuple, where ``frequency`` is how many times the
  token occurs in that document. Because documents are visited sequentially,
  every posting list ends up in increasing ``doc_id`` order.

  Args:
    documents: Iterable of document strings. When omitted, the notebook-level
      ``documents`` is used; it is looked up at call time so that re-running
      the pre-processing cell is picked up without redefining this function.
    tokenizer: Object exposing ``tokenize(text)`` and returning the tokens of
      one document. When omitted, the notebook-level ``regex`` is used.

  Returns:
    dict mapping token -> list of ``(doc_id, frequency)`` tuples.
  """
  if documents is None:
    # The parameter shadows the notebook-level name, hence the explicit lookup.
    documents = globals()['documents']
  if tokenizer is None:
    tokenizer = regex

  inverted_list = {}

  for n_doc, document in enumerate(documents, start=1):
    # Counter yields each distinct token exactly once per document, so a plain
    # append can never produce two postings for the same (token, document)
    # pair; no membership guard is needed.
    for key, freq in Counter(tokenizer.tokenize(document)).items():
      inverted_list.setdefault(key, []).append((n_doc, freq))

  return inverted_list

# Build the index once over the pre-processed corpus; the cells below reuse it.
index = build_index()

In [5]:
%%capture

def get_doc_list(postings):
  """Collect the distinct document ids found in a group of posting lists.

  Args:
    postings: Iterable of posting lists; the first item of every posting is
      taken as its document id.

  Returns:
    list of unique document ids, in no particular order.
  """
  seen_docs = set()
  for posting_list in postings:
    # Only the document id (position 0) of each posting matters here.
    seen_docs.update(entry[0] for entry in posting_list)
  return list(seen_docs)
    

# Smoke-run over every posting list in the index; the %%capture magic at the
# top of this cell keeps the (long) result from being displayed.
get_doc_list(index.values())

In [6]:
def document_at_time(query, index, k):
  """Rank documents for ``query`` by scoring one document at a time.

  A document's score is the sum of the frequencies stored in the postings of
  the query words it contains. Documents whose score is 0 are not returned.

  Args:
    query: Whitespace-separated query words. The query is lower-cased before
      lookup because the indexed documents are lower-cased too. A word that
      is repeated in the query contributes once per repetition.
    index: Mapping word -> list of ``(doc_id, frequency)`` postings.
    k: Maximum number of results; values <= 0 yield an empty list.

  Returns:
    list of at most ``k`` ``(score, doc_id)`` tuples, highest score first;
    equal scores are ordered by descending ``doc_id``.
  """
  query_words = query.lower().split()
  inverted_lists = [index[word] for word in query_words if word in index]

  # Only documents that occur in at least one of the query's posting lists
  # can score above zero, so the rest of the index is never scanned.
  candidates = {posting[0] for inv in inverted_lists for posting in inv}

  scored = []
  for d in candidates:
    score = sum(
      posting[1]
      for inv in inverted_lists
      for posting in inv
      if posting[0] == d
    )
    if score != 0:
      scored.append((score, d))

  # heapq.nlargest is the public top-k API and returns results already
  # sorted in descending order.
  return hp.nlargest(k, scored)

# Example: the 10 best-scoring documents for a two-word query.
document_at_time("juíza federal", index, 10)

[(5, 151),
 (4, 173),
 (4, 1),
 (3, 248),
 (3, 229),
 (3, 220),
 (3, 214),
 (3, 7),
 (2, 239),
 (2, 228)]

In [7]:
def term_at_time(query, index, k):
  """Rank documents for ``query`` by processing one query term at a time.

  Each query word's posting list is consumed in full, adding the posting's
  frequency to a per-document accumulator, before moving to the next word.

  Args:
    query: Whitespace-separated query words. The query is lower-cased before
      lookup because the indexed documents are lower-cased too. A word that
      is repeated in the query contributes once per repetition.
    index: Mapping word -> list of ``(doc_id, frequency)`` postings.
    k: Maximum number of results; values <= 0 yield an empty list.

  Returns:
    list of at most ``k`` ``(score, doc_id)`` tuples, highest score first;
    equal scores are ordered by descending ``doc_id``.
  """
  query_words = query.lower().split()
  inverted_lists = [index[word] for word in query_words if word in index]

  # doc_id -> running score across all query terms seen so far.
  acc = {}
  for lst in inverted_lists:
    for posting in lst:
      d = posting[0]
      freq = posting[1]
      acc[d] = acc.get(d, 0) + freq

  scored = [(score, d) for d, score in acc.items()]

  # heapq.nlargest is the public top-k API and returns results already
  # sorted in descending order.
  return hp.nlargest(k, scored)

# Example: same two-word query as the document-at-a-time cell, for comparison.
term_at_time("juíza federal", index, 10)  

[(5, 151),
 (4, 173),
 (4, 1),
 (3, 248),
 (3, 229),
 (3, 220),
 (3, 214),
 (3, 7),
 (2, 239),
 (2, 228)]

In [8]:
# Single-word queries used to compare the two retrieval strategies.
queries = ["educação","polícia","cortes","presidente","segurança"]

In [9]:
# Widen text columns so the ranked lists are readable in the rendered table.
pd.set_option('display.max_colwidth', 120)

# Number of results requested from each strategy.
k = 10
results_document = []
results_term = []

# Run both strategies on every query and keep their rankings side by side.
for query in queries:
  d = document_at_time(query, index, k)
  t = term_at_time(query, index, k)
  results_document.append(d)
  results_term.append(t)

df = pd.DataFrame({
  'query': queries,
  'document_at_time': results_document,
  'term_at_time': results_term,
})
# True where both strategies returned exactly the same ranking.
df['compare'] = df['document_at_time'] == df['term_at_time']

df

Unnamed: 0,query,document_at_time,term_at_time,compare
0,educação,"[(22, 221), (11, 222), (7, 130), (6, 239), (5, 160), (5, 37), (4, 215), (4, 110), (3, 233), (3, 205)]","[(22, 221), (11, 222), (7, 130), (6, 239), (5, 160), (5, 37), (4, 215), (4, 110), (3, 233), (3, 205)]",True
1,polícia,"[(8, 151), (4, 214), (4, 93), (3, 241), (3, 150), (3, 65), (2, 249), (2, 230), (2, 207), (2, 181)]","[(8, 151), (4, 214), (4, 93), (3, 241), (3, 150), (3, 65), (2, 249), (2, 230), (2, 207), (2, 181)]",True
2,cortes,"[(2, 136), (1, 217), (1, 203), (1, 138), (1, 98), (1, 94), (1, 37), (1, 20)]","[(2, 136), (1, 217), (1, 203), (1, 138), (1, 98), (1, 94), (1, 37), (1, 20)]",True
3,presidente,"[(16, 166), (15, 63), (12, 151), (11, 216), (11, 19), (9, 205), (9, 86), (8, 25), (7, 174), (6, 235)]","[(16, 166), (15, 63), (12, 151), (11, 216), (11, 19), (9, 205), (9, 86), (8, 25), (7, 174), (6, 235)]",True
4,segurança,"[(6, 239), (6, 65), (3, 151), (3, 134), (3, 12), (2, 247), (2, 179), (2, 153), (2, 150), (2, 143)]","[(6, 239), (6, 65), (3, 151), (3, 134), (3, 12), (2, 247), (2, 179), (2, 153), (2, 150), (2, 143)]",True
