In [0]:
%%capture

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import math
from tabulate import tabulate

nltk.download('stopwords')

In [0]:
data = pd.read_csv('https://raw.githubusercontent.com/dnrocha1/information_retrieval/master/lab02/data/results.csv')

In [0]:
#pre-process

stopwords = stopwords.words('portuguese')
documents = data['text'].apply(lambda x: x.lower())

regex = RegexpTokenizer(r'\b[A-zÀ-ú-\'\d]{3,}')
# tokens = regex.tokenize(texts)

In [0]:
M = len(documents)

def build_index(documents=documents):
  inverted_list = {}

  n_doc = 0
  for document in documents:
    token = [w for w in regex.tokenize(document) if w not in stopwords]
    counter = list(Counter(token).items())
    for elem in counter:
      key = elem[0]
      freq = elem[1]
      if key in inverted_list.keys():
        if n_doc not in inverted_list[key][0]:
          inverted_list[key].append((n_doc,freq))
      else:
        inverted_list[key] = [(n_doc,freq)]
    n_doc += 1
  
  for elem in inverted_list:
    k = len(inverted_list[elem])
    idf = math.log((M+1)/k)
    inverted_list[elem].append(idf)
  
  return inverted_list

index = build_index()

TENTAR MUDAR INTERNAMENTE NOS METODOS PRA NÃO TOKENIZAR E SIM BUSCAR A PARTIR DO INDICE

In [5]:
vocabulary = list(index.keys())

def vec_model_bin(query,document,index=index):
  q = {}
  d = {}
  
  for word in vocabulary:
    q[word] = 0
    if word in query.split():
      q[word] = 1
      
  tokens = [w for w in regex.tokenize(document) if w not in stopwords]
  for word in vocabulary:
    d[word] = 0
    if word in list(set(tokens)):
      d[word] = 1
      
  acc = 0
  for word in vocabulary:
    if q[word] > 0 and d[word] > 0:
      acc = acc + q[word]*d[word]
      
  return acc

vec_model_bin("forças armadas",documents[149])

2

In [6]:
def vec_model_tf(query,document,index=index):
  q = {}
  d = {}
  
  for word in vocabulary:
    q[word] = 0
    if word in query.split():
      q[word] = 1
  
  tokens = [w for w in regex.tokenize(document) if w not in stopwords]
  counter = Counter(tokens)
  
  for word in vocabulary:
    d[word] = 0
    if word in list(counter.keys()):
      d[word] = counter[word]
      
  acc = 0
  for word in vocabulary:
    if q[word] > 0 and d[word] > 0:
      acc = acc + q[word]*d[word]
      
  return acc

vec_model_tf("juíza federal",documents[0])

4

In [7]:
def vec_model_tf_idf(query,document,index=index):
  q = {}
  d = {}
  
  for word in vocabulary:
    q[word] = 0
    if word in query.split():
      q[word] = 1
  
  tokens = [w for w in regex.tokenize(document) if w not in stopwords]
  counter = Counter(tokens)
  
  for word in vocabulary:
    d[word] = 0
    if word in list(counter.keys()):
      d[word] = counter[word]
      
  acc = 0
  for word in vocabulary:
    idf = index[word][-1]
    if q[word] > 0 and d[word] > 0:
      acc = acc + q[word]*d[word]*idf
      
  return acc

vec_model_tf_idf("juíza federal",documents[14])

1.6502599069543555

In [8]:
vec_model_tf_idf("forças armadas", documents[149])
# vec_model_tf("forças armadas", documents[149])

33.323497612614815

In [40]:
k = 1
def bm25(query,document,index=index):
  doc_tokens = [w for w in regex.tokenize(document) if w not in stopwords]
  query_words = query.split()
  
  matched_words = list(set(doc_tokens) & set(query_words))
  
  score = 0
  
  for word in matched_words:
    cwq = query_words.count(word)
    cwd = doc_tokens.count(word)
    dfw = len(index[word][:-1])
    y = ((k+1)*cwd)/(cwd+k)
    score += cwq * y * math.log((M+1)/dfw)
    
  
  return score

bm25("forças armadas",documents[149])

7.906309290883032

In [10]:
results = []
best_k = []
for k in [x * 0.1 for x in range(0,200,5)]:
  res = bm25("forças armadas",documents[149],k)
  results.append(res)
  best_k.append(k)
  
df = pd.DataFrame({'res':results, 'k':best_k})

# df.quantile([0.25,0.5,0.75]).iloc[0,:].k
df.quantile([0.25,0.5,0.75])

Unnamed: 0,res,k
0.25,15.89355,4.875
0.5,20.831447,9.75
0.75,23.588632,14.625


In [41]:
print(bm25("forças armadas",documents[149]))
k = 4.875
print(bm25("forças armadas",documents[149]))

7.906309290883032
15.89904328901099


In [0]:
queries = ["forças armadas", "governo federal", "golpe de estado"]

In [0]:
def get_top5(query,fun):
  doc_id = 0
  scores = []
  docs = {}
  for document in documents:
    score = fun(query,document)
    docs[doc_id] = score
    doc_id += 1
    
  return sorted(docs.items(), key=lambda kv:kv[1], reverse=True)

def build_query(query):
  funs = [vec_model_bin,vec_model_tf,vec_model_tf_idf,bm25]
  table = {}
  for fun in funs:
    top = get_top5(query,fun)
    table[fun.__name__] = top
    
  return pd.DataFrame(table)
    
# pd.DataFrame(get_top5(queries[0],bm25),columns=['Document','Score'])
# get_top5(queries[0],bm25)
# table = build_query("forças armadas")
# table.head(5)

In [68]:
for query in queries:
  table = build_query(query)
  print(query)
  print(tabulate(table.head(5), headers='keys', tablefmt='psql'))
  print('\n')

forças armadas
+----+-----------------+----------------+---------------------------+---------------------------+
|    | vec_model_bin   | vec_model_tf   | vec_model_tf_idf          | bm25                      |
|----+-----------------+----------------+---------------------------+---------------------------|
|  0 | (0, 2)          | (149, 15)      | (149, 33.323497612614815) | (149, 15.89904328901099)  |
|  1 | (5, 2)          | (24, 9)        | (24, 19.860802780873335)  | (24, 12.506924563304118)  |
|  2 | (11, 2)         | (165, 8)       | (165, 17.95025977565531)  | (165, 11.882566330363375) |
|  3 | (24, 2)         | (207, 8)       | (207, 17.95025977565531)  | (207, 11.882566330363375) |
|  4 | (41, 2)         | (0, 6)         | (0, 13.462694831741484)   | (0, 10.043597731616662)   |
+----+-----------------+----------------+---------------------------+---------------------------+


governo federal
+----+-----------------+----------------+---------------------------+----------------

In [82]:
# get only docs
table_docs = table.applymap(lambda x: x[0])
table_docs

Unnamed: 0,vec_model_bin,vec_model_tf,vec_model_tf_idf,bm25
0,0,24,24,24
1,1,2,2,165
2,2,6,165,2
3,24,165,164,164
4,97,248,207,0


In [160]:
print(query)

for model in table_docs:
  docs = table_docs[model]
  df = data[['title','subtitle']].iloc[docs]
  print(model)
#   print(tabulate(df,headers='keys',tablefmt='psql'))
  print(df)
  print('\n')
    

golpe de estado
vec_model_bin
                                                title                                           subtitle
0   “A sociedade foi Rubens Paiva  não os facínora...  A decisão da juíza que proíbe as Forças Armada...
1   Justiça suspende decisão que proibia Forças Ar...  Liminar havia sido concedida na sexta-feira a ...
2   Governo Bolsonaro prega “negacionismo históric...  Marcos Napolitano  professor da USP  diz que o...
24  Boris Fausto e o golpe de 64: “É impossível ir...  Historiador diz que as Forças Armadas nunca re...
97  Maduro anuncia 30 dias de racionamento de ener...  Presidente adota as medidas após os graves ble...


vec_model_tf
                                                 title                                           subtitle
24   Boris Fausto e o golpe de 64: “É impossível ir...  Historiador diz que as Forças Armadas nunca re...
2    Governo Bolsonaro prega “negacionismo históric...  Marcos Napolitano  professor da USP  diz que o...
6    “L