In [0]:
%%capture

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import math
from tabulate import tabulate

# Fetch the NLTK stop-word corpus that stopwords.words('portuguese') reads later on.
nltk.download('stopwords')

In [0]:
# Read the CSV from its raw GitHub URL into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dnrocha1/information_retrieval/master/lab02/data/results.csv'
)

In [0]:
# Pre-processing: stop-word list, lower-cased corpus and tokenizer.

# Import the corpus reader under an alias so the `stopwords` list defined here
# does not shadow it; with the shadowing, re-running this cell failed with
# "'list' object has no attribute 'words'".
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('portuguese')
documents = data['text'].str.lower()

# A token starts at a word boundary and is a run of at least 3 letters (ASCII
# or in the À-ú range), digits, hyphens or apostrophes. The ASCII letters are
# spelled `A-Za-z` because the range `A-z` also matches the punctuation that
# sits between 'Z' and 'a' ([ \ ] ^ _ `), which glued e.g. a closing bracket
# onto the preceding word.
regex = RegexpTokenizer(r'\b[A-Za-zÀ-ú\'\d-]{3,}')

In [0]:
# Number of documents in the corpus.
M = len(documents)

def build_index(documents=documents):
  """Build an inverted index with term frequencies and IDF.

  Args:
    documents: iterable of document strings. A document's id is its position
      in iteration order. Defaults to the module-level corpus.

  Returns:
    dict mapping every token that is not a stop word to a list
    ``[(doc_id, term_frequency), ..., idf]``: one posting per document that
    contains the token, followed by ``idf = log((N + 1) / df)`` as the last
    element, where N is the number of documents indexed and df the number of
    postings of the token.
  """
  inverted_list = {}

  n_docs = 0
  for doc_id, document in enumerate(documents):
    tokens = [w for w in regex.tokenize(document) if w not in stopwords]
    # Counter yields each distinct token once per document, so every posting
    # is appended unconditionally. The previous guard
    # `n_doc not in inverted_list[key][0]` tested the id against the first
    # posting *tuple* (doc_id, freq) and dropped the posting whenever the
    # current id happened to equal that tuple's frequency.
    for term, freq in Counter(tokens).items():
      inverted_list.setdefault(term, []).append((doc_id, freq))
    n_docs = doc_id + 1

  # Use the size of the collection actually indexed (not the global M) so the
  # IDF is correct for any `documents` argument.
  for postings in inverted_list.values():
    postings.append(math.log((n_docs + 1) / len(postings)))

  return inverted_list

index = build_index()

TODO: tentar mudar internamente os métodos para não tokenizar e sim buscar a partir do índice.

In [5]:
vocabulary = list(index.keys())

def vec_model_bin(query,document,index=index):
  """Score a document against a query with binary term vectors.

  The score is the dot product of the binary query and document vectors,
  i.e. the number of distinct query terms that occur in the document and
  belong to the index vocabulary.

  Args:
    query: whitespace-separated query terms.
    document: document text; tokenized with the module-level `regex` and
      filtered with the module-level `stopwords`.
    index: mapping whose keys form the vocabulary. It is now honoured instead
      of the global `vocabulary` list, and the score is computed from set
      lookups rather than a scan of the whole vocabulary.

  Returns:
    int: number of matching terms (0 when nothing matches).
  """
  query_terms = set(query.split())
  doc_terms = {w for w in regex.tokenize(document) if w not in stopwords}
  return sum(1 for term in query_terms if term in doc_terms and term in index)

vec_model_bin("forças armadas",documents[149])

2

In [6]:
def vec_model_tf(query,document,index=index):
  """Score a document against a query with a binary query vector and raw TF.

  The score is the sum, over the distinct query terms found in the index
  vocabulary, of the number of times each term occurs in the document.

  Args:
    query: whitespace-separated query terms.
    document: document text; tokenized with the module-level `regex` and
      filtered with the module-level `stopwords`.
    index: mapping whose keys form the vocabulary. It is now honoured instead
      of the global `vocabulary` list, and the score is computed from direct
      lookups rather than a scan of the whole vocabulary.

  Returns:
    int: accumulated term frequency (0 when nothing matches).
  """
  query_terms = set(query.split())
  counter = Counter(w for w in regex.tokenize(document) if w not in stopwords)
  # Counter returns 0 for absent terms, so unmatched terms contribute nothing.
  return sum(counter[term] for term in query_terms if term in index)

vec_model_tf("juíza federal",documents[0])

4

In [7]:
def vec_model_tf_idf(query,document,index=index):
  """Score a document against a query with TF-IDF document weights.

  The score is the sum, over the distinct query terms found in the index, of
  ``tf(term, document) * idf(term)``.

  Args:
    query: whitespace-separated query terms.
    document: document text; tokenized with the module-level `regex` and
      filtered with the module-level `stopwords`.
    index: inverted index whose entries keep the term's IDF as their last
      element. It is now honoured for the vocabulary as well (instead of the
      global `vocabulary` list), and only the query terms are visited.

  Returns:
    The accumulated score: int 0 when nothing matches, a float otherwise.
  """
  counter = Counter(w for w in regex.tokenize(document) if w not in stopwords)

  acc = 0
  # dict.fromkeys keeps each query term once, in query order, so the
  # floating-point accumulation order is deterministic.
  for term in dict.fromkeys(query.split()):
    if term in index and counter[term] > 0:
      acc = acc + counter[term] * index[term][-1]

  return acc

vec_model_tf_idf("juíza federal",documents[14])

1.6502599069543555

In [8]:
# TF-IDF score of document 149 for the query "forças armadas".
vec_model_tf_idf("forças armadas", documents[149])
# Raw-TF variant of the same call, kept for comparison:
# vec_model_tf("forças armadas", documents[149])

33.323497612614815

In [40]:
# Default TF-saturation parameter; bm25 reads it at call time when k1 is omitted.
k = 1
def bm25(query,document,index=index,k1=None):
  """Score a document against a query with a BM25-style TF transformation.

  score = sum over matched terms w of
          c(w, query) * ((k1 + 1) * c(w, doc)) / (c(w, doc) + k1) * idf(w)

  No document-length normalisation is applied.

  Args:
    query: whitespace-separated query terms.
    document: document text; tokenized with the module-level `regex` and
      filtered with the module-level `stopwords`.
    index: inverted index whose entries keep the term's IDF as their last
      element. The stored IDF is used, so the score no longer depends on the
      global document count.
    k1: saturation parameter. When None (the default) the current value of
      the module-level `k` is used, which keeps existing callers working.

  Returns:
    The accumulated score: int 0 when nothing matches, a float otherwise.
  """
  if k1 is None:
    k1 = k

  doc_counts = Counter(w for w in regex.tokenize(document) if w not in stopwords)
  query_counts = Counter(query.split())

  score = 0

  for word, cwq in query_counts.items():
    cwd = doc_counts[word]
    # Terms absent from the document or from the index contribute nothing;
    # this also avoids a KeyError for documents outside the indexed corpus.
    if cwd == 0 or word not in index:
      continue
    y = ((k1+1)*cwd)/(cwd+k1)
    score += cwq * y * index[word][-1]

  return score

bm25("forças armadas",documents[149])

7.906309290883032

In [10]:
# Sweep k over 0.0, 0.5, ..., 19.5 and record the bm25 score of document 149
# for the query "forças armadas".
#
# bm25 reads k from the module namespace, so the loop variable is
# deliberately named `k` and bm25 is called without a third argument: the
# third positional parameter of bm25 is `index`, and passing k there made the
# call fail on a fresh kernel. The previous value of k is restored afterwards
# so the sweep does not change later bm25 results.
default_k = k
results = []
best_k = []
try:
  for k in [x * 0.1 for x in range(0,200,5)]:
    res = bm25("forças armadas",documents[149])
    results.append(res)
    best_k.append(k)
finally:
  k = default_k

df = pd.DataFrame({'res':results, 'k':best_k})

df.quantile([0.25,0.5,0.75])

Unnamed: 0,res,k
0.25,15.89355,4.875
0.5,20.831447,9.75
0.75,23.588632,14.625


In [41]:
# Score with whatever value the module-level k currently holds.
print(bm25("forças armadas",documents[149]))
# Rebind the module-level k that bm25 reads; 4.875 is the k value shown in the
# 0.25-quantile row of the sweep output. Every later bm25 call uses this value.
k = 4.875
print(bm25("forças armadas",documents[149]))

7.906309290883032
15.89904328901099


In [0]:
# Queries ranked by each retrieval model in the cells that follow.
queries = [
  "forças armadas",
  "governo federal",
  "golpe de estado",
]

In [0]:
def get_top5(query,fun,docs=None,n=5):
  """Rank documents by `fun(query, document)` and return the best ones.

  Args:
    query: query string, handed unchanged to `fun`.
    fun: scoring callable taking (query, document) and returning a score
      that can be sorted.
    docs: iterable of documents to rank. Defaults to the module-level
      `documents` when omitted.
    n: number of results to return. Defaults to 5.

  Returns:
    list of (doc_id, score) tuples sorted by descending score, where doc_id
    is the document's position in iteration order. Documents with equal
    scores keep ascending doc_id order because the sort is stable.
  """
  if docs is None:
    docs = documents
  scored = [(doc_id, fun(query, document)) for doc_id, document in enumerate(docs)]
  return sorted(scored, key=lambda pair: pair[1], reverse=True)[:n]
    
# pd.DataFrame(get_top5(queries[0],bm25),columns=['Document','Score'])

In [14]:
# Top-5 documents per query under the binary vector model.
for query in queries:
  top5 = get_top5(query, vec_model_bin)
  df = pd.DataFrame(data=top5, columns=['Document','Score'])
  # One print call emits the query, the table and the trailing blank lines.
  print(query, tabulate(df, headers='keys', tablefmt='psql'), '\n', sep='\n')

forças armadas
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |          0 |       2 |
|  1 |          5 |       2 |
|  2 |         11 |       2 |
|  3 |         24 |       2 |
|  4 |         41 |       2 |
+----+------------+---------+


governo federal
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |          2 |       2 |
|  1 |         23 |       2 |
|  2 |         33 |       2 |
|  3 |         36 |       2 |
|  4 |         41 |       2 |
+----+------------+---------+


golpe de estado
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |          0 |       2 |
|  1 |          1 |       2 |
|  2 |          2 |       2 |
|  3 |         24 |       2 |
|  4 |         97 |       2 |
+----+------------+---------+




In [15]:
# Top-5 documents per query under the raw term-frequency model.
for query in queries:
  top5 = get_top5(query, vec_model_tf)
  df = pd.DataFrame(data=top5, columns=['Document','Score'])
  # One print call emits the query, the table and the trailing blank lines.
  print(query, tabulate(df, headers='keys', tablefmt='psql'), '\n', sep='\n')

forças armadas
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |        149 |      15 |
|  1 |         24 |       9 |
|  2 |        165 |       8 |
|  3 |        207 |       8 |
|  4 |          0 |       6 |
+----+------------+---------+


governo federal
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |        172 |      19 |
|  1 |        165 |      14 |
|  2 |        247 |      13 |
|  3 |        114 |      12 |
|  4 |        228 |      11 |
+----+------------+---------+


golpe de estado
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |         24 |      14 |
|  1 |          2 |       8 |
|  2 |          6 |       8 |
|  3 |        165 |       8 |
|  4 |        248 |       8 |
+----+------------+---------+




In [16]:
# Top-5 documents per query under the TF-IDF model.
for query in queries:
  top5 = get_top5(query, vec_model_tf_idf)
  df = pd.DataFrame(data=top5, columns=['Document','Score'])
  # One print call emits the query, the table and the trailing blank lines.
  print(query, tabulate(df, headers='keys', tablefmt='psql'), '\n', sep='\n')

forças armadas
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |        149 | 33.3235 |
|  1 |         24 | 19.8608 |
|  2 |        165 | 17.9503 |
|  3 |        207 | 17.9503 |
|  4 |          0 | 13.4627 |
+----+------------+---------+


governo federal
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |        172 | 19.902  |
|  1 |        247 | 13.8181 |
|  2 |        165 | 13.1778 |
|  3 |        114 | 12.1678 |
|  4 |        228 | 12.0446 |
+----+------------+---------+


golpe de estado
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |         24 | 32.7731 |
|  1 |          2 | 18.1906 |
|  2 |        165 | 16.9378 |
|  3 |        164 | 14.5074 |
|  4 |        207 | 13.3297 |
+----+------------+---------+




In [42]:
# Top-5 documents per query under bm25 (uses the current module-level k).
for query in queries:
  top5 = get_top5(query, bm25)
  df = pd.DataFrame(data=top5, columns=['Document','Score'])
  # One print call emits the query, the table and the trailing blank lines.
  print(query, tabulate(df, headers='keys', tablefmt='psql'), '\n', sep='\n')

forças armadas
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |        149 | 15.899  |
|  1 |         24 | 12.5069 |
|  2 |        165 | 11.8826 |
|  3 |        207 | 11.8826 |
|  4 |          0 | 10.0436 |
+----+------------+---------+


governo federal
+----+------------+---------+
|    |   Document |   Score |
|----+------------+---------|
|  0 |        172 | 8.30144 |
|  1 |        247 | 7.19566 |
|  2 |        228 | 6.93044 |
|  3 |        114 | 6.32266 |
|  4 |        219 | 6.04141 |
+----+------------+---------+


golpe de estado
+----+------------+----------+
|    |   Document |    Score |
|----+------------+----------|
|  0 |         24 | 11.5622  |
|  1 |        165 |  9.89063 |
|  2 |          2 |  9.59458 |
|  3 |        164 |  9.24245 |
|  4 |          0 |  8.44819 |
+----+------------+----------+


