In [0]:
%%capture

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import math
from IPython.display import display

nltk.download('stopwords')

In [0]:
# Remote CSV that is loaded into `data`.
DATA_URL = 'https://raw.githubusercontent.com/dnrocha1/information_retrieval/master/lab02/data/results.csv'
data = pd.read_csv(DATA_URL)

In [0]:
# Pre-processing: stopword list, lower-cased documents and the tokenizer.

# Read the stopwords through `nltk.corpus` instead of the imported name
# `stopwords`: this statement rebinds `stopwords` to a plain list, so going
# through the imported name raises AttributeError when the cell is run again.
stopwords = nltk.corpus.stopwords.words('portuguese')
documents = data['text'].str.lower()

# A token starts at a word boundary and is a run of 3 or more characters from
# the class below (ASCII and accented letters, digits, hyphen, apostrophe).
# NOTE(review): the range `A-z` also covers the ASCII characters between 'Z'
# and 'a' ([ \ ] ^ _ `); `A-Za-z` was probably intended. Confirm before
# changing it, because that alters which tokens end up in the index.
regex = RegexpTokenizer(r'\b[A-zÀ-ú-\'\d]{3,}')

In [0]:
M = len(documents)

def build_index(documents=documents):
  """Build an inverted index with a per-term IDF for a document collection.

  Args:
    documents: iterable of document strings. Defaults to the module-level
      `documents` as bound when this function is defined.

  Returns:
    dict mapping each non-stopword token to a list whose leading items are
    `(doc_position, term_frequency)` postings in document order and whose
    last item is the term's IDF, `log((N + 1) / df)`, where N is the number
    of documents indexed and df is the number of postings of the term.
  """
  inverted_list = {}
  n_docs = 0

  for n_doc, document in enumerate(documents):
    n_docs = n_doc + 1
    tokens = [w for w in regex.tokenize(document) if w not in stopwords]
    # Counter yields each distinct term once per document, so every pair is a
    # new posting and no duplicate check is needed. A membership test against
    # the first posting tuple would wrongly drop a document whose position
    # equals that posting's frequency.
    for term, freq in Counter(tokens).items():
      inverted_list.setdefault(term, []).append((n_doc, freq))

  # N is the size of the collection actually indexed, so a custom `documents`
  # argument gets IDF values consistent with its own postings.
  for postings in inverted_list.values():
    postings.append(math.log((n_docs + 1) / len(postings)))

  return inverted_list

# Index the default collection once; the vocabulary is the list of its terms.
index = build_index()

vocabulary = [term for term in index]

In [0]:
# binary

def vec_model_bin(query,document,index=index):
  """Score a document against a query using binary (0/1) term weights.

  Args:
    query: query string. It is split on whitespace only (no lower-casing and
      no regex tokenization), so it must already match the indexed token form.
    document: document text; tokenized with `regex`, stopwords removed.
    index: inverted index whose keys are taken as the vocabulary.

  Returns:
    int: number of distinct query terms that belong to the vocabulary and
    occur in the document.
  """
  doc_terms = {w for w in regex.tokenize(document) if w not in stopwords}
  query_terms = set(query.split())

  # The dot product of two 0/1 vectors is the size of their common support,
  # so the term sets are built once instead of once per vocabulary word.
  return sum(1 for word in query_terms if word in index and word in doc_terms)

# vec_model_bin("forças armadas",documents[149])

In [0]:
# tf

def vec_model_tf(query,document,index=index):
  """Score a document against a query using raw term frequencies.

  The query side is binary: a term repeated in the query still counts once.

  Args:
    query: query string. It is split on whitespace only (no lower-casing and
      no regex tokenization), so it must already match the indexed token form.
    document: document text; tokenized with `regex`, stopwords removed.
    index: inverted index whose keys are taken as the vocabulary.

  Returns:
    int: sum, over the distinct query terms found in the vocabulary, of the
    number of times each term occurs in the document.
  """
  term_counts = Counter(w for w in regex.tokenize(document) if w not in stopwords)
  query_terms = set(query.split())

  # Counter returns 0 for absent terms, so unmatched query terms add nothing.
  return sum(term_counts[word] for word in query_terms if word in index)

# vec_model_tf("juíza federal",documents[0])

In [0]:
# tf-idf

def vec_model_tf_idf(query,document,index=index):
  """Score a document against a query using TF-IDF weights.

  The query side is binary: a term repeated in the query still counts once.

  Args:
    query: query string. It is split on whitespace only (no lower-casing and
      no regex tokenization), so it must already match the indexed token form.
    document: document text; tokenized with `regex`, stopwords removed.
    index: inverted index; the last element of each entry is read as the
      term's IDF.

  Returns:
    Sum of `tf * idf` over the distinct query terms that are in the index
    and occur in the document; the int 0 when no term matches.
  """
  term_counts = Counter(w for w in regex.tokenize(document) if w not in stopwords)

  score = 0
  # dict.fromkeys de-duplicates the query while keeping its word order, so the
  # floating-point sum is accumulated in the same order on every run.
  for word in dict.fromkeys(query.split()):
    tf = term_counts[word]
    if tf > 0 and word in index:
      score += tf * index[word][-1]

  return score

# vec_model_tf_idf("juíza federal",documents[14])

In [0]:
# bm25

k = 1
def bm25(query,document,index=index):
  """Score a document against a query with a BM25-style formula.

  Each query term that occurs in the document and is present in `index`
  contributes `cwq * ((k+1)*cwd / (cwd+k)) * log((M+1)/dfw)`, where `cwq` and
  `cwd` are the term's counts in the query and in the document, `dfw` is its
  number of postings in `index`, and `k` and `M` are module-level values.
  No document-length normalization term is applied.

  Args:
    query: query string. It is split on whitespace only (no lower-casing and
      no regex tokenization), so it must already match the indexed token form.
    document: document text; tokenized with `regex`, stopwords removed.
    index: inverted index; each entry holds the postings followed by one
      trailing IDF value.

  Returns:
    The accumulated score; the int 0 when no term matches.
  """
  doc_counts = Counter(w for w in regex.tokenize(document) if w not in stopwords)
  query_counts = Counter(query.split())

  score = 0

  # Counter iterates in first-occurrence order of the query words, so the
  # floating-point sum is accumulated in the same order on every run.
  for word, cwq in query_counts.items():
    cwd = doc_counts[word]
    # Terms missing from the document or from the index contribute nothing.
    if cwd == 0 or word not in index:
      continue
    dfw = len(index[word]) - 1  # the last element is the IDF, not a posting
    y = ((k+1)*cwd)/(cwd+k)
    score += cwq * y * math.log((M+1)/dfw)

  return score

# bm25("forças armadas",documents[149])

In [0]:
def get_best(query,fun):
  """Rank every document of the module-level `documents` for a query.

  Args:
    query: query string handed unchanged to `fun`.
    fun: scoring callable invoked as `fun(query, document)`.

  Returns:
    list of `(doc_position, score)` tuples sorted by score, highest first;
    documents with equal scores keep their collection order.
  """
  ranked = [(position, fun(query, text)) for position, text in enumerate(documents)]
  # list.sort is stable, so ties stay in collection order.
  ranked.sort(key=lambda pair: pair[1], reverse=True)
  return ranked

def build_query_results(query):
  """Run a query through every ranking model and tabulate the rankings.

  Args:
    query: query string passed to `get_best` for each model.

  Returns:
    pandas.DataFrame with one column per model, named after the scoring
    function, holding that model's ranking as returned by `get_best`.
  """
  scorers = (vec_model_bin, vec_model_tf, vec_model_tf_idf, bm25)
  rankings = {scorer.__name__: get_best(query, scorer) for scorer in scorers}
  return pd.DataFrame(rankings)

## 1. Escolha um documento dentre aqueles da base do aluno Bernardi e crie uma consulta que você acha que tem boas chances de recuperar este documento. Em seguida, avalie os resultados de tal consulta usando a métrica de avaliação Reciprocal Rank

In [10]:
# Target document (row position in `data`) and the query meant to retrieve it.
doc_num = 17
query = 'twitter bolsonaro'
print(f"Título do documento: '{data.iloc[doc_num]['title']}'")
print(f"Subtítulo do documento: '{data.iloc[doc_num]['subtitle']}'")
print(f"Consulta: '{query}'")

Título do documento: 'Um tuíte muito vulgar'
Subtítulo do documento: 'Bolsonaro lança cortina de fumaça nas redes sociais em detrimento da imagem institucional do Brasil'
Consulta: 'twitter bolsonaro'


In [11]:
# Rank the collection with every model and show the first ten rows.
results = build_query_results(query)
display(results.iloc[:10])

Unnamed: 0,vec_model_bin,vec_model_tf,vec_model_tf_idf,bm25
0,"(17, 2)","(150, 47)","(150, 63.606491175962056)","(18, 6.08007067362192)"
1,"(18, 2)","(165, 38)","(165, 51.62023559343967)","(204, 5.845771438984137)"
2,"(104, 2)","(206, 35)","(206, 46.61321615425373)","(17, 5.344219743078769)"
3,"(119, 2)","(18, 29)","(18, 41.65718183427425)","(150, 4.950346835958886)"
4,"(139, 2)","(41, 12)","(204, 16.352864493393646)","(165, 4.936924377299847)"
5,"(150, 2)","(207, 12)","(41, 15.98167411002985)","(104, 4.626503388947136)"
6,"(165, 2)","(215, 11)","(207, 15.98167411002985)","(202, 4.474296968851615)"
7,"(202, 2)","(204, 10)","(215, 14.64986793419403)","(214, 4.474296968851615)"
8,"(204, 2)","(110, 8)","(17, 11.345845054207706)","(228, 4.474296968851615)"
9,"(208, 2)","(224, 8)","(110, 10.654449406686567)","(139, 4.341116351268032)"


In [12]:
def reciprocal_rank(col,doc_id):
  """Reciprocal rank of a document within a ranking.

  Args:
    col: iterable of `(doc, score)` pairs ordered from best to worst.
    doc_id: identifier of the document to look for.

  Returns:
    `1 / position` (1-based) of the first pair whose doc equals `doc_id`,
    rounded to two decimals; the int 0 when the document is not present.
  """
  for position, (doc, _score) in enumerate(col, start=1):
    if doc == doc_id:
      return round(1 / position, 2)
  return 0

# Reciprocal rank of the chosen document within the top 10 of each model.
ranking_rr = {}
for col in results.columns:
  top_ten = results[col].head(10)
  ranking_rr[col] = reciprocal_rank(top_ten, doc_num)

rr = pd.DataFrame(ranking_rr.items(), columns=['model','reciprocal'])
display(rr)

Unnamed: 0,model,reciprocal
0,vec_model_bin,1.0
1,vec_model_tf,0.0
2,vec_model_tf_idf,0.11
3,bm25,0.33


## 2. A partir do gabarito fornecido em OBS1, calcule o MAP para cada algoritmo abaixo e aponte qual obteve o melhor resultado. Para os cálculos do MAP, considere que um documento é relevante para uma dada consulta se este documento estiver entre os documentos do gabarito para essa consulta, senão ele deve ser considerado irrelevante.