In [15]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import re
import seaborn as sns
import nltk 
import heapq
from nltk import RegexpTokenizer as rpt
from nltk.corpus import stopwords as sw
from string import punctuation 

nltk.download('punkt')
nltk.download('stopwords')
stopwords = sw.words('portuguese')

data_url="./results.csv"
data = pd.read_csv(data_url).replace(np.nan, '', regex=True)
documents = data.text.count()
N = documents

def parse(text):
    words = []
    word_pattern = rpt(r'\w+')
    year_pattern = rpt(r'\d{4}')
    
    patterns = [word_pattern, year_pattern]
    
    for pattern in patterns:
        tokens = []
        for token in pattern.tokenize(text):
            if token not in stopwords and len(token) > 3:
                tokens.append(token)
        words.extend(tokens)
    return words


def build_index(dataset):
    document_index = 0
    index = {}
    M = len(dataset.text)
    for entry in dataset.text:
        document_index = document_index + 1
            
        for ngram in parse(entry):
            
            if ngram in index:
                if document_index in index[ngram]:
                    index[ngram][document_index] = index[ngram][document_index] + 1
                else:
                    index[ngram][document_index] = 1 
            else:
                index[ngram] = {document_index: 1}
                
    for i in index:
        k = len(index[i])
        index[i]["idf"] = math.log10((M + 1) / k)
    return index
                        
index = build_index(data)

def bin_query_vector(index, query):
    query_vector = []
    
    for word in index:
        if word in query.split():
            query_vector.append(True)
        else:
            query_vector.append(False)
            
    return query_vector

def bin_document_vector(index):
    document_vector = []
    
    for doc_id in range(1,documents+1):
        doc_vec = []
        for ngram in index:
            if doc_id in index[ngram].keys():
                doc_vec.append(True)
            else:
                doc_vec.append(False)
                
        document_vector.append(doc_vec)
    
    return document_vector
                

def f_bin(query_vector, doc_vector):
    rec = {}

    for doc_id in range(len(doc_vector)):
        sim = 0
        vector = doc_vector[doc_id]
        for i in range(len(vector)):
            sim += (query_vector[i] * vector[i])
        rec[doc_id+1] = sim
    
    return rec

def binary_vsm(index, query):
    query_vector = bin_query_vector(index, query)   
    doc_vector = bin_document_vector(index)
    
    return f_bin(query_vector, doc_vector)

def get_top10rank(score):
   
    df_tmp = pd.DataFrame(score.items(), columns=["document", "score"])
    df_tmp['r']= df_tmp.score.rank(ascending=False, method="first")
    df_tmp.sort_values("r", inplace = True)
    df_tmp = df_tmp[:10]
        
    return df_tmp

def tf_document_vector(index):
    document_vector = []
    
    for doc_id in range(1,documents+1):
        doc_vec = []
        for ngram in index:
            if doc_id in index[ngram].keys():
                y = index[ngram][doc_id]
                doc_vec.append(y)
            else:
                doc_vec.append(0)
                
        document_vector.append(doc_vec)
        
    return document_vector

def tf_query_vector(index, query):
    query_vector = []
    
    for ngram in index:
        w = 0
        for term in query.split():
            if ngram == term:
                w += 1
        query_vector.append(w)
        
    return query_vector

def f_tf(query_vector, doc_vector):
    rec = {}
    for doc_id in range(len(doc_vector)):
        sim = 0
        vector = doc_vector[doc_id]
        for i in range(len(vector)):
            sim += (query_vector[i] * vector[i])
        rec[doc_id+1] = sim
    return rec

def tf_vsm(index, query):
    query_vector = tf_query_vector(index, query)   
    doc_vector = tf_document_vector(index)
    
    return f_tf(query_vector, doc_vector)

def tfidf_document_vector(index):
    document_vector = []
    
    for doc_id in range(1,documents+1):
        doc_vec = []
        for ngram in index:
            if doc_id in index[ngram].keys():
                y = index[ngram][doc_id] * index[ngram]['idf']
                doc_vec.append(y)
            else:
                doc_vec.append(0)
                
        document_vector.append(doc_vec)
        
    return document_vector


def tfidf_vsm(index, query):
    query_vector = tf_query_vector(index, query)   
    doc_vector = tfidf_document_vector(index)
    
    return f_tf(query_vector, doc_vector)

def f_bm25(query_vector, doc_vector, k):
    rec = {}
    for doc_id in range(len(doc_vector)):
        sim = 0
        vector = doc_vector[doc_id]
        for i in range(len(vector)):
            if vector[i] != 0:
                y = (k+1) * query_vector[i]
                dom = (query_vector[i] * y)/(query_vector[i]+k)
                sim += (dom * math.log10((documents + 1)/vector[i]))
        rec[doc_id+1] = sim
    return rec
    

def bm25_vsm(index, query, k):
    query_vector = tf_query_vector(index, query)   
    doc_vector = tf_document_vector(index)
    
    return f_bm25(query_vector, doc_vector, k)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/gabrielvinha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabrielvinha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
chosen_one = 13
doc_url = "https://brasil.elpais.com/brasil/2019/03/15/cultura/1552681746_926411.html"
query = "Gabo colombiano solidão"

def reciprocal_rank(rank, selected_doc):
    position = 0
    for index, row in rank.iterrows():
        position += 1
        if row["document"] == chosen_one:
            return 1.0/position
     
rank_binario = get_top10rank(binary_vsm(index, query))
print(rank_binario.to_string(index=False))
print("Rank:", reciprocal_rank(rank_binario, chosen_one))

 document  score     r
       13      2   1.0
        4      1   2.0
       63      1   3.0
       65      1   4.0
      157      1   5.0
      182      1   6.0
      184      1   7.0
      188      1   8.0
        1      0   9.0
        2      0  10.0
Rank: 1.0


In [17]:
rank_tf = get_top10rank(tf_vsm(index, query))
print(rank_tf.to_string(index=False))
print("Rank:", reciprocal_rank(rank_tf, chosen_one))

 document  score     r
       13      4   1.0
       63      2   2.0
        4      1   3.0
       65      1   4.0
      157      1   5.0
      182      1   6.0
      184      1   7.0
      188      1   8.0
        1      0   9.0
        2      0  10.0
Rank: 1.0


In [18]:
rank_tfidf = get_top10rank(tfidf_vsm(index, query))
print(rank_tfidf.to_string(index=False))
print("Rank:", reciprocal_rank(rank_tfidf, chosen_one))

 document     score     r
       13  8.086610   1.0
       63  3.591760   2.0
        4  2.096910   3.0
      157  1.920819   4.0
      182  1.920819   5.0
      184  1.920819   6.0
       65  1.795880   7.0
      188  1.795880   8.0
        1  0.000000   9.0
        2  0.000000  10.0
Rank: 1.0


In [19]:
rank_bm25 = get_top10rank(bm25_vsm(index, query, 10))
print(rank_bm25.to_string(index=False))
print("Rank:", reciprocal_rank(rank_bm25, chosen_one))

 document     score     r
       13  4.318759   1.0
        4  2.397940   2.0
       65  2.397940   3.0
      157  2.397940   4.0
      182  2.397940   5.0
      184  2.397940   6.0
      188  2.397940   7.0
       63  2.096910   8.0
        1  0.000000   9.0
        2  0.000000  10.0
Rank: 1.0


# Q2

In [20]:
query = "golpe militar"
gabarito = [1, 120, 208]

def map_ri(rank, relevant_docs):
    relevant_documents = 0
    position = 0
    tmp_rank = 0.0
    for index, row in rank.iterrows():
        position += 1
        if row["document"] in relevant_docs:
                relevant_documents += 1
                tmp_rank += 1.0/position
            
    return tmp_rank/relevant_documents

rank_binario = get_top10rank(binary_vsm(index, query))
print(rank_binario.to_string(index=False))
print("MAP:", map_ri(rank_binario, gabarito))

 document  score     r
        1      2   1.0
        2      2   2.0
        3      2   3.0
       25      2   4.0
       83      2   5.0
       99      2   6.0
      114      2   7.0
      120      2   8.0
      151      2   9.0
      165      2  10.0
MAP: 0.5625


# Q3

In [21]:
query = "golpe militar"
### id dos documentos no gabarito da query
gabarito = {1:6, 120:9, 208:5}

def dcg(rank, relevant_docs):
    relevant_documents = 0
    position = 0
    dcg_rank = []
    for index, row in rank.iterrows():
        position += 1
        tmp_rank = 0.0
        for d_id, rel in gabarito.items():
            if d_id == row["document"]:
                if relevant_documents == 0:
                    tmp_rank = rel
                else:
                    tmp_rank += (rel/math.log10(position))
                relevant_documents += 1
        dcg_rank.append(tmp_rank)
                    
            
    return dcg_rank

rank_binario = get_top10rank(binary_vsm(index, query))
rank_binario["DCG"] = dcg(rank_binario, gabarito)
print(rank_binario.to_string(index=False))

 document  score     r       DCG
        1      2   1.0  6.000000
        2      2   2.0  0.000000
        3      2   3.0  0.000000
       25      2   4.0  0.000000
       83      2   5.0  0.000000
       99      2   6.0  0.000000
      114      2   7.0  0.000000
      120      2   8.0  9.965784
      151      2   9.0  0.000000
      165      2  10.0  0.000000


In [22]:
rank_tf = get_top10rank(tf_vsm(index, query))
rank_tf["DCG"] = dcg(rank_tf, gabarito)
print(rank_tf.to_string(index=False))

 document  score     r       DCG
       25     23   1.0  0.000000
        3     15   2.0  0.000000
      208     12   3.0  5.000000
      115      9   4.0  0.000000
        7      8   5.0  0.000000
      165      8   6.0  0.000000
      223      8   7.0  0.000000
        1      7   8.0  6.643856
      166      7   9.0  0.000000
      216      7  10.0  0.000000


In [23]:
rank_tfidf = get_top10rank(tfidf_vsm(index, query))
rank_tfidf["DCG"] = dcg(rank_tfidf, gabarito)
print(rank_tfidf.to_string(index=False))

 document      score     r       DCG
       25  20.980147   1.0  0.000000
        3  13.195359   2.0  0.000000
      208  10.688157   3.0  5.000000
      165   7.455113   4.0  0.000000
      166   7.058946   5.0  0.000000
      223   6.795763   6.0  0.000000
      115   6.532579   7.0  0.000000
        1   6.399596   8.0  6.643856
      230   6.399596   9.0  0.000000
      216   6.069921  10.0  0.000000


In [24]:
rank_bm25 = get_top10rank(bm25_vsm(index, query, 10))
rank_bm25["DCG"] = dcg(rank_bm25, gabarito)
print(rank_bm25.to_string(index=False))

 document     score     r  DCG
      228  4.795880   1.0  0.0
        2  4.494850   2.0  0.0
       99  4.494850   3.0  0.0
      114  4.494850   4.0  0.0
      169  4.494850   5.0  0.0
      120  4.318759   6.0  9.0
      233  4.318759   7.0  0.0
       83  4.193820   8.0  0.0
      151  4.096910   9.0  0.0
      166  4.017729  10.0  0.0
