In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from collections import Counter, OrderedDict
import math

In [2]:
# Download results.csv (lab02 data folder of the GitHub repository) into a DataFrame.
RESULTS_CSV_URL = 'https://raw.githubusercontent.com/dnrocha1/information_retrieval/master/lab02/data/results.csv'
data = pd.read_csv(RESULTS_CSV_URL)

In [3]:
# Pre-process: lowercase every document and configure the tokenizer.

documents = data['text'].apply(lambda x: x.lower())

# A token starts at a word boundary and is a run of 3 or more letters, digits,
# apostrophes or hyphens.
# The letter ranges are spelled out (A-Z, a-z, À-Ö, Ø-ö, ø-ú) on purpose: the
# shorthand ranges A-z and À-ú also span the punctuation between 'Z' and 'a'
# ([ \ ] ^ _ `) and the × / ÷ signs, which would then leak into tokens.
# The hyphen is placed last in the class so it is read as a literal character.
regex = RegexpTokenizer(r'\b[A-Za-zÀ-ÖØ-öø-ú\'\d-]{3,}')

In [4]:
# Ids of every document indexed so far (filled in by build_index).
doc_list = []

def build_index(documents=documents):
  """Build an inverted index mapping each token to its postings.

  Documents are numbered 1, 2, 3, ... in iteration order. Each document is
  tokenized with the module-level ``regex`` tokenizer and its tokens are
  counted.

  Args:
    documents: iterable of document strings.

  Returns:
    dict mapping token -> list of ``(doc_id, frequency)`` tuples, one tuple
    per document containing the token, in increasing ``doc_id`` order.

  Side effects:
    Appends every assigned ``doc_id`` to the module-level ``doc_list``;
    calling the function again appends the ids again.
  """
  inverted_list = {}

  for n_doc, document in enumerate(documents, start=1):
    doc_list.append(n_doc)
    term_counts = Counter(regex.tokenize(document))
    # Counter yields each term exactly once per document, so every
    # (term, document) pair is unique and no duplicate check is needed.
    # (A membership test against the first posting tuple would wrongly skip
    # a document whose id happens to equal that posting's frequency.)
    for key, freq in term_counts.items():
      inverted_list.setdefault(key, []).append((n_doc, freq))

  return inverted_list

index = build_index()

In [5]:
%%capture
def mutual_info(word,index=index):
  """Rank the other indexed terms by ``n_ab / (n_a * n_b)`` against ``word``.

  ``n_a`` and ``n_b`` are the numbers of documents containing ``word`` and the
  candidate term, and ``n_ab`` is the number of documents containing both.
  Terms that never share a document with ``word`` are left out.

  Args:
    word: term to compare against; it is looked up directly with
      ``index[word]``.
    index: mapping of term -> postings, where the first item of each posting
      is a document id.

  Returns:
    OrderedDict of term -> score, highest score first.
  """
  target_docs = {posting[0] for posting in index[word]}
  n_a = len(target_docs)

  scores = {}
  for term in index:
    if term == word:
      continue
    term_docs = {posting[0] for posting in index[term]}
    n_b = len(term_docs)
    n_ab = len(target_docs & term_docs)
    if n_ab != 0:
      scores[term] = n_ab/(n_a*n_b)

  by_score_desc = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
  return OrderedDict(by_score_desc)

mutual_info("governo")

In [6]:
%%capture
def expected_mutual_info(word,index=index):
  """Rank the other indexed terms by ``n_ab * log(n * n_ab / (n_a * n_b))``.

  ``n_a`` and ``n_b`` are the numbers of documents containing ``word`` and the
  candidate term, ``n_ab`` is the number of documents containing both, and
  ``n`` is ``len(documents)`` taken from the module-level ``documents``.
  Terms that never share a document with ``word`` are left out.

  Args:
    word: term to compare against; it is looked up directly with
      ``index[word]``.
    index: mapping of term -> postings, where the first item of each posting
      is a document id.

  Returns:
    OrderedDict of term -> score, highest score first.
  """
  target_docs = {posting[0] for posting in index[word]}
  n_a = len(target_docs)
  n = len(documents)

  scores = {}
  for term in index:
    if term == word:
      continue
    term_docs = {posting[0] for posting in index[term]}
    n_b = len(term_docs)
    n_ab = len(target_docs & term_docs)
    if n_ab != 0:
      # Natural logarithm; the argument is positive because n_ab > 0 here.
      scores[term] = n_ab * math.log(n*(n_ab/(n_a*n_b)))

  by_score_desc = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
  return OrderedDict(by_score_desc)

expected_mutual_info("bolsonaro")

In [7]:
%%capture
def chi_square(word,index=index):
  """Rank the other indexed terms by ``(n_ab - n_a*n_b/n)**2 / (n_a*n_b)``.

  ``n_a`` and ``n_b`` are the numbers of documents containing ``word`` and the
  candidate term, ``n_ab`` is the number of documents containing both, and
  ``n`` is ``len(documents)`` taken from the module-level ``documents``.
  Terms that never share a document with ``word`` are left out.

  Args:
    word: term to compare against; it is looked up directly with
      ``index[word]``.
    index: mapping of term -> postings, where the first item of each posting
      is a document id.

  Returns:
    OrderedDict of term -> score, highest score first.
  """
  target_docs = {posting[0] for posting in index[word]}
  n_a = len(target_docs)
  n = len(documents)

  scores = {}
  for term in index:
    if term == word:
      continue
    term_docs = {posting[0] for posting in index[term]}
    n_b = len(term_docs)
    n_ab = len(target_docs & term_docs)
    if n_ab != 0:
      # (1/n)*n_a*n_b is subtracted from the observed co-occurrence count.
      scores[term] = (n_ab-(1/n)*n_a*n_b)**2 / (n_a*n_b)

  by_score_desc = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
  return OrderedDict(by_score_desc)

chi_square("bolsonaro")

In [8]:
%%capture
def dice(word,index=index):
  """Rank the other indexed terms by ``n_ab / (n_a + n_b)`` against ``word``.

  ``n_a`` and ``n_b`` are the numbers of documents containing ``word`` and the
  candidate term, and ``n_ab`` is the number of documents containing both.
  Terms that never share a document with ``word`` are left out.

  Args:
    word: term to compare against; it is looked up directly with
      ``index[word]``.
    index: mapping of term -> postings, where the first item of each posting
      is a document id.

  Returns:
    OrderedDict of term -> score, highest score first.
  """
  target_docs = {posting[0] for posting in index[word]}
  n_a = len(target_docs)

  scores = {}
  for term in index:
    if term == word:
      continue
    term_docs = {posting[0] for posting in index[term]}
    n_b = len(term_docs)
    n_ab = len(target_docs & term_docs)
    if n_ab != 0:
      scores[term] = n_ab/(n_a+n_b)

  by_score_desc = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
  return OrderedDict(by_score_desc)

dice("governo")