# Lab01 - Busca Booleana

## Índice invertido

A função a seguir constrói um índice invertido: um dicionário que associa cada termo (token) à lista de identificadores das notícias em que ele ocorre.

In [8]:
import nltk as n

# Shared, module-level index consulted by the search helpers: maps each
# lower-cased token to the list of idNoticia values of the documents it
# occurs in (one entry per occurrence, so an id may repeat in a list).
inverted_index = {}


def build_inverted_index(data):
    """Populate the module-level ``inverted_index`` from a DataFrame of news items.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``titulo``, ``conteudo`` and ``idNoticia``.
        A ``noticia`` column (title + " " + body) is added to it in place.

    Returns
    -------
    dict
        The module-level ``inverted_index`` itself (same object, not a copy).

    Notes
    -----
    Postings are appended to whatever the index already holds, so calling
    this function twice on the same data doubles every posting list.
    """
    # A missing title or body would turn the whole concatenation into NaN
    # (a float) and make .lower() fail, so treat missing text as empty.
    data["noticia"] = data.titulo.fillna("") + " " + data.conteudo.fillna("")

    # Walk the two needed columns directly instead of materialising one
    # Series per row with iterrows().
    doc_ids = data["idNoticia"].tolist()
    texts = data["noticia"].tolist()
    for doc_id, text in zip(doc_ids, texts):
        for token in n.word_tokenize(text.lower()):
            # str() keeps the same key type the index has always used.
            inverted_index.setdefault(str(token), []).append(doc_id)

    return inverted_index

## Implementação dos métodos de busca

O método de busca genérico abaixo aceita consultas com um único termo ou com dois termos unidos por *AND* ou por *OR*.

In [9]:
def search(term):
    """Run a boolean query against the inverted index.

    The query is tokenized with ``n.word_tokenize``. Supported shapes are a
    single word, or two words joined by ``AND`` / ``OR`` (the operator is
    matched case-insensitively). Any other query shape yields ``None``.

    Returns whatever the matching ``_search_*`` helper returns.
    """
    tokens = n.word_tokenize(term)

    if len(tokens) == 1:
        return _search_one_term(tokens[0])

    if len(tokens) == 3:
        left, operator, right = tokens
        operator = operator.upper()
        if operator == "AND":
            return _search_and(left, right)
        if operator == "OR":
            return _search_or(left, right)

    # Unsupported query shape.
    return None

Método de busca com apenas um termo.

In [10]:
def _search_one_term(term):
    """Return the sorted, de-duplicated document ids that contain ``term``.

    The lookup is case-insensitive (the term is lower-cased first).
    Returns ``None`` when the term is not in ``inverted_index``.
    """
    key = term.lower()
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if key not in inverted_index:
        return None
    return sorted(set(inverted_index[key]))

Método de busca *and* com apenas dois termos.

In [11]:
def _search_and(first_term, second_term):
    """Return the sorted document ids that contain both terms.

    Both lookups are case-insensitive. Returns ``None`` when either term is
    missing from ``inverted_index``; returns an empty list when both terms
    exist but never occur in the same document.
    """
    first_key = first_term.lower()
    second_key = second_term.lower()
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if first_key not in inverted_index or second_key not in inverted_index:
        return None
    return sorted(set(inverted_index[first_key]) & set(inverted_index[second_key]))

Método de busca *or* com apenas dois termos.

In [12]:
def _search_or(first_term, second_term):
    """Return the sorted document ids that contain at least one of the terms.

    Both lookups are case-insensitive. A term missing from
    ``inverted_index`` simply contributes nothing; ``None`` is returned only
    when neither term is indexed.
    """
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    present_keys = [
        key
        for key in (first_term.lower(), second_term.lower())
        if key in inverted_index
    ]
    if not present_keys:
        return None

    doc_ids = set()
    for key in present_keys:
        doc_ids.update(inverted_index[key])
    return sorted(doc_ids)

## Importação do data-set

In [13]:
import pandas as p
import sys
# Python 2-only workaround: the builtin `reload` and `sys.setdefaultencoding`
# do not exist on Python 3, so this cell fails on a Python 3 kernel.
# NOTE(review): presumably here so that implicit str/unicode conversions of the
# accented text use UTF-8 instead of ASCII — confirm before removing.
reload(sys)
sys.setdefaultencoding('utf8')

# Load the news data set; the path is relative to the working directory.
data = p.read_csv("data-set/noticias_estadao.csv", encoding = "utf-8")

## Chamada ao método de construção do índice invertido


In [14]:
# Fills the module-level inverted_index; the returned dict is echoed as this
# cell's output (token -> list of idNoticia values).
build_inverted_index(data)

{'sucateamento': [2364, 4979, 3449, 6425, 3415, 4],
 'obrigaria': [5516, 7492],
 'mizael': [202],
 'bloqueou': [2636, 6114, 6114, 1500, 136, 4421, 3445, 7337],
 'explodindo': [6548, 3026],
 'espa\xc3\xa7os': [6426,
  1847,
  1859,
  4828,
  7563,
  7563,
  858,
  858,
  6940,
  1985,
  3574,
  3631,
  3631,
  6063,
  7627,
  4241,
  1788,
  5012,
  5012,
  7365,
  4601,
  4601,
  4601,
  3411,
  3311,
  3757,
  2094,
  4622,
  7136,
  7141,
  7141,
  7071,
  7071,
  6568,
  3666,
  2555,
  3827,
  7516,
  6984,
  7317,
  7317,
  6180,
  334,
  967,
  235,
  235,
  6856,
  4861,
  6617,
  6619,
  1023,
  5652,
  5652,
  6789,
  4616,
  1102,
  4087,
  5985,
  7443,
  7443,
  3276,
  4140,
  2241,
  2239,
  4263,
  884,
  6555,
  140,
  4874,
  4103,
  792,
  6316,
  2061,
  380,
  111,
  6720,
  2575,
  7434,
  7434,
  374,
  374,
  1009,
  879,
  6770,
  6518,
  1265,
  592,
  1586,
  5044,
  2922,
  3349,
  47,
  751,
  6588,
  5447,
  817,
  249,
  1140,
  5456,
  5456,
  4426,
  673

## Sanity Check

In [15]:
# "Campina AND Grande" is expected to match 12 documents.
assert len(search("Campina AND Grande")) == 12

In [16]:
# "Campina OR Grande" is expected to match 1656 documents.
assert len(search("Campina OR Grande")) == 1656

In [17]:
# The AND result must be exactly this ascending list of idNoticia values.
assert search("Campina AND Grande") == [1068, 1370, 1770, 1952, 1987, 2763, 2777, 2779, 4802, 5382, 5870, 6694]

Para validar uma busca *or*, podemos usar a seguinte propriedade das operações de conjuntos (princípio da inclusão-exclusão):

$$|A \cup B| = |A| + |B| - |A \cap B|$$

In [18]:
# Inclusion-exclusion check: |A or B| == |A| + |B| - |A and B|.
assert len(search("Campina OR Grande")) == len(search("Campina")) + len(search("Grande")) - len(search("Campina AND Grande"))

## Exemplos de execução

debate *AND* presidencial

In [19]:
# Expected number of documents containing both "debate" and "presidencial".
assert len(search("debate AND presidencial")) == 201

debate *OR* presidencial

In [20]:
# Expected number of documents containing "debate", "presidencial", or both.
assert len(search("debate OR presidencial")) == 1770

presidenciáveis *AND* corruptos

In [21]:
# Both terms are expected to be indexed but never in the same document.
# NOTE: the AND search returns None (not []) when either term is absent from
# the index, in which case len() raises TypeError instead of failing the assert.
assert len(search("presidenciáveis AND corruptos")) == 0

presidenciáveis *OR* corruptos

In [22]:
# Expected number of documents containing at least one of the two terms.
assert len(search("presidenciáveis OR corruptos")) == 164

Belo *OR* Horizonte

In [23]:
# Expected number of documents containing "Belo", "Horizonte", or both.
assert len(search("Belo OR Horizonte")) == 331

Belo *AND* Horizonte

In [24]:
# Expected number of documents containing both "Belo" and "Horizonte".
assert len(search("Belo AND Horizonte")) == 242