## Parte 1: Busqueda Lineal de documentos

In [87]:
from unittest import result

import pandas as pd
from numpy.ma.core import append

path = 'data/'
df = pd.read_csv(path + 'IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


* Limpiar la data

In [88]:
import re
def clean_text(doc):
    doc = re.sub(pattern=r'<.*?>', repl='', string=doc)
    doc = re.sub(pattern=r'[^A-Za-z0-9\-\']+', repl=' ', string=doc)
    doc = doc.lower()
    return re.sub(r'\s+', ' ', doc).strip()

In [89]:
df_clean = df['review'].apply(clean_text)

* tokenizar palabras

In [None]:
def tokenizar(review):
    return review.split()
# print(tokenizar(df_clean[5]))

* Busqueda lineal

In [165]:
def busqueda_lineal(doc, query):
    result = []
    for i, review in doc.items():
        tokens = tokenizar(review)
        for token in tokens:
            if query == token and i not in result:
                result.append(i)
    return result

In [185]:
query = 'other'
result = busqueda_lineal(df_clean, query)
print(len(result))
print(result)

13298
[0, 13, 15, 17, 18, 22, 25, 31, 32, 40, 41, 43, 44, 48, 58, 59, 60, 66, 70, 71, 74, 75, 83, 93, 99, 101, 105, 113, 118, 120, 123, 125, 130, 131, 133, 135, 142, 149, 152, 153, 158, 161, 164, 167, 170, 174, 178, 179, 183, 191, 195, 198, 206, 215, 220, 223, 225, 244, 246, 249, 254, 260, 267, 269, 273, 275, 295, 298, 300, 309, 314, 320, 322, 325, 327, 332, 334, 338, 350, 352, 353, 355, 357, 362, 364, 365, 366, 369, 370, 371, 372, 373, 378, 384, 395, 397, 400, 401, 402, 407, 417, 419, 420, 424, 426, 428, 429, 431, 433, 436, 437, 438, 440, 442, 444, 453, 454, 455, 456, 457, 458, 461, 462, 464, 466, 470, 472, 487, 488, 489, 494, 499, 507, 508, 510, 518, 519, 520, 524, 540, 543, 547, 552, 556, 557, 558, 559, 560, 561, 564, 565, 566, 572, 575, 579, 581, 590, 596, 597, 605, 608, 609, 611, 615, 622, 627, 631, 635, 637, 639, 641, 646, 661, 664, 669, 672, 673, 674, 678, 681, 682, 688, 691, 692, 693, 696, 699, 702, 705, 708, 710, 712, 720, 725, 726, 731, 734, 735, 743, 745, 753, 754, 765, 769,

## Parte 2: Contruccion de un indice invertido

* crear indice invertido

In [172]:
from collections import defaultdict
def inverted_index(doc):
    inverted_index = defaultdict(set)
    for i, review in doc.items():
        tokens = tokenizar(review)
        for token in tokens:
            inverted_index[token].add(i)
    return inverted_index

In [178]:
inv_idx = inverted_index(df_clean)

In [175]:
first_100 = df_clean.head(100)
inv_idx100 = inverted_index(first_100)

In [186]:
query = 'other'
print(len(inv_idx[query]))
print(inv_idx[query])


13298
{0, 32769, 32771, 32774, 32775, 32776, 32779, 13, 15, 17, 18, 32789, 22, 25, 32797, 31, 32, 32799, 32800, 32802, 32803, 32807, 40, 41, 32809, 43, 44, 32812, 48, 32817, 32819, 32823, 58, 59, 60, 32827, 32828, 32830, 32831, 66, 32836, 32837, 70, 71, 32838, 32840, 74, 75, 32841, 32842, 32843, 83, 32852, 32856, 32858, 93, 32861, 32863, 32865, 32866, 99, 101, 32869, 105, 32874, 32875, 32878, 113, 32882, 118, 32886, 120, 32888, 123, 125, 32896, 32897, 130, 131, 32900, 133, 135, 32905, 32906, 32907, 32908, 142, 32910, 149, 32919, 152, 153, 32920, 32921, 32924, 158, 161, 164, 167, 32937, 170, 174, 32943, 178, 179, 32947, 32948, 32950, 183, 191, 32960, 32962, 195, 198, 32968, 32972, 206, 32975, 32980, 215, 220, 223, 225, 32993, 32997, 32999, 33011, 244, 246, 33016, 249, 33021, 254, 33023, 33024, 260, 33028, 267, 33036, 269, 273, 33041, 275, 33047, 33052, 33054, 33057, 33061, 295, 298, 33067, 300, 33068, 33069, 33074, 33075, 33076, 309, 33080, 314, 33086, 320, 33088, 322, 33092, 325, 33093

## Parte 3: Evaluacion de tiempos de busqueda

In [195]:
import time
def evaluar_tiempo():
    query = 'other'
    tiempos = {}

    ### Busqueda lineal
    inicio1 = time.perf_counter(); busqueda_lineal(df_clean, query); fin1 = time.perf_counter()
    tiempos['busqueda lineal'] = fin1 - inicio1

    ### Indice invertido
    inicio2 = time.perf_counter(); inv_idx[query]; fin2 = time.perf_counter()
    tiempos['indice invertido'] = fin2 - inicio2
    return tiempos

In [196]:
evaluar_tiempo()

{'busqueda lineal': 1.5248201999347657,
 'indice invertido': 2.299901098012924e-06}