In [36]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
path= './data/'
columnas = ['Score', 'Title', 'Review']
df=pd.read_csv(path+'AmazonReview.zip', compression='zip', header=None, names=columnas)
df

Unnamed: 0,Score,Title,Review
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
...,...,...,...
399995,1,Unbelievable- In a Bad Way,We bought this Thomas for our son who is a hug...
399996,1,"Almost Great, Until it Broke...",My son recieved this as a birthday gift 2 mont...
399997,1,Disappointed !!!,"I bought this toy for my son who loves the ""Th..."
399998,2,Classic Jessica Mitford,This is a compilation of a wide range of Mitfo...


In [6]:
def clean_doc(doc):
    doc = re.sub(pattern=r'<.*?>', repl='', string=doc)
    doc = re.sub(pattern=r'[^A-Za-z0-9\-\']+', repl=' ', string=doc)
    doc = doc.lower()
    return re.sub(r'\s+', ' ', doc).strip()
def tokenizar(review):
    return review.split()

In [7]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def aplicar_stemming(tokens):
    return [stemmer.stem(token) for token in tokens]

def aplicar_lematizacion(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]


In [1]:
import contractions
def normalized_text(text):
    """
    intenta normalizar el texto, separando las contracciones del ingles.

    Args:
        text: texto que se va a normalizar.

    Returns:
        text: texto normalizado.
    """
    expanded_text = contractions.replace_contractions(text)
    return expanded_text

In [8]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus12\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\asus12\AppData\Roaming\nltk_data...


True

In [9]:
df_clean = df["Review"].apply(clean_doc)

In [87]:
df_10 = df_clean.head(50000)

In [88]:
df_stem = df_10.apply(lambda x: aplicar_stemming(tokenizar(x)))

In [89]:
df_stem

0        [my, love, pat, ha, one, of, the, great, voic,...
1        [despit, the, fact, that, i, have, onli, play,...
2        [i, bought, thi, charger, in, jul, 2003, and, ...
3        [check, out, maha, energy', websit, their, pow...
4        [review, quit, a, bit, of, the, combo, player,...
                               ...                        
49995    [hollywood, should, have, left, the, origin, v...
49996    [it, wa, a, poor, copi, although, it, came, sh...
49997    [the, best, part, of, thi, movi, is, that, it'...
49998    [though, pfeiffer, latifah, simpli, continu, t...
49999    [i, didn't, think, i'd, be, interest, in, thi,...
Name: Review, Length: 50000, dtype: object

In [90]:
df_lemm = df_10.apply(lambda x: aplicar_lematizacion(tokenizar(x)))

In [91]:
df_lemm

0        [my, lovely, pat, ha, one, of, the, great, voi...
1        [despite, the, fact, that, i, have, only, play...
2        [i, bought, this, charger, in, jul, 2003, and,...
3        [check, out, maha, energy's, website, their, p...
4        [reviewed, quite, a, bit, of, the, combo, play...
                               ...                        
49995    [hollywood, should, have, left, the, original,...
49996    [it, wa, a, poor, copy, although, it, came, sh...
49997    [the, best, part, of, this, movie, is, that, i...
49998    [though, pfeiffer, latifah, simply, continue, ...
49999    [i, didn't, think, i'd, be, interested, in, th...
Name: Review, Length: 50000, dtype: object

In [24]:
def construir_indice_invertido(col_tokens):
    indice = defaultdict(dict)
    for doc_id, tokens in col_tokens.items():
        contador = Counter(tokens)  # frecuencia de términos en este documento
        for termino, freq in contador.items():
            indice[termino][doc_id] = freq
    return dict(indice)

In [25]:
invertedIndex_lemma = construir_indice_invertido(df_lemm)

In [26]:
invertedIndex_stem = construir_indice_invertido(df_stem)

## TF-IDF

In [109]:
# unir el df_lemm en un texto seguido
df_lemm_text = df_lemm.apply(lambda tokens: " ".join(tokens))

In [110]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_lemm_text)

In [147]:
def procesar_query(query):
    limpio = clean_doc(query)
    tokens = tokenizar(limpio)
    lemmas = aplicar_lematizacion(tokens)
    return " ".join(lemmas)

In [127]:
def buscar(query, top_k=5):
    query_proc = procesar_query(query)
    query_vec = vectorizer.transform([query_proc])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]
    ranking = sims.argsort()[::-1][:top_k]
    return [(i, sims[i]) for i in ranking]

In [163]:
resultados = buscar("rinky-dink", top_k=5)
resultados

[(np.int64(46289), np.float64(0.14540116351465293)),
 (np.int64(18578), np.float64(0.14134383851903443)),
 (np.int64(49998), np.float64(0.0)),
 (np.int64(49997), np.float64(0.0)),
 (np.int64(49996), np.float64(0.0))]

In [153]:
df_lemm_text[46289]

"this charger is not worth a hoot i bought the black decker 3 lawn tool combo it came with a triple battery charger after that i have bought a battery chain saw and a extended hedge trimmer they came with this little rinky-dink battery charger i have had one of them for about 2 year and the other for 3 year and they both went out at the same time black decker should give away the tool because the icad battery don't last and they make a fortune of the price of the useless battery they should make the battery nimh type so you can charge thm at anytime and not have to wear down the battery i get about 15 minute out a battery charge when using my weed trimmer i like the battery operated tool it's just the sorry battery and battery charger i don't like i suggest if you buy any battery operated product that you make sure it come with nimh battery not nicad you can read about the difference between battery on the internet"

In [164]:
df_lemm_text[18578]

'if you look at this it is 3d and appears to be faceted be not decieved it is flat this dissapointment would have been enough for me to score this low but the minus number arise because of the return policy you have to jump through hoopes- slalum climb everest and oh in case you were thinking of sending the item back in a far superior package i e bubble envelope- because you threw out the rinky dinkey rubbish cardbord box that it came in the joke is on you so dumpster diving time what a shame never again'

## Jacard

In [140]:
df_lemm_tokens = df_lemm.apply(set)

In [141]:
df_lemm_tokens

0        {still, thing, wa, never, ha, love, i, jusat, ...
1        {eye, game, purchase, soundtrack, still, sad, ...
2        {get, worked, come, hold, jul, ok, for, would,...
3        {100, check, mh-c204f, they, out, for, website...
4        {replace, special, due, player, cons-, hesitan...
                               ...                        
49995    {drag, hollywood, original, should, energy, ge...
49996    {came, shrink, wa, well, at, did, wrapped, not...
49997    {character, together, could, divine, turned, t...
49998    {status, come, century, becoming, 21st, latifa...
49999    {character, still, ended, completely, wa, neve...
Name: Review, Length: 50000, dtype: object

In [142]:
def jaccard(set1, set2):
    inter = len(set1.intersection(set2))
    union = len(set1.union(set2))
    if union == 0:
        return 0.0
    return inter / union


In [143]:
def procesar_query_jaccard(query):
    limpio = clean_doc(query)
    tokens = tokenizar(limpio)
    lemmas = aplicar_lematizacion(tokens)
    return set(lemmas)

In [157]:
def buscar_jaccard(df, query, top_k=5):
    query_set = procesar_query_jaccard(query)
    resultados = []
    for doc_id, doc_set in df.items():
        sim = jaccard(query_set, doc_set)
        resultados.append((doc_id, sim))
    resultados = sorted(resultados, key=lambda x: x[1], reverse=True)
    return resultados[:top_k]

In [158]:
resultados = buscar_jaccard(df_lemm_tokens, "battery useless", top_k=5)
resultados


[(3890, 0.09523809523809523),
 (13871, 0.06666666666666667),
 (31990, 0.06666666666666667),
 (16964, 0.0625),
 (38489, 0.0625)]

In [159]:
df_lemm_text[3890]

"don't waste time or money on this garbage - you can't mount it and the replacement battery is very expensive useless"

## BM25