In [36]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Data directory and the column names to assign: the CSV inside the zip
# archive is read with header=None, so the names are supplied explicitly.
path = './data/'
columnas = ['Score', 'Title', 'Review']

archivo = path + 'AmazonReview.zip'
df = pd.read_csv(
    archivo,
    names=columnas,
    header=None,
    compression='zip',
)
df

Unnamed: 0,Score,Title,Review
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
...,...,...,...
399995,1,Unbelievable- In a Bad Way,We bought this Thomas for our son who is a hug...
399996,1,"Almost Great, Until it Broke...",My son recieved this as a birthday gift 2 mont...
399997,1,Disappointed !!!,"I bought this toy for my son who loves the ""Th..."
399998,2,Classic Jessica Mitford,This is a compilation of a wide range of Mitfo...


In [6]:
def clean_doc(doc):
    """Normalize a raw review into lowercase, space-separated text.

    Steps: remove HTML-like tags (``<...>``), replace every run of characters
    other than ASCII letters, digits, hyphen and apostrophe with a single
    space, lowercase the result and trim both ends.

    Args:
        doc: Raw text. ``None`` and float NaN (the value pandas uses for a
            missing CSV field) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    if not isinstance(doc, str):
        # re.sub raises TypeError on non-strings, so a single missing review
        # would abort a whole Series.apply(clean_doc) pass.
        # NaN is the only float that is not equal to itself.
        if doc is None or (isinstance(doc, float) and doc != doc):
            return ''
        doc = str(doc)
    doc = re.sub(pattern=r'<.*?>', repl='', string=doc)
    doc = re.sub(pattern=r'[^A-Za-z0-9\-\']+', repl=' ', string=doc)
    # The substitution above already collapses every run of disallowed
    # characters (whitespace included) into one space, so only the leading
    # and trailing space still need to be removed.
    return doc.lower().strip()
def tokenizar(review):
    """Split an already-cleaned review into tokens on runs of whitespace.

    Returns a list of strings; an empty or whitespace-only review yields [].
    """
    tokens = review.split()
    return tokens

In [7]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Shared NLTK normalizer instances, created once and reused by
# aplicar_stemming() and aplicar_lematizacion() below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def aplicar_stemming(tokens):
    """Return a new list with the stem of each token, in the same order.

    Stemming is delegated to the module-level ``stemmer`` instance.
    """
    raices = []
    for palabra in tokens:
        raices.append(stemmer.stem(palabra))
    return raices

def aplicar_lematizacion(tokens):
    """Return a new list with the lemma of each token, in the same order.

    Lemmatization is delegated to the module-level ``lemmatizer`` instance,
    called with its default arguments.
    """
    lemas = []
    for palabra in tokens:
        lemas.append(lemmatizer.lemmatize(palabra))
    return lemas


In [8]:
import nltk
# Download the 'wordnet' and 'omw-1.4' NLTK data packages.
# NOTE(review): presumably these are the resources the WordNetLemmatizer
# needs at lemmatize() time — confirm, and keep this cell ahead of the first
# call to aplicar_lematizacion() so a fresh kernel can run top to bottom.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus12\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\asus12\AppData\Roaming\nltk_data...


True

In [9]:
# Cleaned text of every review, one string per row (same index as df).
df_clean = df["Review"].map(clean_doc)

In [87]:
# Working subset: the first 50000 cleaned reviews.
# NOTE(review): despite the "_10" suffix this keeps 50000 rows; consider a
# named constant for the sample size.
df_10 = df_clean.iloc[:50000]

In [88]:
# Tokenize each review of the subset, then stem every token
# (one list of stems per row).
df_stem = df_10.map(tokenizar).map(aplicar_stemming)

In [89]:
df_stem

0        [my, love, pat, ha, one, of, the, great, voic,...
1        [despit, the, fact, that, i, have, onli, play,...
2        [i, bought, thi, charger, in, jul, 2003, and, ...
3        [check, out, maha, energy', websit, their, pow...
4        [review, quit, a, bit, of, the, combo, player,...
                               ...                        
49995    [hollywood, should, have, left, the, origin, v...
49996    [it, wa, a, poor, copi, although, it, came, sh...
49997    [the, best, part, of, thi, movi, is, that, it'...
49998    [though, pfeiffer, latifah, simpli, continu, t...
49999    [i, didn't, think, i'd, be, interest, in, thi,...
Name: Review, Length: 50000, dtype: object

In [90]:
# Tokenize each review of the subset, then lemmatize every token
# (one list of lemmas per row).
df_lemm = df_10.map(tokenizar).map(aplicar_lematizacion)

In [91]:
df_lemm

0        [my, lovely, pat, ha, one, of, the, great, voi...
1        [despite, the, fact, that, i, have, only, play...
2        [i, bought, this, charger, in, jul, 2003, and,...
3        [check, out, maha, energy's, website, their, p...
4        [reviewed, quite, a, bit, of, the, combo, play...
                               ...                        
49995    [hollywood, should, have, left, the, original,...
49996    [it, wa, a, poor, copy, although, it, came, sh...
49997    [the, best, part, of, this, movie, is, that, i...
49998    [though, pfeiffer, latifah, simply, continue, ...
49999    [i, didn't, think, i'd, be, interested, in, th...
Name: Review, Length: 50000, dtype: object

In [24]:
def construir_indice_invertido(col_tokens):
    """Build an inverted index from a mapping of document id -> token list.

    Args:
        col_tokens: Any object exposing ``.items()`` that yields
            ``(doc_id, tokens)`` pairs (e.g. a dict or a pandas Series).

    Returns:
        A plain dict ``{term: {doc_id: frequency}}`` where ``frequency`` is
        the number of times ``term`` occurs in that document's tokens.
    """
    postings = {}
    for doc_id, tokens in col_tokens.items():
        # Counter yields each distinct term of this document with its count.
        for termino, freq in Counter(tokens).items():
            postings.setdefault(termino, {})[doc_id] = freq
    return postings

In [25]:
# Inverted index (term -> {doc_id: frequency}) over the lemmatized reviews.
invertedIndex_lemma = construir_indice_invertido(df_lemm)

In [26]:
# Inverted index (term -> {doc_id: frequency}) over the stemmed reviews.
invertedIndex_stem = construir_indice_invertido(df_stem)

In [95]:
# Join each review's lemmas back into a single space-separated string.
df_lemm_text = df_lemm.map(" ".join)

In [96]:
df_lemm_text

0        my lovely pat ha one of the great voice of her...
1        despite the fact that i have only played a sma...
2        i bought this charger in jul 2003 and it worke...
3        check out maha energy's website their powerex ...
4        reviewed quite a bit of the combo player and w...
                               ...                        
49995    hollywood should have left the original versio...
49996    it wa a poor copy although it came shrink wrap...
49997    the best part of this movie is that it's event...
49998    though pfeiffer latifah simply continue to mai...
49999    i didn't think i'd be interested in this movie...
Name: Review, Length: 50000, dtype: object

## TF-IDF

In [83]:
# Fit a TF-IDF model on the lemmatized, space-joined reviews and keep the
# resulting document-term matrix for similarity search.
# NOTE(review): TfidfVectorizer() is used with its default settings, so it
# applies its own tokenization to the joined text rather than reusing the
# tokens produced by tokenizar() — confirm that is intended.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_lemm_text)

In [84]:
def procesar_query(query):
    """Normalize a raw query the same way the indexed reviews were prepared.

    Applies clean_doc -> tokenizar -> aplicar_lematizacion and returns the
    lemmas joined by single spaces, ready for ``vectorizer.transform``.
    """
    return " ".join(aplicar_lematizacion(tokenizar(clean_doc(query))))

def buscar(query, top_k=5):
    """Rank the indexed reviews by TF-IDF cosine similarity to ``query``.

    Args:
        query: Free-text query; it is normalized with ``procesar_query``
            before being vectorized.
        top_k: Maximum number of hits to return. ``None`` means no limit;
            a value <= 0 returns an empty list.

    Returns:
        A list of ``(row, score)`` tuples sorted by descending score, where
        ``row`` is the row position in ``tfidf_matrix`` (not an index label)
        and ``score`` is the cosine similarity. Documents with zero
        similarity are never returned, so the list may be shorter than
        ``top_k`` (and is empty when the query shares no term with the
        fitted vocabulary).
    """
    if top_k is not None and top_k <= 0:
        # A negative slice bound would silently mean "all but the last N".
        return []
    query_proc = procesar_query(query)
    query_vec = vectorizer.transform([query_proc])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]
    ranking = sims.argsort()[::-1][:top_k]
    # A score of 0 means no shared term at all; returning such rows would
    # present arbitrary documents as matches.
    return [(i, sims[i]) for i in ranking if sims[i] > 0]

In [104]:
# Example query: the five reviews most similar to "useless battery",
# shown as (row, cosine similarity) pairs.
resultados = buscar("useless battery", top_k=5)
resultados

[(np.int64(4951), np.float64(0.4934382672942171)),
 (np.int64(3890), np.float64(0.465599611336124)),
 (np.int64(7540), np.float64(0.4167175029683224)),
 (np.int64(7138), np.float64(0.36757200036323967)),
 (np.int64(4953), np.float64(0.3600252545251739))]

In [105]:
# buscar() reports row positions of tfidf_matrix, so fetch the review by
# position (.iloc) instead of by index label.
df_lemm_text.iloc[4951]

'i bought a new battery but it is behaving like a used battery the battery live doe not last more than 2 day when not in use'

In [106]:
# buscar() reports row positions of tfidf_matrix, so fetch the review by
# position (.iloc) instead of by index label.
df_lemm_text.iloc[3890]

"don't waste time or money on this garbage - you can't mount it and the replacement battery is very expensive useless"