In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import gensim

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
def pre_process(json_file):
    df = pd.read_json(json_file, orient='records', lines=True)

    # remove 3 reviews
    df_no3 = df[df['overall'].isin([1,2,4,5])]

    dict_class = {
    1 : 0,
    2 : 0,
    4 : 1,
    5 : 1
    }

    # map reviews to sentiment classification
    df_no3['sentiment'] = df_no3['overall'].map(dict_class)
    df_no3['rev_sum'] = df_no3['summary'] + ' ' + df_no3['reviewText']
    # df_no3.head(3)

    # get only relevant columns
    df_games = pd.DataFrame()
    return df_no3[['rev_sum', 'sentiment']]

In [43]:
pd.set_option('mode.chained_assignment', None) # turn off warning

df_vg = pre_process('Video_Games_5.json')
df_dm = pre_process('Digital_Music_5.json')
df_ac = pre_process('Arts_Crafts_and_Sewing_5.json')

In [17]:
# vg_joint = ' '.join(df_vg['rev_sum'].astype(str))
# dm_joint = ' '.join(df_dm['rev_sum'].astype(str))
# ac_joint = ' '.join(df_ac['rev_sum'].astype(str))

In [38]:
def compare_text(corpus1, corpus2):
    
    # tokenize corpus into sentences
    # list of strings
    first = sent_tokenize(corpus1)
    second = sent_tokenize(corpus2)

    # list of lists, words as tokens for corpus 1
    gen_docs = [[w.lower() for w in word_tokenize(text)] for text in first]

    dictionary = gensim.corpora.Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)

    # # building the index
    sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus], num_features=len(dictionary))

    for line in second:
        # list of lists, words as tokens for corpus 2
        query_doc = [w.lower() for w in word_tokenize(line)]

        # update an existing dictionary and create bag of words
        query_doc_bow = dictionary.doc2bow(query_doc)

    query_doc_tf_idf = tf_idf[query_doc_bow]

    sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))/len(first)
    print(sum_of_sims)

    print('Comparing Result:', sims[query_doc_tf_idf]) 

In [8]:
test = df_vg.head(3)
test['rev_sum'] = test['rev_sum'].apply(word_tokenize)
test_list = list(test['rev_sum'])

In [35]:
def compare_text2(df1, df2):
    
    # tokenize corpus into sentences
    # list of strings
    df1['rev_sum'] = df1['rev_sum'].astype(str).apply(word_tokenize)
    df2['rev_sum'] = df2['rev_sum'].astype(str).apply(word_tokenize)

    # list of lists, words as tokens for corpus 1
    gen_docs = list(df1['rev_sum'])[:100]

    dictionary = gensim.corpora.Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)

    # # building the index
    sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus], num_features=len(dictionary))

    # list of lists, words as tokens for corpus 2
    query_doc = list(df2['rev_sum'])[:100]

    # update an existing dictionary and create bag of words
    query_doc_bow = dictionary.doc2bow(query_doc)

    query_doc_tf_idf = tf_idf[query_doc_bow]

    sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))/len(first)
    print(sum_of_sims)

    print('Comparing Result:', sims[query_doc_tf_idf]) 

In [37]:
# compare_text2(df_dm, df_vg)

In [55]:
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS

def tok(x):
    x = remove_stopwords(x.lower())
    x = word_tokenize(x)
    return x

# s = "This si sentence. ldksajdlksa, so cool, omg, bla bla!"
# tok(s)

In [123]:
df1, df2 = df_dm.head(2), df_dm.head(2)

# tokenize corpus into sentences
# list of strings
df1['rev_sum'] = df1['rev_sum'].astype(str).apply(tok)
df2['rev_sum'] = df2['rev_sum'].astype(str).apply(tok)

In [124]:
# list of lists, words as tokens for corpus 1
gen_docs = list(df1['rev_sum'])[:100]

dictionary = gensim.corpora.Dictionary(gen_docs)
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
tf_idf = gensim.models.TfidfModel(corpus)

In [125]:
# # building the index
sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus], num_features=len(dictionary))

# list of lists, words as tokens for corpus 2
query_doc = list(df2['rev_sum'])[:100]

In [126]:
li = list()
for i in query_doc:
    for j in i:
        li.append(j)

# update an existing dictionary and create bag of words
query_doc_bow = dictionary.doc2bow(li)

query_doc_tf_idf = tf_idf[query_doc_bow]

sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))/len(gen_docs)
print(sum_of_sims)

print('Comparing Result:', sims[query_doc_tf_idf]) 

0.6077032089233398
Comparing Result: [0.9692234  0.24618298]
