In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import gensim

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
def pre_process(json_file):
    """Load a JSON-lines review file and reduce it to text plus a binary label.

    Parameters
    ----------
    json_file : str or path-like
        File with one JSON record per line. The records must provide an
        ``overall`` rating and the ``summary`` and ``reviewText`` fields.

    Returns
    -------
    pandas.DataFrame
        Two columns, in this order:
        ``rev_sum``   -- summary and review text joined by a single space
        (a missing summary or review text counts as empty text);
        ``sentiment`` -- 0 for ratings 1-2, 1 for ratings 4-5.
        Rows with any other rating (e.g. 3-star reviews) are dropped and the
        original row index is kept.
    """
    df = pd.read_json(json_file, orient='records', lines=True)

    # rating -> sentiment class; ratings absent from this map are filtered out
    rating_to_sentiment = {1: 0, 2: 0, 4: 1, 5: 1}

    # .copy() yields an independent frame, so the column assignments below do
    # not write into a slice of df (no SettingWithCopyWarning to suppress).
    rated = df[df['overall'].isin(list(rating_to_sentiment))].copy()

    rated['sentiment'] = rated['overall'].map(rating_to_sentiment)

    # Without fillna a single missing field turns the whole concatenation into
    # NaN, which later ends up as the literal text "nan" once cast to str.
    rated['rev_sum'] = rated['summary'].fillna('') + ' ' + rated['reviewText'].fillna('')

    return rated[['rev_sum', 'sentiment']]

In [3]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session
pd.set_option('mode.chained_assignment', None)

# One pre-processed frame per product category:
# digital music, video games, arts/crafts/sewing
df_dm, df_vg, df_ac = [
    pre_process(path)
    for path in (
        'Digital_Music_5.json',
        'Video_Games_5.json',
        'Arts_Crafts_and_Sewing_5.json',
    )
]

In [4]:
# Peek at the first ten entries of the first column (the combined review text)
df_dm.iloc[:, 0].head(10)

0    Slayer Rules! This is awesome to listen to, A ...
1                                      Five Stars bien
2    SLAYER!!!!!!!!!!!!!!!!!!!!! It was great to he...
3    slayer greatest hits! you mean everything righ...
4    This is a good, blessing filled What can I say...
5      Four Stars Enjoy Casting Crowns and their songs
6    Can't say enough.  Great Christian music.  God...
7    DEFINITELY DESERVES PERFECT STARS!!!! I love t...
8    Can't go wrong with Casting Crowns This is an ...
9    Great music, but even better if you see them l...
Name: rev_sum, dtype: object

In [5]:
from nltk.tokenize import RegexpTokenizer

def make_full_corpus(df, n):
    """Concatenate the first ``n`` entries of ``df``'s first column into one string.

    Every entry is cast to ``str`` and the pieces are separated by single
    spaces. The result is one document string, not a list of tokens.
    """
    leading_texts = df.iloc[:n, 0].astype(str)
    return ' '.join(leading_texts)


# dictionary = gensim.corpora.Dictionary(gen_docs)

In [42]:
# One concatenated string per category, built from the first 10 reviews of each.
# Despite the "tokens_" prefix these are plain strings, not token lists.
tokens_dm, tokens_vg, tokens_ac = [
    make_full_corpus(frame, 10) for frame in (df_dm, df_vg, df_ac)
]

In [43]:
def cosine_sim(str1, str2):
    """Return the TF-IDF cosine similarity between two text documents.

    The two strings are vectorised together, so they share one vocabulary and
    one set of IDF weights. English stop words are removed, and a token must
    start with a letter a-z followed by at least one more word character
    (single-character tokens and tokens starting with a digit or underscore
    are therefore skipped).

    Parameters
    ----------
    str1, str2 : str
        The documents to compare.

    Returns
    -------
    float
        Cosine similarity between the TF-IDF vector of ``str1`` and that of
        ``str2``.
    """
    # Kept function-local: the notebook's other sklearn imports live in later cells.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Raw string: '\w' in a plain literal is an invalid escape sequence, which
    # newer Python versions warn about.
    vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'[a-z]\w+')
    tfidf = vectorizer.fit_transform([str1, str2])

    # Row 0 against every row; column 1 is the similarity with str2.
    return cosine_similarity(tfidf[0:1], tfidf)[0][1]

In [44]:
# Similarity of the digital-music text to the video-games text and to the
# arts-and-crafts text, shown side by side
(
    cosine_sim(tokens_dm, tokens_vg),
    cosine_sim(tokens_dm, tokens_ac),
)

(0.06429631963438541, 0.08564034616678716)

In [7]:
# Two-document corpus: the digital-music string and the video-games string
# (each is one concatenated review text, as returned by make_full_corpus).
corpus = [tokens_dm, tokens_vg]

The TF-IDF / cosine-similarity comparison used here is adapted from: https://medium.com/geekculture/how-to-compare-two-strings-using-sklearns-tdidfvectorizer-and-cosine-similarity-21e8b42371be

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Stop words are removed on purpose: with a bare TfidfVectorizer() the cosine
# similarity comes out much higher. The token pattern keeps only tokens that
# start with a letter a-z; it is a raw string because '\w' in a plain literal
# is an invalid escape sequence.
vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'[a-z]\w+')
trsfm = vectorizer.fit_transform(corpus)

# One row per document, one column per vocabulary term
pd.DataFrame(
    trsfm.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=['dm_doc', 'vg_doc'],
)

Unnamed: 0,a10,aa,aaa,aaaa,aahhh,abandon,abandoned,abc,abdul,abhout,...,zillion,zing,zip,zl7xqwcas,zombie,zombieland,zomg,zone,zoom,zora
dm_doc,0.0,0.0,0.0,0.000827,0.0,0.000588,0.0,0.003308,0.001654,0.0,...,0.000588,0.0,0.0,0.0,0.0,0.000827,0.0,0.002354,0.0,0.0
vg_doc,0.001001,0.000501,0.000501,0.0,0.000501,0.000356,0.001001,0.0,0.0,0.000501,...,0.000356,0.000501,0.000501,0.000501,0.001001,0.0,0.000501,0.006769,0.000501,0.000501


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Dense single-row TF-IDF vector (as a nested list) for each of the two documents
dm_vec, vg_vec = [trsfm[row:row + 1].toarray().tolist() for row in (0, 1)]

# Compare document 0 with every row of the matrix; column 1 holds its
# similarity to document 1
cosine_similarity(trsfm[0:1], trsfm)[0][1]

0.2812048829317365

In [20]:
from scipy.special import kl_div

# Element-wise KL terms with P = dm_vec and Q = vg_vec. scipy's kl_div returns
# one value per vocabulary entry (an array, as the output below shows), not a
# single number; the terms would still have to be summed to get KL(P || Q).
# NOTE(review): dm_vec / vg_vec are TF-IDF rows, which presumably do not sum
# to 1 -- confirm they are normalised to probability distributions before
# reading the result as a KL divergence.
kl_div(dm_vec, vg_vec)

array([[0.00100143, 0.00050071, 0.00050071, ..., 0.00192906, 0.00050071,
        0.00050071]])

In [11]:
# from https://www.statology.org/kl-divergence-python/
from scipy.special import rel_entr
# Two discrete distributions over the same eight outcomes (each sums to 1)
P = [0.05, 0.10, 0.20, 0.05, 0.15, 0.25, 0.08, 0.12]
Q = [0.30, 0.10, 0.20, 0.10, 0.10, 0.02, 0.08, 0.10]

# KL(P || Q): rel_entr yields the per-outcome terms; adding them up gives the divergence
sum(term for term in rel_entr(P, Q))

0.589885181619163