In [1]:
import gensim
from gensim import corpora,models
from gensim.models import LdaModel
from gensim.parsing.preprocessing import remove_stopwords,strip_punctuation, strip_numeric,strip_short
import pandas as pd
import unidecode
import csv
import datetime as dt

import pyLDAvis
import pyLDAvis.gensim  # don't skip this

import matplotlib.pyplot as plt

In [2]:
filename = "amazon_reviews_us_Beauty_v1_00.tsv"
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz'

# Read just the first row of the TSV into `a` (it is used as the column names
# when the full table is loaded). `a` stays empty if the file has no rows.
a = []
with open(filename) as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    first_row = next(reader, None)
    if first_row is not None:
        a.append(first_row)

In [3]:
# Load the whole review table, using the previously read first row (a[0]) as
# the column names and skipping that row in the data.
# NOTE(review): the saved output of this cell ends with the tail of a pandas
# warning raised from read_csv -- presumably the mixed-type DtypeWarning.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, at the cost of more memory while parsing.
fdf = pd.read_csv(filename, delimiter='\t', names=a[0], skiprows=[0],
                  low_memory=False)

# Keep only rows whose number of missing fields is different from 7.
mask = fdf.isnull().sum(axis=1) != 7
# .copy() detaches the filtered frame from the original so the column
# assignments below operate on an independent object (no SettingWithCopyWarning).
fdf = fdf[mask].copy()

# Unparseable dates become NaT (errors='coerce') and are removed by dropna().
fdf["review_date"] = pd.to_datetime(fdf["review_date"], format="%Y-%m-%d", errors="coerce")
# In-place is safe here: fdf is the independent copy created just above.
fdf.dropna(inplace=True)

# Vectorised year extraction instead of a per-row Python lambda.
fdf["year"] = fdf["review_date"].dt.year

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Count the distinct review ids that fall in each review year.
reviews_by_year = fdf.groupby('year')
year_count = reviews_by_year['review_id'].nunique()

In [5]:
# Turn the per-year counts into a plain {year: count} dict and display it.
# NOTE(review): this rebinds `year_count` from the grouped result to a dict, so
# re-running this cell on its own fails (a dict has no .to_dict()); re-run the
# cell that builds `year_count` first.
year_count = year_count.to_dict()
year_count

{2000: 33,
 2001: 257,
 2002: 567,
 2003: 1252,
 2004: 3604,
 2005: 7889,
 2006: 11601,
 2007: 28486,
 2008: 39393,
 2009: 57243,
 2010: 95308,
 2011: 180718,
 2012: 342130,
 2013: 919622,
 2014: 1604463,
 2015: 1801230}

In [10]:
# Partition logic
# At most `thresh` randomly sampled reviews per year; consecutive years are
# then concatenated into multi-year partitions.

SAMPLE_SEED = 100      # fixed seed so the sampled partitions are reproducible
YEARS_PER_GROUP = 3    # number of consecutive years merged into one partition

# Year 2000 is left out of the partitions; filtering (rather than list.remove)
# does not fail if that year is absent from the data.
years = [year for year in year_count.keys() if year != 2000]

df_list = []
thresh = 30000
for year in years:
    temp_df = fdf[fdf["year"] == year]
    # Only down-sample years that have more than `thresh` distinct reviews.
    if year_count[year] > thresh:
        temp_df = temp_df.sample(thresh, random_state=SAMPLE_SEED)
    df_list.append(temp_df)

# Build the partitions from however many yearly frames exist, instead of
# assuming exactly 5 groups of 3 (pd.concat raises on an empty slice).
concat_dfs = [
    pd.concat(df_list[start:start + YEARS_PER_GROUP])
    for start in range(0, len(df_list), YEARS_PER_GROUP)
]

In [28]:
def preprocess(doc):
    """Turn one review string into a list of tokens.

    The text is lower-cased, then passed through strip_punctuation,
    strip_numeric, remove_stopwords and strip_short (with 3 as its second
    argument), in that order, and finally split on whitespace.
    """
    text = doc.lower()
    text = strip_punctuation(text)
    text = strip_numeric(text)
    text = remove_stopwords(text)
    text = strip_short(text, 3)
    return text.split()

def lda_modeling_df(df, num_topics=15):
    """Train an LDA topic model on the "review_body" column of `df`.

    Parameters
    ----------
    df : DataFrame with a "review_body" column of review strings.
    num_topics : number of topics to fit (defaults to 15, the value that was
        previously hard-coded).

    Returns
    -------
    (lda_model, corpus, dictionary): the trained LdaModel, the bag-of-words
    corpus it was trained on, and the filtered gensim Dictionary.
    """
    # Tokenise every review exactly once; the token lists are reused for both
    # building the dictionary and producing the bag-of-words corpus.
    tokenized_docs = [preprocess(text) for text in df["review_body"]]

    dictionary = corpora.Dictionary(tokenized_docs)
    # Prune the vocabulary *before* encoding, so the corpus is built a single
    # time against the final token ids.
    dictionary.filter_extremes(no_below=2, no_above=0.99)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         random_state=100,  # fixed seed for repeatable topics
                         num_topics=num_topics,
                         passes=5,
                         chunksize=10000,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)
    return lda_model, corpus, dictionary

def print_lda_models(lda_model, dictionary, topn=10):
    """Print the highest-weighted words of every topic in `lda_model`.

    Parameters
    ----------
    lda_model : trained model exposing `num_topics` and
        `get_topic_terms(topic_id, topn=...)`, which yields (word_id, weight) pairs.
    dictionary : mapping from word id to the word string.
    topn : number of words to print per topic (defaults to 10, the value that
        was previously hard-coded).
    """
    # Iterate over the model's own topic count rather than a hard-coded 15, so
    # models trained with a different number of topics print correctly.
    for topic_id in range(lda_model.num_topics):
        top_terms = lda_model.get_topic_terms(topic_id, topn=topn)
        print("Topic : " + str(topic_id))
        # Distinct loop names: the inner loop no longer shadows the topic index.
        for word_id, weight in top_terms:
            print("Word: " + str(dictionary[word_id]) + "\t\t Weight: " + str(weight))
        print("\n")

In [22]:
# Fit LDA on the first partition (concat_dfs[0]) and list its topics.
lda_model, corpus, dictionary = lda_modeling_df(concat_dfs[0])
print_lda_models(lda_model, dictionary)

# Print the log-perplexity result: previously it was computed and then silently
# discarded, because only a cell's last expression is displayed.
print("Log perplexity: " + str(lda_model.log_perplexity(corpus)))

# Interactive topic visualisation; kept as the last expression so it renders.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

Topic : 0
Word: hair		 Weight: 0.02796978
Word: product		 Weight: 0.013069683
Word: use		 Weight: 0.012739787
Word: like		 Weight: 0.009500651
Word: time		 Weight: 0.008429473
Word: great		 Weight: 0.007223496
Word: good		 Weight: 0.006054234
Word: quot		 Weight: 0.0050058234
Word: little		 Weight: 0.0049772053
Word: dryer		 Weight: 0.0046792566


Topic : 1
Word: great		 Weight: 0.0061771027
Word: use		 Weight: 0.0035128507
Word: teeth		 Weight: 0.0030803494
Word: water		 Weight: 0.0026960003
Word: like		 Weight: 0.0025172052
Word: mouth		 Weight: 0.0024244657
Word: toothbrush		 Weight: 0.0022894498
Word: works		 Weight: 0.0021334735
Word: brush		 Weight: 0.0019598796
Word: clean		 Weight: 0.0018761345


Topic : 2
Word: shave		 Weight: 0.0035869174
Word: razor		 Weight: 0.0028521158
Word: use		 Weight: 0.00217578
Word: close		 Weight: 0.0014665622
Word: razors		 Weight: 0.0013609072
Word: skin		 Weight: 0.0011674041
Word: blade		 Weight: 0.0011034914
Word: years		 Weight: 0.0010959306


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [23]:
# Fit LDA on the second partition (concat_dfs[1]) and list its topics.
lda_model2, corpus2, dictionary2 = lda_modeling_df(concat_dfs[1])
print_lda_models(lda_model2, dictionary2)

# Print the log-perplexity result: previously it was computed and then silently
# discarded, because only a cell's last expression is displayed.
print("Log perplexity: " + str(lda_model2.log_perplexity(corpus2)))

# Interactive topic visualisation; kept as the last expression so it renders.
vis2 = pyLDAvis.gensim.prepare(lda_model2, corpus2, dictionary2)
pyLDAvis.display(vis2)

Topic : 0
Word: skin		 Weight: 0.03076669
Word: product		 Weight: 0.014369206
Word: like		 Weight: 0.014217184
Word: use		 Weight: 0.011932023
Word: products		 Weight: 0.0095928665
Word: love		 Weight: 0.008625467
Word: smell		 Weight: 0.00844801
Word: great		 Weight: 0.008408452
Word: cream		 Weight: 0.008285627
Word: face		 Weight: 0.0072018844


Topic : 1
Word: shave		 Weight: 0.0116998255
Word: razor		 Weight: 0.010645888
Word: product		 Weight: 0.010406699
Word: use		 Weight: 0.010015616
Word: shaver		 Weight: 0.009596504
Word: like		 Weight: 0.009523407
Word: good		 Weight: 0.008032816
Word: time		 Weight: 0.007822371
Word: great		 Weight: 0.0075393897
Word: shaving		 Weight: 0.0061754473


Topic : 2
Word: radio		 Weight: 0.009018258
Word: station		 Weight: 0.0056025744
Word: time		 Weight: 0.0029351301
Word: reception		 Weight: 0.0029078657
Word: good		 Weight: 0.0027179571
Word: great		 Weight: 0.0023866398
Word: like		 Weight: 0.0022987237
Word: product		 Weight: 0.0021035138


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [24]:
# Fit LDA on the third partition (concat_dfs[2]) and list its topics.
lda_model3, corpus3, dictionary3 = lda_modeling_df(concat_dfs[2])
print_lda_models(lda_model3, dictionary3)

# Print the log-perplexity result: previously it was computed and then silently
# discarded, because only a cell's last expression is displayed.
print("Log perplexity: " + str(lda_model3.log_perplexity(corpus3)))

# Interactive topic visualisation; kept as the last expression so it renders.
vis3 = pyLDAvis.gensim.prepare(lda_model3, corpus3, dictionary3)
pyLDAvis.display(vis3)

Topic : 0
Word: product		 Weight: 0.014355447
Word: use		 Weight: 0.013695958
Word: hair		 Weight: 0.011133545
Word: dryer		 Weight: 0.008316954
Word: like		 Weight: 0.0074940454
Word: time		 Weight: 0.0070615676
Word: great		 Weight: 0.006885495
Word: work		 Weight: 0.0066206716
Word: good		 Weight: 0.005878245
Word: works		 Weight: 0.0056024943


Topic : 1
Word: product		 Weight: 0.017719502
Word: scent		 Weight: 0.014456585
Word: like		 Weight: 0.014357634
Word: great		 Weight: 0.011915524
Word: love		 Weight: 0.011058695
Word: smell		 Weight: 0.011006232
Word: amazon		 Weight: 0.0103687225
Word: good		 Weight: 0.0088149905
Word: price		 Weight: 0.007973697
Word: perfume		 Weight: 0.007132872


Topic : 2
Word: hair		 Weight: 0.09304772
Word: product		 Weight: 0.024037013
Word: use		 Weight: 0.014717935
Word: great		 Weight: 0.011421356
Word: like		 Weight: 0.011280172
Word: shampoo		 Weight: 0.010501885
Word: dry		 Weight: 0.009015254
Word: iron		 Weight: 0.008812454
Word: condition

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [25]:
# Fit LDA on the fourth partition (concat_dfs[3]) and list its topics.
lda_model4, corpus4, dictionary4 = lda_modeling_df(concat_dfs[3])
print_lda_models(lda_model4, dictionary4)

# Print the log-perplexity result: previously it was computed and then silently
# discarded, because only a cell's last expression is displayed.
print("Log perplexity: " + str(lda_model4.log_perplexity(corpus4)))

# Interactive topic visualisation; kept as the last expression so it renders.
vis4 = pyLDAvis.gensim.prepare(lda_model4, corpus4, dictionary4)
pyLDAvis.display(vis4)

Topic : 0
Word: skin		 Weight: 0.0381472
Word: product		 Weight: 0.02549241
Word: use		 Weight: 0.016968166
Word: face		 Weight: 0.013906927
Word: like		 Weight: 0.011693969
Word: great		 Weight: 0.008042677
Word: cream		 Weight: 0.00772883
Word: dry		 Weight: 0.007617372
Word: love		 Weight: 0.0071109254
Word: products		 Weight: 0.0070954235


Topic : 1
Word: color		 Weight: 0.018652692
Word: like		 Weight: 0.011917157
Word: use		 Weight: 0.010065471
Word: great		 Weight: 0.009927033
Word: product		 Weight: 0.009200438
Word: nail		 Weight: 0.008946049
Word: colors		 Weight: 0.008522354
Word: nails		 Weight: 0.007944044
Word: time		 Weight: 0.007632467
Word: love		 Weight: 0.0075391983


Topic : 2
Word: brush		 Weight: 0.043739084
Word: brushes		 Weight: 0.019843971
Word: great		 Weight: 0.016920678
Word: good		 Weight: 0.015050183
Word: use		 Weight: 0.014250283
Word: soap		 Weight: 0.013095092
Word: set		 Weight: 0.011175428
Word: size		 Weight: 0.011067422
Word: quality		 Weight: 0.

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [26]:
# Fit LDA on the fifth partition (concat_dfs[4]) and list its topics.
lda_model5, corpus5, dictionary5 = lda_modeling_df(concat_dfs[4])
print_lda_models(lda_model5, dictionary5)

# Print the log-perplexity result: previously it was computed and then silently
# discarded, because only a cell's last expression is displayed.
print("Log perplexity: " + str(lda_model5.log_perplexity(corpus5)))

# Interactive topic visualisation; kept as the last expression so it renders.
vis5 = pyLDAvis.gensim.prepare(lda_model5, corpus5, dictionary5)
pyLDAvis.display(vis5)

Topic : 0
Word: great		 Weight: 0.04429684
Word: product		 Weight: 0.04185186
Word: love		 Weight: 0.036994975
Word: good		 Weight: 0.029183166
Word: price		 Weight: 0.018750405
Word: smell		 Weight: 0.012040591
Word: buy		 Weight: 0.011778903
Word: scent		 Weight: 0.011645627
Word: works		 Weight: 0.010448836
Word: nice		 Weight: 0.009770115


Topic : 1
Word: use		 Weight: 0.014295385
Word: razor		 Weight: 0.011384797
Word: like		 Weight: 0.010052237
Word: shave		 Weight: 0.009758344
Word: time		 Weight: 0.0076453662
Word: good		 Weight: 0.007509365
Word: great		 Weight: 0.0072081066
Word: works		 Weight: 0.0062778434
Word: shaving		 Weight: 0.006107851
Word: better		 Weight: 0.005973552


Topic : 2
Word: like		 Weight: 0.023028804
Word: color		 Weight: 0.022893257
Word: product		 Weight: 0.013636006
Word: love		 Weight: 0.011014316
Word: hair		 Weight: 0.010543395
Word: great		 Weight: 0.009501235
Word: use		 Weight: 0.008869348
Word: look		 Weight: 0.008681431
Word: little		 Weight:

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [27]:
# Fit LDA on all partitions combined and list its topics.
flda_model, fcorpus, fdictionary = lda_modeling_df(pd.concat(concat_dfs))
print_lda_models(flda_model, fdictionary)

# Print the log-perplexity result: previously it was computed and then silently
# discarded, because only a cell's last expression is displayed.
print("Log perplexity: " + str(flda_model.log_perplexity(fcorpus)))

# Interactive topic visualisation; kept as the last expression so it renders.
fvis = pyLDAvis.gensim.prepare(flda_model, fcorpus, fdictionary)
pyLDAvis.display(fvis)

Topic : 0
Word: product		 Weight: 0.054562226
Word: great		 Weight: 0.02788164
Word: price		 Weight: 0.024358567
Word: good		 Weight: 0.024081694
Word: buy		 Weight: 0.011310557
Word: amazon		 Weight: 0.0111029595
Word: received		 Weight: 0.010799388
Word: quality		 Weight: 0.009927587
Word: item		 Weight: 0.009921778
Word: order		 Weight: 0.009825662


Topic : 1
Word: like		 Weight: 0.014900625
Word: nails		 Weight: 0.014369641
Word: nail		 Weight: 0.013992579
Word: polish		 Weight: 0.013034503
Word: dryer		 Weight: 0.012642407
Word: great		 Weight: 0.012496991
Word: use		 Weight: 0.01198698
Word: love		 Weight: 0.011822145
Word: iron		 Weight: 0.011656356
Word: hot		 Weight: 0.008641853


Topic : 2
Word: love		 Weight: 0.057035357
Word: years		 Weight: 0.028383577
Word: bought		 Weight: 0.024402732
Word: amazon		 Weight: 0.02095829
Word: loves		 Weight: 0.016169636
Word: product		 Weight: 0.015292991
Word: great		 Weight: 0.014992834
Word: stores		 Weight: 0.014721635
Word: store		 W

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
