In [50]:
import gensim
from gensim import corpora,models
from gensim.models import LdaModel
from gensim.parsing.preprocessing import remove_stopwords,strip_punctuation, strip_numeric,strip_short
import pandas as pd
import unidecode
import csv
import datetime as dt

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

In [25]:
# Local path of the Amazon US Beauty reviews TSV that is loaded with pd.read_csv below.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
filename = "/Users/anastasyatoropova/Downloads/amazon_reviews_us_Beauty_v1_00.tsv"
# Remote gzipped file with the same base name; presumably the download source for
# `filename` (not referenced by any of the visible cells) — TODO confirm.
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz'

In [72]:
# Load the raw review table, replacing the file's first row with explicit column names.
# NOTE(review): `a` is not defined in any visible cell; presumably a[0] holds the header
# row read earlier — confirm it still exists after Restart Kernel -> Run All.
fdf = pd.read_csv(filename, delimiter='\t', names=a[0], skiprows=[0])

# Keep every row except those in which exactly 7 fields are missing.
mask = fdf.isnull().sum(axis=1) != 7

# Build the cleaned frame as one chain of new objects instead of mutating a filtered
# slice in place (avoids SettingWithCopyWarning and keeps the cell safe to re-run):
#   1. parse review_date; unparseable values become NaT because of errors='coerce'
#   2. drop every row that still has a missing value (including the NaT dates)
#   3. derive the integer review year with the vectorized .dt accessor
fdf = (
    fdf[mask]
    .assign(
        review_date=lambda frame: pd.to_datetime(
            frame["review_date"], format="%Y-%m-%d", errors="coerce"
        )
    )
    .dropna()
    .assign(year=lambda frame: frame["review_date"].dt.year)
)

  interactivity=interactivity, compiler=compiler, result=result)


In [78]:
# Count the distinct review ids within each review year.
year_count = fdf.groupby('year')['review_id'].nunique()

In [79]:
# Display the per-year distinct review counts.
year_count

year
2000         33
2001        257
2002        567
2003       1252
2004       3604
2005       7889
2006      11601
2007      28486
2008      39393
2009      57243
2010      95308
2011     180718
2012     342130
2013     919622
2014    1604463
2015    1801244
Name: review_id, dtype: int64

In [86]:
# Partition the reviews into four consecutive, non-overlapping periods by review year.
# `between` is inclusive on both ends, so the integer year bounds are spelled out directly.
dft1 = fdf.loc[fdf["year"].le(2005)]               # up to and including 2005
dft2 = fdf.loc[fdf["year"].between(2006, 2009)]    # 2006 through 2009
dft3 = fdf.loc[fdf["year"].between(2010, 2012)]    # 2010 through 2012
dft4 = fdf.loc[fdf["year"].gt(2012)]               # 2013 onwards

In [88]:
def preprocess(doc):
    """Convert a raw document string into a list of cleaned tokens.

    The text is lower-cased and then passed, in order, through gensim's
    ``strip_punctuation``, ``strip_numeric``, ``remove_stopwords`` and
    ``strip_short`` (minimum token length 3) before being split on whitespace.

    Args:
        doc: The document text (must support ``.lower()``).

    Returns:
        List of token strings.
    """
    text = doc.lower()
    text = strip_punctuation(text)
    text = strip_numeric(text)
    text = remove_stopwords(text)
    # Second argument is the minimum length a token needs in order to be kept.
    text = strip_short(text, 3)
    return text.split()

def lda_modeling_df(df, num_topics=15):
    """Train an LDA topic model on the ``review_body`` column of ``df``.

    Every review is tokenized exactly once with ``preprocess``. A gensim
    dictionary is built from those tokens and pruned with ``filter_extremes``,
    and only afterwards is each review converted to a bag-of-words vector.

    Args:
        df: DataFrame with a ``review_body`` column of review texts.
        num_topics: Number of topics to fit. Defaults to 15, the value that was
            previously hard-coded.

    Returns:
        Tuple ``(lda_model, corpus, dictionary)``: the fitted ``LdaModel``, the
        list of bag-of-words vectors it was trained on, and the pruned
        ``corpora.Dictionary`` that maps word ids to words.
    """
    # Tokenize once and reuse the result; preprocessing is the costly per-document step.
    tokenized_reviews = [preprocess(review) for review in df["review_body"]]

    dictionary = corpora.Dictionary(tokenized_reviews)
    # Prune the vocabulary BEFORE vectorizing: filter_extremes compacts the word ids,
    # so bag-of-words vectors created earlier would refer to stale ids.
    dictionary.filter_extremes(no_below=2, no_above=0.99)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_reviews]

    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         random_state=100,  # fixed seed for reproducible topics
                         num_topics=num_topics,
                         passes=5,
                         chunksize=10000,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)
    return lda_model, corpus, dictionary

def print_lda_models(lda_model, dictionary, topn=10):
    """Print the highest-weighted words of every topic in a fitted LDA model.

    Args:
        lda_model: Model exposing ``num_topics`` and
            ``get_topic_terms(topic_id, topn=...)``, which yields
            ``(word_id, weight)`` pairs.
        dictionary: Mapping indexed by word id that returns the word itself.
        topn: Number of words to print per topic. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        None. The topics are written to standard output.
    """
    # Use the model's own topic count so the listing stays correct for models
    # that were not trained with exactly 15 topics.
    for topic_id in range(lda_model.num_topics):
        top_terms = lda_model.get_topic_terms(topic_id, topn=topn)
        print("Topic : " + str(topic_id))
        for word_id, weight in top_terms:
            print("Word: " + str(dictionary[word_id]) + "\t\t Weight: " + str(weight))
        print("\n")

In [89]:
# Fit the topic model on the first period (dft1) and list its topics.
lda_model, corpus, dictionary = lda_modeling_df(dft1)
print_lda_models(lda_model, dictionary)

# log_perplexity returns the per-word likelihood bound. Print it explicitly: a bare
# expression in the middle of a cell is evaluated and then silently discarded.
print("Per-word log-likelihood bound: " + str(lda_model.log_perplexity(corpus)))

# Render the visualization inline. pyLDAvis.show() would instead launch a local web
# server and block the kernel until it is interrupted.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

Topic : 0
Word: hair		 Weight: 0.04600167
Word: product		 Weight: 0.013497666
Word: use		 Weight: 0.011438107
Word: time		 Weight: 0.00917726
Word: like		 Weight: 0.008526855
Word: great		 Weight: 0.007506492
Word: dryer		 Weight: 0.0062313285
Word: good		 Weight: 0.005978945
Word: bought		 Weight: 0.0054621943
Word: little		 Weight: 0.0050263167


Topic : 1
Word: like		 Weight: 0.016351992
Word: product		 Weight: 0.011603008
Word: great		 Weight: 0.00967996
Word: use		 Weight: 0.008997925
Word: love		 Weight: 0.008875364
Word: scent		 Weight: 0.008587302
Word: smell		 Weight: 0.0077008894
Word: hair		 Weight: 0.0070929807
Word: good		 Weight: 0.0062495875
Word: smells		 Weight: 0.0055355644


Topic : 2
Word: use		 Weight: 0.0048875855
Word: product		 Weight: 0.00443248
Word: hair		 Weight: 0.0040734205
Word: skin		 Weight: 0.0037887916
Word: like		 Weight: 0.0036230092
Word: clean		 Weight: 0.002614089
Word: great		 Weight: 0.002436231
Word: razor		 Weight: 0.0023274005
Word: new		 We

In [None]:
# Evaluate and visualize the first-period model (lda_model / corpus / dictionary).
# Print the per-word likelihood bound explicitly: a bare expression that is not the
# last line of a cell is evaluated and then silently discarded.
print("Per-word log-likelihood bound: " + str(lda_model.log_perplexity(corpus)))

# Render inline; pyLDAvis.show() would launch a blocking local web server.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
# Fit the topic model on the second period (dft2) and list its topics.
lda_model2, corpus2, dictionary2 = lda_modeling_df(dft2)
print_lda_models(lda_model2, dictionary2)

# log_perplexity returns the per-word likelihood bound. Print it explicitly: a bare
# expression in the middle of a cell is evaluated and then silently discarded.
print("Per-word log-likelihood bound: " + str(lda_model2.log_perplexity(corpus2)))

# Render the visualization inline. pyLDAvis.show() would instead launch a local web
# server and block the kernel until it is interrupted.
vis2 = pyLDAvis.gensim.prepare(lda_model2, corpus2, dictionary2)
pyLDAvis.display(vis2)

In [None]:
# Fit the topic model on the third period (dft3) and list its topics.
lda_model3, corpus3, dictionary3 = lda_modeling_df(dft3)
print_lda_models(lda_model3, dictionary3)

# log_perplexity returns the per-word likelihood bound. Print it explicitly: a bare
# expression in the middle of a cell is evaluated and then silently discarded.
print("Per-word log-likelihood bound: " + str(lda_model3.log_perplexity(corpus3)))

# Render the visualization inline. pyLDAvis.show() would instead launch a local web
# server and block the kernel until it is interrupted.
vis3 = pyLDAvis.gensim.prepare(lda_model3, corpus3, dictionary3)
pyLDAvis.display(vis3)

In [None]:
# Fit the topic model on the fourth period (dft4) and list its topics.
lda_model4, corpus4, dictionary4 = lda_modeling_df(dft4)
print_lda_models(lda_model4, dictionary4)

# log_perplexity returns the per-word likelihood bound. Print it explicitly: a bare
# expression in the middle of a cell is evaluated and then silently discarded.
print("Per-word log-likelihood bound: " + str(lda_model4.log_perplexity(corpus4)))

# Render the visualization inline. pyLDAvis.show() would instead launch a local web
# server and block the kernel until it is interrupted.
vis4 = pyLDAvis.gensim.prepare(lda_model4, corpus4, dictionary4)
pyLDAvis.display(vis4)