In [1]:
import gensim
from gensim import corpora,models
from gensim.models import LdaModel
from gensim.parsing.preprocessing import remove_stopwords,strip_punctuation, strip_numeric,strip_short
import pandas as pd
import unidecode
import csv
import datetime as dt

import pyLDAvis
import pyLDAvis.gensim  # don't skip this

import matplotlib.pyplot as plt

In [2]:
filename = "amazon_reviews_us_Beauty_v1_00.tsv"
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz'

# Read only the first line of the TSV (the header row) so that its fields can
# be reused as column names; `a` stays empty when the file has no rows.
a = []
with open(filename) as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    row = next(reader, None)
    if row is not None:
        a.append(row)

In [None]:
# Load the whole TSV, skipping its first line and naming the columns from the
# header row captured in `a[0]`.
fdf = pd.read_csv(filename, delimiter='\t', names=a[0], skiprows=[0])

# Rows with exactly 7 missing fields are discarded
# (presumably lines that were mis-parsed - TODO confirm).
mask = fdf.isnull().sum(axis=1) != 7

# Build the cleaned frame as one chain of new objects instead of assigning
# into / mutating a filtered slice, which avoids SettingWithCopyWarning:
#   1. keep the rows selected by `mask`,
#   2. parse review_date (unparseable dates become NaT),
#   3. drop every row that still has a missing value in any column,
#   4. add the calendar year of the review as an integer column.
fdf = (
    fdf[mask]
    .assign(
        review_date=lambda d: pd.to_datetime(
            d["review_date"], format="%Y-%m-%d", errors="coerce"
        )
    )
    .dropna()
    .assign(year=lambda d: d["review_date"].dt.year)
)

In [None]:
# Number of distinct review ids per review year (Series indexed by year).
year_count = fdf["review_id"].groupby(fdf["year"]).nunique()

In [None]:
# Turn the per-year counts into a plain {year: count} dict.
# The guard keeps this cell idempotent: once year_count is already a dict,
# re-running the cell must not call .to_dict() on it again.
if not isinstance(year_count, dict):
    year_count = year_count.to_dict()
year_count

In [None]:
# Partition logic
#   1. keep at most `thresh` randomly sampled reviews per year,
#   2. bundle consecutive years into `n_groups` frames of `years_per_group`
#      years each (concat_dfs[0] .. concat_dfs[n_groups - 1]).

thresh = 30000        # maximum number of reviews kept for a single year
sample_seed = 100     # fixed seed so the per-year samples are reproducible
years_per_group = 3   # consecutive years merged into one frame
n_groups = 5          # number of merged frames built below

# Year 2000 is left out of the analysis. Filtering (instead of list.remove)
# does not raise when that year is absent from the data.
years = [year for year in year_count if year != 2000]

df_list = []
for year in years:
    temp_df = fdf[fdf["year"] == year]
    # Only down-sample years that have more than `thresh` distinct reviews.
    if year_count[year] > thresh:
        temp_df = temp_df.sample(thresh, random_state=sample_seed)
    df_list.append(temp_df)

# NOTE: pd.concat raises ValueError for an empty slice, i.e. when fewer than
# (n_groups - 1) * years_per_group + 1 years are available.
concat_dfs = []
for i in range(n_groups):
    group = df_list[years_per_group * i: years_per_group * (i + 1)]
    concat_dfs.append(pd.concat(group))

In [None]:
def preprocess(doc):
    """Normalise one review string and return its list of tokens.

    The text is lower-cased, then passed in order through the gensim
    preprocessing helpers strip_punctuation, strip_numeric, remove_stopwords
    and strip_short (minimum token length 3), and finally split on whitespace.
    """
    text = doc.lower()
    text = strip_punctuation(text)
    text = strip_numeric(text)
    text = remove_stopwords(text)
    text = strip_short(text, 3)
    return text.split()

def lda_modeling_df(df, num_topics=15):
    """Train an LDA topic model on the ``review_body`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review_body`` column of review strings.
    num_topics : int, optional
        Number of topics to fit (default 15).

    Returns
    -------
    tuple
        ``(lda_model, corpus, dictionary)``: the trained ``LdaModel``, the
        bag-of-words corpus it was trained on, and the gensim ``Dictionary``
        mapping token ids to words.
    """
    # Tokenise every review exactly once; the tokens are used both to build
    # the vocabulary and to create the bag-of-words vectors.
    docs = [preprocess(text) for text in df["review_body"]]

    dictionary = corpora.Dictionary(docs)
    # Prune the vocabulary before vectorising: drop tokens found in fewer
    # than 2 documents or in more than 80% of the documents.
    dictionary.filter_extremes(no_below=2, no_above=0.8)

    # doc2bow only looks tokens up in the (already filtered) dictionary, so a
    # single pass after filtering is all that is needed.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         random_state=100,  # fixed seed for reproducible topics
                         num_topics=num_topics,
                         passes=5,
                         chunksize=10000,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)
    return lda_model, corpus, dictionary

def print_lda_models(lda_model, dictionary):
    """Print the ten highest-weighted words of every topic of ``lda_model``.

    Parameters
    ----------
    lda_model
        Trained model exposing ``num_topics`` and
        ``get_topic_terms(topic_id, topn)``, which yields
        ``(word_id, weight)`` pairs.
    dictionary
        Mapping from word id to the word itself.
    """
    # Iterate over the topics the model actually has rather than assuming a
    # fixed count.
    for topic_id in range(lda_model.num_topics):
        words = lda_model.get_topic_terms(topic_id, topn=10)
        print("Topic : " + str(topic_id))
        for word_id, weight in words:
            print("Word: " + str(dictionary[word_id]) + "\t\t Weight: " + str(weight))
        print("\n")

In [None]:
# Topic model for the first group of years (concat_dfs[0]).
lda_model, corpus, dictionary = lda_modeling_df(concat_dfs[0])
print_lda_models(lda_model, dictionary)

# A notebook cell only displays its last expression, so the per-word
# likelihood bound returned by log_perplexity is printed explicitly.
print("Per-word likelihood bound: " + str(lda_model.log_perplexity(corpus)))
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
# Topic model for the second group of years (concat_dfs[1]).
lda_model2, corpus2, dictionary2 = lda_modeling_df(concat_dfs[1])
print_lda_models(lda_model2, dictionary2)

# A notebook cell only displays its last expression, so the per-word
# likelihood bound returned by log_perplexity is printed explicitly.
print("Per-word likelihood bound: " + str(lda_model2.log_perplexity(corpus2)))
vis2 = pyLDAvis.gensim.prepare(lda_model2, corpus2, dictionary2)
pyLDAvis.display(vis2)

In [None]:
# Topic model for the third group of years (concat_dfs[2]).
lda_model3, corpus3, dictionary3 = lda_modeling_df(concat_dfs[2])
print_lda_models(lda_model3, dictionary3)

# A notebook cell only displays its last expression, so the per-word
# likelihood bound returned by log_perplexity is printed explicitly.
print("Per-word likelihood bound: " + str(lda_model3.log_perplexity(corpus3)))
vis3 = pyLDAvis.gensim.prepare(lda_model3, corpus3, dictionary3)
pyLDAvis.display(vis3)

In [None]:
# Topic model for the fourth group of years (concat_dfs[3]).
lda_model4, corpus4, dictionary4 = lda_modeling_df(concat_dfs[3])
print_lda_models(lda_model4, dictionary4)

# A notebook cell only displays its last expression, so the per-word
# likelihood bound returned by log_perplexity is printed explicitly.
print("Per-word likelihood bound: " + str(lda_model4.log_perplexity(corpus4)))
vis4 = pyLDAvis.gensim.prepare(lda_model4, corpus4, dictionary4)
pyLDAvis.display(vis4)

In [None]:
# Topic model for the fifth group of years (concat_dfs[4]).
lda_model5, corpus5, dictionary5 = lda_modeling_df(concat_dfs[4])
print_lda_models(lda_model5, dictionary5)

# A notebook cell only displays its last expression, so the per-word
# likelihood bound returned by log_perplexity is printed explicitly.
print("Per-word likelihood bound: " + str(lda_model5.log_perplexity(corpus5)))
vis5 = pyLDAvis.gensim.prepare(lda_model5, corpus5, dictionary5)
pyLDAvis.display(vis5)

In [None]:
# Topic model over all year groups combined.
flda_model, fcorpus, fdictionary = lda_modeling_df(pd.concat(concat_dfs))
print_lda_models(flda_model, fdictionary)

# A notebook cell only displays its last expression, so the per-word
# likelihood bound returned by log_perplexity is printed explicitly.
print("Per-word likelihood bound: " + str(flda_model.log_perplexity(fcorpus)))
fvis = pyLDAvis.gensim.prepare(flda_model, fcorpus, fdictionary)
pyLDAvis.display(fvis)