<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>
<br></br>

# Topic Modeling
## *Data Science Unit 4 Sprint 1 Assignment 4*

Analyze a corpus of Amazon reviews from Unit 4 Sprint 1 Module 1's lecture using topic modeling: 

- Fit a Gensim LDA topic model on Amazon Reviews
- Select an appropriate number of topics
- Create some dope visualization of the topics
- Write a few bullets on your findings in markdown at the end
- **Note**: You don't *have* to use generators for this assignment

In [3]:
import pandas as pd

# Load the Amazon consumer-review CSV from the local data/ folder into a dataframe
df_amzn = pd.read_csv(
    filepath_or_buffer="data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv"
)

In [6]:
# df_amzn.head()

In [7]:
# Imports
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danoand/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Gensim and tools imports
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [11]:
# Set up stop words: load NLTK's English stop-word list, which rmv_stopwords
# (defined later in this notebook) consults when filtering review tokens
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [15]:
# Inspect the column names of the raw dataframe
df_amzn.columns

Index(['id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand',
       'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'reviews.date', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username', 'sourceURLs'],
      dtype='object')

In [38]:
# Create a working dataframe: the token column is added to this copy,
# leaving the raw df_amzn dataframe as it was read from disk
df_amzn_wrk = df_amzn.copy()

In [39]:
# Tokenize the review text documents

# Import spacy module
import spacy
# Create a spacy object. This notebook only reads part-of-speech tags and
# lemmas from it (see mke_lemmanade), so the dependency parser and the
# named-entity recognizer are disabled to avoid running pipeline components
# whose output is never used.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

# tknz_strings tokenizes a string passed to the function
def tknz_strings(strng):
    """Tokenize one review text into a list of word tokens.

    Args:
        strng: The raw review text. ``None`` and float NaN (how pandas
            typically represents an empty text cell) are treated as
            "no text".

    Returns:
        list: Tokens produced by ``gensim.utils.simple_preprocess`` with
        ``deacc=True`` (punctuation is removed as part of tokenizing), or
        an empty list when there is no text.
    """
    # A missing review would make simple_preprocess raise, so map it to an
    # empty token list instead. `strng != strng` is only true for NaN.
    if strng is None or (isinstance(strng, float) and strng != strng):
        return []

    # Generate tokens - including removal of punctuation
    return gensim.utils.simple_preprocess(strng, deacc=True)

# rmv_stopwords removes stopwords from a list of tokens
def rmv_stopwords(lst_tkns, stopword_list=None):
    """Return the tokens of ``lst_tkns`` that are not stop words.

    Args:
        lst_tkns: Iterable of string tokens for one document.
        stopword_list: Optional iterable of words to drop. When omitted,
            the notebook-level ``stop_words`` list is used, which keeps
            existing single-argument calls working unchanged.

    Returns:
        list: The surviving tokens, in their original order.
    """
    if stopword_list is None:
        stopword_list = stop_words

    # Set membership is a hash lookup; testing against the raw list would
    # rescan the whole stop-word list for every token.
    blocked = frozenset(stopword_list)
    return [wrd for wrd in lst_tkns if wrd not in blocked]

# mke_lemmanade lemmatizes the passed tokens
def mke_lemmanade(lst_tkns, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize a document's tokens, keeping only selected parts of speech.

    Args:
        lst_tkns: Iterable of string tokens for one document.
        allowed_postags: Collection of part-of-speech labels to keep. The
            default is an immutable tuple so it cannot be altered between
            calls; any container supporting ``in`` may be passed.

    Returns:
        list: The ``lemma_`` of every spacy token whose ``pos_`` is in
        ``allowed_postags``, in document order.
    """
    # spacy analyzes text rather than token lists, so re-join the tokens
    joined_text = " ".join(lst_tkns)

    # No text means no tokens: skip the spacy pipeline entirely
    if not joined_text:
        return []

    # Ingest the text via spacy and keep the lemmas of the desired
    # parts of speech
    tmp_doc = nlp(joined_text)
    return [nlp_tkn.lemma_ for nlp_tkn in tmp_doc if nlp_tkn.pos_ in allowed_postags]

In [40]:
# Build the first-pass token list for every review text
df_amzn_wrk['upd_tokens'] = df_amzn_wrk['reviews.text'].map(tknz_strings)

In [41]:
# Strip stop words out of every document's token list
df_amzn_wrk['upd_tokens'] = df_amzn_wrk['upd_tokens'].map(rmv_stopwords)

In [42]:
# Replace every document's tokens with their lemmas
df_amzn_wrk['upd_tokens'] = df_amzn_wrk['upd_tokens'].map(mke_lemmanade)

In [44]:
# Spot-check ten randomly chosen token lists (no random_state is passed, so
# the sampled rows vary between runs)
df_amzn_wrk['upd_tokens'].sample(10)

8068     [cheap, battery, poor, performance, put, remot...
6732                            [hold, charge, top, brand]
9360                  [great, battery, life, great, price]
13294    [wife, enjoy, much, buy, second, first, reader...
10190                               [good, battery, price]
25884    [detail, item, little, sparse, thought, would,...
16740    [pick, couple, hour, ago, screen, good, tablet...
2377                                        [great, price]
21014              [good, tablet, regulation, little, low]
25052    [really, like, idea, tablet, affordable, niece...
Name: upd_tokens, dtype: object

In [45]:
# Build the LDA dictionary: a mapping from each normalized word (token) to an integer id

# Collect every review's token list from the working dataframe into a plain list
upd_tokens_list = list(df_amzn_wrk['upd_tokens'])

# Have gensim build the dictionary from those token lists
id2word = corpora.Dictionary(documents=upd_tokens_list)

In [52]:
# Build the corpus: convert each review's token list to its bag-of-words form
corpus = list(map(id2word.doc2bow, upd_tokens_list))

In [55]:
# Fit a 5-topic LDA model on the bag-of-words corpus
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=42,
    update_every=1,
    chunksize=50,
    passes=8,
    alpha='auto',
    per_word_topics=True,
)

In [56]:
# Visualize topics
# Enable pyLDAvis notebook mode before building the visualization
pyLDAvis.enable_notebook()
# Prepare the visualization from the fitted model, the corpus and the dictionary
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Leaving `vis` as the cell's last expression displays it as the cell output
vis

## Notes

* Choosing 5 topics looks like a fairly reasonable fit for this corpus
* With one or two exceptions, the top 30 terms in each topic are almost exclusive to that topic

## Stretch Goals

* Incorporate Named Entity Recognition in your analysis
* Incorporate some custom pre-processing from our previous lessons (like spacy lemmatization)
* Analyze a dataset of interest to you with topic modeling