# Special Topics: Natural Language Processing
## Assignment #1: Latent Dirichlet Allocation Implementation

### Imports

In [2]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import gensim.corpora as corpora
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess, tokenize
from gensim.models import ldamodel, CoherenceModel
from gensim.matutils import kullback_leibler
from statistics import mean

import spacy
import re

import pyLDAvis
import pyLDAvis.gensim_models as gensim_vis
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

### Load the data, then create the corpus

In [3]:
reviews_df = pd.DataFrame(pd.read_csv("Review.csv"))
reviews_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Num_words_text
0,226241,B002T0NWKE,A1G25CG7UWQ3XO,&#10003;Onoñda&#660;gegá&#660;,1,5,1,1309305600,Misleading discription-bad taste,they advertise that they contain aspartame die...,43
1,194272,B0009Y8AGI,A34Y9SS1F0H485,Bluemoon,0,6,1,1263600000,Yuck!,taste disgusting having struggle just keep dow...,27
2,359016,B003CIBPN8,ANDTRFIVWED40,R. Hauff,6,8,1,1275782400,"If you liked it in tetrapak box, don't buy now",bought this zico water hoping have good taste ...,53
3,402750,B000EQX6HO,A2W95LYEAIRDAC,"I. B. Lehman ""IBL""",1,1,1,1316044800,Great product...just not in small packages!,love terra chipswhen they bags from storei ord...,49
4,429211,B0006VM0LU,A32T7H92YWC0A2,Molly F,0,0,1,1317340800,Unfortunately unsalvageable,wanted desperately like this sauce smells tast...,35


In [4]:
reviews_text = list(reviews_df.Text)
reviews_text[3]

'love terra chipswhen they bags from storei ordered these expecting same quality wasnt chips these small bags which were perfect lunch were stale tasteless some chips were stale comfortably chew dont know they sitting around long warehouse somewhere they dont well small packages dont order these chips like this'

### 1. Implementation of LDA:

Class LDA:

- **__init___**
    - self.corpus: list of documents
    - self.total_words: amount of words in corpus
    - self.alphas: list of possible alpha values
    - self.etas: list of possible eta values
    - self.number_of_topics: list of possible number of topics

- **remove_stop**
    - removes stop words from docs
    - return: docs

- **preprocess**
    - Removes stop words, Lemmatizes, and creates Document Term Matrix
    - return: doc_term_matrix, id2word, no_stops

- **run_model**
    - corpus: list of documents
    - id2word: mapping between words and their ids
    - base_model: bool 
    - optimize_topics: bool
    - optimize_etas_alphas: boll
    - alpha: chosen alpha
    - eta: chosen eta
    - topics: None
    - return: lda model, corpus, coherence_scores_topics, coherence_scores_alphas_etas
- **visualize**
    - model: lda model
    - corpus: list of documents
    - id2word: mapping between words and their ids
    - return: charts
- **kl_divergence**
    - model: lda model
    - topic_number: number of topics
    - return: kl_dict

In [5]:
class LDA:
    
    def __init__(self, corpus):
        self.corpus = corpus 
        self.total_words = sum([len(size) for size in self.corpus])
        self.alphas = [0.05, 0.10, 0.15, 0.20, 0.25]
        self.etas = [0.05, 0.10, 0.15, 0.20, 0.25]
        self.number_of_topics = [5, 10, 15, 25, 50]

    def remove_stop(self):
        docs = list()
        for doc in self.corpus:
            docs.append(remove_stopwords(doc))

        return docs

    def preprocess(self):
        # Also preprocess in addition to lemmatize    
        # Remove stop words
        no_stops = self.remove_stop()

        changed = 0 
        same = 0

        # Lemmatize
        for word, doc in enumerate(no_stops):
            no_stops[word] = doc.split()

            doc = ' '.join(no_stops[word])
            
            tokenized = simple_preprocess(doc)

            lemmatizer = WordNetLemmatizer()

            for token, sentence in enumerate(tokenized):
                word = lemmatizer.lemmatize(tokenized[token])

                if tokenized[token] != word:
                    changed+=1
                else:
                    same+=1

        removed_words = sum([len(size) for size in no_stops])

        print(f'Percentage of words changed and removed: ' + '{:.2%}'.format(removed_words/self.total_words))
        print(f'Percentage of words that did not change: ' + '{:.2%}'.format((self.total_words - removed_words) / self.total_words) + '\n')
        print('Creating document term matrix...' + '\n')

        # Create document term matrix
        id2word = corpora.Dictionary(no_stops)
        doc_term_matrix = [id2word.doc2bow(doc) for doc in no_stops]
        print(f'The document term matrix looks like: ' + '\n', doc_term_matrix[0:3], '\n')

        return doc_term_matrix, id2word, no_stops
        
    def run_model(self, corpus, id2word, base_model, optimize_topics, 
                            optimize_etas_alphas, alpha, eta, topics = None, texts = None):
        
        coherence_scores_topics = {}
        coherence_scores_alphas_etas = {}
        coherence_scores_alphas_etas['alpha'] = []
        coherence_scores_alphas_etas['eta'] = []
        coherence_scores_alphas_etas['coherence_score'] = []
        
        if base_model:
            topics = topics
            print(f'Running LDA model with alpha = {alpha}, eta = {eta}, n_topics = {topics}...' + '\n')
            model = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=topics, random_state=100,
                        update_every=1, chunksize=100, passes=10, alpha=alpha, eta=eta, per_word_topics=True)
            for index, topic in model.show_topics(num_topics=3, num_words=30, formatted=False):
                print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))
        print('\n')

        if optimize_topics:
            models = []
            corpuses = []
            for topic_number in self.number_of_topics:
                print(f'Running LDA model with alpha = {alpha}, eta = {eta}, n_topics = {topic_number}...' + '\n')
                model = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=topic_number, random_state=100,
                        update_every=1, chunksize=100, passes=10, alpha=alpha, eta=eta, per_word_topics=True)
                models.append(model)
                corpuses.append(corpus)
                
                coherence_score = CoherenceModel(model=model, texts=texts, corpus=corpus, 
                                                 dictionary=id2word, coherence='c_v').get_coherence()
                coherence_scores_topics[f'{topic_number}'] = coherence_score
                max_topic = max(coherence_scores_topics, key=coherence_scores_topics.get)
                
            model = models
            corpus = corpuses

        if optimize_etas_alphas:
            models = []
            corpuses = []
            for a in self.alphas:
                for e in self.etas:
                    print(f'Running LDA model with alpha = {a}, eta = {e}, n_topics = {topics}...' + '\n')
                    model = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=topics, random_state=100,
                        update_every=1, chunksize=100, passes=10, alpha=a, eta=e, per_word_topics=True)
                    models.append(model)
                    corpuses.append(corpus)

                    coherence_score = CoherenceModel(model=model, texts=texts, corpus=corpus, 
                                                     dictionary=id2word, coherence='c_v').get_coherence()
            
                        
                    coherence_scores_alphas_etas['alpha'].append(a)
                    coherence_scores_alphas_etas['eta'].append(e)
                    coherence_scores_alphas_etas['coherence_score'].append(coherence_score)
                    
            model = models
            corpus = corpuses    

        print('Finished!')  
        return model, corpus, coherence_scores_topics, coherence_scores_alphas_etas

    def visualize(self, model, corpus, id2word):
        pyLDAvis.enable_notebook()
        charts = gensim_vis.prepare(model, corpus, id2word)
        return charts
    

    # Topic number is on a scale of 0-4
    # Model is on a scale of 0-24
    def kl_divergence(self, model, topic_number):
        topics = [dist for dist in model.get_topics()]
        kl_dict = {}

        for number in range(topic_number):
            for number_2 in range(topic_number):
                # Take the mean of both KL-Divergence calculations due to asymmetry
                if number == number_2:
                    continue

                topics[number] = list(topics[number])
                topics[number_2] = list(topics[number_2])
                kl_value = (float(kullback_leibler(topics[number], topics[number_2])) 
                                + float(kullback_leibler(topics[number_2], topics[number]))) / 2
                
                if f'Topic {number_2}, Topic {number}' in kl_dict.keys():
                    continue
                
                kl_dict[f'Topic {number}, Topic {number_2}'] = kl_value
                print(f'KL-Divergence for Topic {number+1}, Topic {number_2+1}: ', kl_value)
        
        return kl_dict

        

### Implement and Visualize LDA
#### Step 1. Run LDA with ***apha*** = 0.05, ***eta*** = 0.05, ***no_topics*** = 10
#### Step 2. Print top 30 high-probability words for each topic
#### Step 3. Visualize Results

In [6]:
lda = LDA(reviews_text)

In [7]:
term_matrix, id2word, texts = lda.preprocess()

Percentage of words changed and removed: 10.30%
Percentage of words that did not change: 89.70%

Creating document term matrix...

The document term matrix looks like: 
 [[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 3), (9, 1), (10, 1), (11, 1), (12, 3), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 2), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1)], [(23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)], [(5, 1), (16, 1), (23, 3), (32, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 3), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 3), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1)]] 



In [52]:
model, corpus, coherence_scores_topics, \
        coherence_scores_alphas_etas = lda.run_model(term_matrix, id2word, 
                                base_model = True, optimize_topics = False, 
                                optimize_etas_alphas = False, alpha = 0.05, eta = 0.05, topics = 10)
lda.visualize(model, corpus, id2word)

Running LDA model with alpha = 0.05, eta = 0.05, n_topics = 10...

Topic: 0 
Words: ['cookies', 'delicious', 'cereal', 'health', 'large', 'boxes', 'fast', 'value', 'alternative', 'compared', 'reason', 'brewed', 'pure', 'pods', 'opinion', 'types', 'ground', 'important', 'ounces', 'scent', 'following', 'amounts', 'email', 'consistently', 'earth', 'business', 'trust', 'method', 'crap', 'suggest']
Topic: 6 
Words: ['bags', 'makes', 'months', 'home', 'especially', 'family', 'daughter', 'difference', 'soup', 'weeks', 'fresh', 'market', 'popcorn', 'weve', 'problems', 'cocoa', 'supermarket', 'exactly', 'world', 'sale', 'varieties', 'best', 'ginger', 'near', 'seller', 'crisp', 'sell', 'couple', 'ship', 'excited']
Topic: 7 
Words: ['chips', 'fresh', 'tasty', 'vanilla', 'kids', 'cream', 'youre', 'potato', 'plain', 'chip', 'finally', 'taking', 'baked', 'ones', 'leaves', 'mango', 'homemade', 'sour', 'wants', 'pouches', 'acid', 'bunch', 'lays', 'eaten', 'wasnt', 'curious', 'dirt', 'stale', 'theyre',

### C. Vary the number of topics [5, 10, 15, 25, 50]

In [None]:
model, corpus, coherence_scores_topics, \
        coherence_scores_alphas_etas = lda.run_model(term_matrix, id2word, 
                                base_model = False, optimize_topics = True, 
                                optimize_etas_alphas = False, alpha = 0.05, eta = 0.05, topics = None, texts=texts)



Running LDA model with alpha = 0.05, eta = 0.05, n_topics = 5...



In [None]:
plt.figure(figsize=(20,10))
plt.plot(list(coherence_scores_topics.keys()), list(coherence_scores_topics.values()))
plt.suptitle('Coherence Score vs Number of Topics', fontweight='bold')
plt.ylabel('Coherence (c_v)')
plt.xlabel('Number of Topics')

### Visualize 5 topics

In [None]:
lda.visualize(model[0], corpus[0], id2word)

### Visualize 25 topics

In [None]:
lda.visualize(model[3], corpus[3], id2word)

#### Report the optimum number of topics

The graphs above show that as the number of topics decrease, there tends to be more overlap between the topics. Therefore, we can chose 5 topics to optimize the model. The topics are interpretable and have unique terms. Each topic is in some way related to the other topics, which can give us a sense that the topics are exhaustive across the documents.

### C. Vary the number of ***alpha*** and ***eta*** [0.05, 0.10, 0.15, 0.20, 0.25]
### Run LDA with ***n_topics*** = 5 (optimum number of topics)

### Visualize ***alpha*** = 0.05 and ***eta*** = 0.05

In [8]:
model, corpus, coherence_scores_topics, \
        coherence_scores_alphas_etas = lda.run_model(term_matrix, id2word, 
                                base_model = False, optimize_topics = False, 
                                optimize_etas_alphas = True, alpha = None, eta = None, topics = 5, texts=texts)



Running LDA model with alpha = 0.05, eta = 0.05, n_topics = 5...

Running LDA model with alpha = 0.05, eta = 0.1, n_topics = 5...

Running LDA model with alpha = 0.05, eta = 0.15, n_topics = 5...

Running LDA model with alpha = 0.05, eta = 0.2, n_topics = 5...

Running LDA model with alpha = 0.05, eta = 0.25, n_topics = 5...

Running LDA model with alpha = 0.1, eta = 0.05, n_topics = 5...

Running LDA model with alpha = 0.1, eta = 0.1, n_topics = 5...

Running LDA model with alpha = 0.1, eta = 0.15, n_topics = 5...

Running LDA model with alpha = 0.1, eta = 0.2, n_topics = 5...

Running LDA model with alpha = 0.1, eta = 0.25, n_topics = 5...

Running LDA model with alpha = 0.15, eta = 0.05, n_topics = 5...

Running LDA model with alpha = 0.15, eta = 0.1, n_topics = 5...

Running LDA model with alpha = 0.15, eta = 0.15, n_topics = 5...

Running LDA model with alpha = 0.15, eta = 0.2, n_topics = 5...

Running LDA model with alpha = 0.15, eta = 0.25, n_topics = 5...

Running LDA model w

In [41]:
scores_df = pd.DataFrame.from_dict(coherence_scores_alphas_etas)
scores_df.head()

Unnamed: 0,alpha,eta,coherence_score
0,0.05,0.05,0.306098
1,0.05,0.1,0.340261
2,0.05,0.15,0.344007
3,0.05,0.2,0.337935
4,0.05,0.25,0.332193


In [42]:
print('Best Coherence Score')
print(scores_df.max(), '\n')

print('Worst Coherence Score')
print(scores_df.min())

Best Coherence Score
alpha              0.250000
eta                0.250000
coherence_score    0.384109
dtype: float64 

Worst Coherence Score
alpha              0.050000
eta                0.050000
coherence_score    0.306098
dtype: float64


### Run the new model for optimal parameters
### **Implement KL-Divergence**

In [45]:
kl_div = lda.kl_divergence(model[24], 5)

KL-Divergence for Topic 1, Topic 2:  4.329992294311523
KL-Divergence for Topic 1, Topic 3:  3.9204466342926025
KL-Divergence for Topic 1, Topic 4:  3.7027523517608643
KL-Divergence for Topic 1, Topic 5:  4.0258400440216064
KL-Divergence for Topic 2, Topic 3:  3.848941445350647
KL-Divergence for Topic 2, Topic 4:  3.7084317207336426
KL-Divergence for Topic 2, Topic 5:  3.48709774017334
KL-Divergence for Topic 3, Topic 4:  4.168710350990295
KL-Divergence for Topic 3, Topic 5:  4.089065790176392
KL-Divergence for Topic 4, Topic 5:  3.624272108078003


### Visualize ***alpha*** = 0.25 and ***eta*** = 0.25

In [46]:
lda.visualize(model[24], corpus[24], id2word)

### Visualize ***alpha*** = 0.05 and ***eta*** = 0.05

In [47]:
kl_div = lda.kl_divergence(model[0], 5)

KL-Divergence for Topic 1, Topic 2:  7.795775413513184
KL-Divergence for Topic 1, Topic 3:  7.092055082321167
KL-Divergence for Topic 1, Topic 4:  6.449020862579346
KL-Divergence for Topic 1, Topic 5:  6.929001808166504
KL-Divergence for Topic 2, Topic 3:  5.81796407699585
KL-Divergence for Topic 2, Topic 4:  7.610763311386108
KL-Divergence for Topic 2, Topic 5:  6.8870909214019775
KL-Divergence for Topic 3, Topic 4:  7.038165330886841
KL-Divergence for Topic 3, Topic 5:  7.401114463806152
KL-Divergence for Topic 4, Topic 5:  7.053380727767944


In [48]:
lda.visualize(model[0], corpus[0], id2word)