In [None]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim
import os, re, operator, warnings
warnings.filterwarnings('ignore') # Let's not pay heed to them right now
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the review CSV, given relative to the working directory;
# it is read by pd.read_csv in the next cell.
path = 'yelp_data/health_text_sentiment_full.csv'

In [None]:
# Load only the two columns the analysis uses. Selecting them at read time
# means the stray 'Unnamed: 0' index column never has to be dropped (the old
# in-place drop raised KeyError for a CSV written without that column), and
# the trailing selection pins the column order regardless of the file's order.
df = pd.read_csv(path, usecols=['stars', 'clean_text'])[['stars', 'clean_text']]
#df = df[['stars','text','clean_text']]

In [None]:
df.head()

## One-star and five-star reviews

In [None]:
# Split the reviews by rating with boolean masks. `DataFrame.ix` was removed
# in pandas 1.0, and indexing it with the tuple returned by np.where only
# worked while the index was the default 0..n-1 range.
# reset_index(drop=True) returns a new frame with a fresh 0..n-1 index, so
# later cells can add columns to these subsets without touching `df`.
one_star = df.loc[df.stars == 1].reset_index(drop=True)
five_star = df.loc[df.stars == 5].reset_index(drop=True)

In [None]:
five_star.head()

## NLP

In [None]:
# Load the spaCy 'en' model with its default pipeline.
# NOTE(review): the lemmatisation loops in later cells keep a doc only when
# `doc.is_parsed` is true, so the lighter load on the next line (parser/NER
# switched off) would presumably turn every row into None — confirm before
# switching to it.
#nlp = spacy.load('en', parser=False, entity=False)
nlp = spacy.load('en')

In [None]:
## test
# Small sample for a dry run of the parsing loop. `.loc[:100]` is label-based
# and inclusive, so this keeps the rows labelled 0 through 100 (up to 101
# reviews).
# .copy() gives `test` its own data: the next cell adds columns to it, and
# doing that on a bare slice of `one_star` is chained assignment
# (SettingWithCopyWarning, which the global warnings filter would hide).
test = one_star.loc[:100].copy()
test.head()

In [None]:
%%time
# Run the sample reviews through spaCy and collect, per review, the token
# texts, the lemmas and the POS tags.
tokens, lemma, pos = [], [], []
for doc in nlp.pipe(test['clean_text'].astype('unicode').values, batch_size=50, n_threads=3):
    # A doc that was not parsed contributes None to all three lists, so each
    # list keeps exactly one entry per row of `test`.
    tokens.append([tok.text for tok in doc] if doc.is_parsed else None)
    lemma.append([tok.lemma_ for tok in doc] if doc.is_parsed else None)
    pos.append([tok.pos_ for tok in doc] if doc.is_parsed else None)

# Attach the three parallel lists as new columns of the sample frame.
for column, values in zip(('tokens', 'lemma', 'POS'), (tokens, lemma, pos)):
    test[column] = values

In [None]:
test.head()

## Five Star

In [None]:
five_star.info()

In [None]:
five_star.head()

## Timing (from an earlier run)

```
CPU times: user 59min 24s, sys: 6min 32s, total: 1h 5min 56s
Wall time: 8min 42s
```

In [None]:
%%time
# Lemmatise every five-star review. A doc that spaCy did not parse yields
# None, which keeps the list aligned row-for-row with `five_star`.
lemma = [
    [tok.lemma_ for tok in doc] if doc.is_parsed else None
    for doc in nlp.pipe(five_star['clean_text'].astype('unicode').values, batch_size=50, n_threads=3)
]

five_star['lemma'] = lemma

In [None]:
# Plain Python list of the per-review lemma lists, in row order.
fivestar_lemma = five_star['lemma'].tolist()

## Bigram

In [None]:
%%time
# Fit a gensim Phrases model on the five-star lemma lists (one entry per review).
bigram = gensim.models.Phrases(fivestar_lemma)

In [None]:
%%time
# Pass each review's lemma list through the fitted Phrases model; `texts` is
# what the dictionary, corpus and coherence cells below consume.
texts = [bigram[line] for line in fivestar_lemma]

In [None]:
texts[10]

In [None]:
# Build the vocabulary from `texts`, then encode every document as a
# bag-of-words vector against that vocabulary.
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
len(corpus)

In [None]:
corpus[10]

## LSI

In [None]:
# Fit an LSI model with 10 topics on the bag-of-words corpus, using
# `dictionary` to map term ids back to words.
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [None]:
lsimodel.show_topics(num_topics=5)

## HDP

In [None]:
%%time
# Fit an HDP topic model on the bag-of-words corpus.
# random_state fixes the model's random initialisation so the topics, and the
# coherence score computed from them later, are the same on every
# Restart & Run All instead of changing from run to run.
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, random_state=42)

In [None]:
 hdpmodel.show_topics()

## LDA

In [None]:
%%time
# Fit a 10-topic LDA model on the bag-of-words corpus.
# LDA training starts from a random initialisation; fixing random_state makes
# the topics, the model file saved below and the coherence comparison
# reproducible across kernel restarts.
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)

In [None]:
lda_topics = ldamodel.show_topics(num_topics=5)

In [None]:
# Print every (topic id, topic description) pair held in lda_topics.
for topic_id, topic_terms in lda_topics:
    print('TOPIC:{}\n\nresult:{}\n'.format(topic_id, topic_terms))


In [None]:
# NOTE(review): `datapath` is not used by any visible cell — presumably a
# leftover; confirm before removing.
from gensim.test.utils import datapath
# Persist the trained LDA model as 'fivestar_lda.model' (relative path, so it
# lands in the current working directory).
ldamodel.save('fivestar_lda.model')

In [None]:
# Switch pyLDAvis to notebook output, then prepare the visualisation data for
# the LDA model; the prepared object is the cell's last expression, so the
# notebook displays it.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

## Topic coherence

In [None]:
%%time
# For each fitted model, reduce every topic returned by show_topics to its
# bare list of words (the per-word weights are discarded).
lsitopics, hdptopics, ldatopics = (
    [[word for word, _weight in terms] for _topic_id, terms in model.show_topics(formatted=False)]
    for model in (lsimodel, hdpmodel, ldamodel)
)

In [None]:
%%time
# Score each model's topic word lists with CoherenceModel, using the same
# texts, dictionary and window size for all three. The LSI and HDP lists are
# capped at their first 10 topics; the LDA list is passed through whole.
lsi_coherence, hdp_coherence, lda_coherence = (
    CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, window_size=10).get_coherence()
    for topic_words in (lsitopics[:10], hdptopics[:10], ldatopics)
)

In [None]:
def evaluate_bar_graph(coherences, indices):
    """
    Draw one bar per model showing its coherence value.

    coherences: sequence of coherence values, one per bar
    indices: tick labels for the bars; must have the same length as coherences
    """
    assert len(indices) == len(coherences)
    positions = np.arange(len(coherences))
    plt.bar(positions, coherences, width=0.2, tick_label=indices, align='center')
    plt.ylabel('Coherence Value')
    plt.xlabel('Models')

In [None]:
evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence],
['LSI', 'HDP', 'LDA'])

In [None]:
def evaluate_graph(dictionary, corpus, texts, limit, random_state=None):
    """
    Train one LDA model per topic count in range(1, limit) and plot the c_v
    coherence of each against its number of topics.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : tokenised documents, handed to CoherenceModel for the c_v score
    limit : topic limit (exclusive) - models are trained for 1 .. limit-1 topics
    random_state : optional seed forwarded to every LdaModel; the default
        (None) keeps the previous unseeded behaviour

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_counts = range(1, limit)
    c_v = []
    lm_list = []
    for num_topics in topic_counts:
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                      random_state=random_state)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())
    # Show graph
    plt.plot(topic_counts, c_v, label="c_v")
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # The label is attached to the line itself. The previous call,
    # plt.legend(("c_v"), ...), passed a bare string (the parentheses do not
    # make a tuple); matplotlib iterated it character by character and the
    # legend entry read "c" instead of "c_v".
    plt.legend(loc='best')
    plt.show()

    return lm_list, c_v

In [None]:
%%time
lmlist, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=texts, limit=10)