In [4]:
import pandas as pd
import pickle
import numpy as np

# Load the bar review dataset 
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized.pickle')


In [5]:
from itertools import chain

docs = [" ".join(list(chain.from_iterable(l))) for l in review.cleaned_tokenized.iloc[:]]

print(docs[0])
print(docs[1])


food great best thing wing wing simply fantastic wet cajun best most popular also like seasoned salt wing wing-night monday wednesday night 075 whole wing dining area nice very family friendly bar very nice well place truly yinzers dream pittsburgh dad would love place nat
checked place past monday wing-night heard wing great decided finally time check wing whole wing crispy nice change pace got wet cajun sauce garlic butter wing cajun not bold enough flavor sauce thin sauce also thin garlic butter expected better average dont like seeing sauce resting bottom boat would definitely come try place sample other item menu probably not become regular stop wing anytime-soon


In [6]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()




# LDA

In [7]:

n_samples = 1000
n_features = 5000
n_topics = 20
n_top_words = 10

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.75, min_df=2, max_features=n_features)
t0 = time()
tf = tf_vectorizer.fit_transform(docs[:n_samples])
print("done in %0.3fs." % (time() - t0))


print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=10.,
                                random_state=0, n_jobs=6)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Extracting tf features for LDA...
done in 0.077s.
Fitting LDA models with tf features, n_samples=1000 and n_features=5000...
done in 2.394s.


In [8]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 20)

#lda.



Topics in LDA model:
Topic #0:
sing factory alexions request musical lyric 19 send peaceful laugh please wowed primantis oldest once closer union talent pleased recall
Topic #1:
aint humor rankin rick hosted requested worse search emils beer sing great good place awesome shot play something know buck
Topic #2:
daughter tea ice mango slider cool father strip mashed lobster kitchen dude enchilada episode kobe repeatedly bash semi piano player
Topic #3:
repeating quote driver bachelorette ashley performer mirror theme joke blast butt bitch pianist funny age wrote absolute acoustic talented ran
Topic #4:
sing song piano rock brewery bottom cab park dueling quarter band north he drum hit homestead shout arcade drive largest
Topic #5:
place drink food time to great back not get of night one go no good bar went like service little
Topic #6:
trip salad meat bar reuben kind of one dinner chicken basic goodness not first back run piece sitting literally in
Topic #7:
horrible fish another da goo

In [10]:
import pyLDAvis
import pandas as pd
import funcy as fp
from pyLDAvis import prepare as vis_prepare

def _extract_data(docs, vect, lda):
    #LDA scikit-learn implementation seems to have buggy code.
    #Topic_term_dists and doc_topic_dists isn't accummulated to 1.
    #Hence norm function implemented to normalize the distributions.
    norm = lambda data: pd.DataFrame(data).div(data.sum(1),axis=0).values
    vected = vect.fit_transform(docs)
    doc_topic_dists = norm(lda.fit_transform(vected))
    
    return lda,vect, dict(
                      doc_lengths = docs.str.len(),
                      vocab = vect.get_feature_names(),
                      term_frequency = vected.sum(axis=0).tolist()[0],
                      topic_term_dists = norm(lda.components_),
                      doc_topic_dists = doc_topic_dists)

def prepare(docs, vect, lda, **kwargs):
    """Create Prepared Data from sklearn's vectorizer and Latent Dirichlet
    Application.

    Parameters
    ----------
    docs : Pandas Series.
        Documents to be passed as an input.
    vect : Scikit-Learn Vectorizer (CountVectorizer,TfIdfVectorizer).
        vectorizer to convert documents into matrix sparser
    lda  : sklearn.decomposition.LatentDirichletAllocation.
        Latent Dirichlet Allocation

    **kwargs: Keyword argument to be passed to pyLDAvis.prepare()


    Returns
    -------
    prepared_data : PreparedData
          the data structures used in the visualization


    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

    See
    ------
    See `pyLDAvis.prepare` for **kwargs.
    """
    
    opts = fp.merge(_extract_data(docs, vect, lda)[2], kwargs)

    return vis_prepare(**opts)

vis_data = prepare(docs, tf_vectorizer, lda)





AttributeError: 'list' object has no attribute 'str'

# NMF

In [15]:
from sklearn.decomposition import NMF


n_samples = 100000
n_features = 10000
n_topics = 6
n_top_words = 20


# # Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=n_features)


t0 = time()
tfidf = tfidf_vectorizer.fit_transform(review_flatten[:n_samples])
print("done in %0.3fs." % (time() - t0))


# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Extracting tf-idf features for NMF...
done in 1.158s.
Fitting the NMF model with tf-idf features,n_samples=100000 and n_features=10000...
done in 3.671s.

Topics in NMF model:
Topic #0:
great food atmosphere beer wine selection service drink place price time music experience staff bar special happy always hour date
Topic #1:
place love really like nice fun awesome cool favorite out friend hang to watch date game amazing looking absolutely recommend
Topic #2:
service very friendly staff nice always excellent attentive slow atmosphere server quick bartender awesome fast customer and helpful waitress super
Topic #3:
good food pretty really beer price drink selection very overall always decent amazing time atmosphere burger wine special fry pizza
Topic #4:
not bar time like one get drink beer really of night im best ive bad no in dont well the
Topic #5:
go back to definitely would want have cant come wait ill eat dont never try going id next get wrong

