In [1]:
from time import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pickle

In [12]:
# Tunable settings for the topic-modelling run.
# Vocabulary cap handed to CountVectorizer as max_features.
n_features = 5000
# Topic count requested from LatentDirichletAllocation.
n_topics = 12
# Number of highest-weighted words print_top_words shows for each topic.
n_top_words = 20

In [3]:
def load_data():
    """Load the pickled review corpus from ``business_docs.p``.

    The file is looked up relative to the current working directory.

    Returns:
        The unpickled object. Later cells iterate over it as
        ``(b_id, txt)`` pairs.

    Raises:
        FileNotFoundError: if ``business_docs.p`` does not exist.
    """
    print("Loading dataset...")
    # SECURITY: unpickling can execute arbitrary code -- only load a file
    # that comes from a trusted source.
    # The context manager closes the handle deterministically; the previous
    # bare open() left that to the garbage collector.
    with open("business_docs.p", "rb") as fh:
        return pickle.load(fh)

In [4]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: sequence mapping a column index to its term.
        n_top_words: how many terms to list per topic.

    For each topic a ``Topic #<idx>:`` header is printed, followed by the
    terms in descending weight order joined by spaces. One blank line is
    printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending: flip it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[col] for col in ranked]
        print(" ".join(top_terms))
    print()

In [5]:
# Read the pickled corpus; the following cell unpacks it as (b_id, txt) pairs.
review_data = load_data()

Loading dataset...


In [6]:
# Split the (business id, text) pairs into two parallel lists that share
# the same ordering.
review_data_businesses = [business for business, _ in review_data]
review_data_text = [document for _, document in review_data]

In [13]:
# Build the term-count matrix for the review texts and report how long the
# vectorisation took.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
t0 = time()
tf = tf_vectorizer.fit_transform(review_data_text)
print("done in %0.3fs." % (time() - t0,))

done in 37.546s.


In [14]:
print("Fitting LDA models with tf features, and n_features=%d..."
      % (n_features))
# Settings that are independent of how the topic-count argument is spelled.
lda_params = dict(max_iter=5, learning_method='online', learning_offset=50.,
                  random_state=0)
try:
    # scikit-learn 0.19 renamed the topic-count argument to `n_components`;
    # the old `n_topics` spelling was deprecated and later removed.
    lda = LatentDirichletAllocation(n_components=n_topics, **lda_params)
except TypeError:
    # Releases older than 0.19 only accept the original `n_topics` keyword.
    lda = LatentDirichletAllocation(n_topics=n_topics, **lda_params)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, and n_features=5000...
done in 175.221s.


In [15]:
# Run the fitted model over the same term-count matrix it was trained on.
# Per scikit-learn's documentation this yields one topic distribution per
# document; the result is pickled to lda_Bmatrix.p further down.
lda_out = lda.transform(tf)

In [16]:
# Vocabulary terms, indexed by the column positions of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(), so prefer the new method when present.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    # The new method returns an ndarray; convert to a plain list so the
    # object pickled later keeps the type the old method produced.
    feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tf_vectorizer.get_feature_names()

In [17]:
# Show the n_top_words highest-weighted vocabulary terms of each fitted topic.
print_top_words(lda, feature_names, n_top_words)

Topic #0:
hair great place time did massage salon nails just like ve good amazing job cut really best love got nail
Topic #1:
like just room pool time great place nice people area really park vegas clean don airport day staff parking free
Topic #2:
hotel room stay la et le casino strip nice service est rooms stayed montreal vegas night les place und pour
Topic #3:
food good great service restaurant menu just place ordered like dinner really time table nice came meal salad server delicious
Topic #4:
store like place great just love coffee shop good selection really don time ve staff friendly location little nice prices
Topic #5:
service car time did customer great just work told said called new got went company didn took came day job
Topic #6:
food good place like service great sushi just chicken time rice restaurant really order ordered ve try best fresh eat
Topic #7:
food good place burger just great like service breakfast fries time really chicken order ordered got try wait delicious

In [18]:
# Persist the transformed matrix. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open("lda_Bmatrix.p", "wb") as fh:
    pickle.dump(lda_out, fh)

In [20]:
# Persist the vocabulary. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way through.
with open("lda_selected_features.p", "wb") as fh:
    pickle.dump(feature_names, fh)