In [1]:
import pickle

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Load neighborhood descriptions

In [24]:
with open('../data/interim/04_neighborhoods.pkl', 'rb') as picklefile:
    df_neighborhoods = pickle.load(picklefile)

In [25]:
df_neighborhoods = df_neighborhoods[df_neighborhoods['neighborhood'] != 'OUTSIDE ZILLOW']

In [26]:
df_neighborhoods.reset_index(drop=True, inplace=True)

In [27]:
len(df_neighborhoods)

895

In [28]:
df_neighborhoods['loc'] = df_neighborhoods['neighborhood'].str.cat(df_neighborhoods['city'], sep=', ').str.cat(df_neighborhoods['state'], sep=', ')

In [29]:
neighborhoods = df_neighborhoods['loc']
docs_raw = df_neighborhoods['lemmas']

In [30]:
len(docs_raw)

895

### Convert to document-term matrix

In [31]:
tf_vectorizer = CountVectorizer(#strip_accents = 'unicode',
                                #stop_words = 'english',
                                #lowercase = True,
                                #token_pattern = r'\b[a-zA-Z]{3,}\b',
                                ngram_range=(1,2),
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)

(895, 15731)


In [32]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)

(895, 15731)


### Fit Latent Dirichlet Allocation models

In [33]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_topics=50, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_topics=50, random_state=0)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=50, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

### Visualizing the models with pyLDAvis

In [34]:
pyLDAvis.show(pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer))


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [22/Mar/2017 11:52:41] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2017 11:52:41] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2017 11:52:42] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2017 11:52:42] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...


In [19]:
pyLDAvis.show(pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer))


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [22/Mar/2017 10:48:45] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2017 10:48:45] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2017 10:48:46] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [22/Mar/2017 10:48:46] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...
