In [1]:
import pickle
import pandas as pd
import operator

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Load neighborhood descriptions

In [3]:
with open('../data/interim/04_neighborhoods_lemmas.pkl', 'rb') as picklefile:
    df_neighborhoods = pickle.load(picklefile)

In [4]:
df_neighborhoods = df_neighborhoods[df_neighborhoods['neighborhood'] != 'OUTSIDE ZILLOW']

In [5]:
df_neighborhoods.reset_index(drop=True, inplace=True)

In [6]:
len(df_neighborhoods)

895

In [7]:
df_neighborhoods['loc'] = df_neighborhoods['neighborhood'].str.cat(df_neighborhoods['city'], sep=', ').str.cat(df_neighborhoods['state'], sep=', ')

In [8]:
neighborhoods = df_neighborhoods['loc']
docs_raw = df_neighborhoods['lemmas']

In [9]:
len(docs_raw)

895

### Convert to document-term matrix

In [10]:
tf_vectorizer = CountVectorizer(#strip_accents = 'unicode',
                                #stop_words = 'english',
                                #lowercase = True,
                                #token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_features = 500,
                                ngram_range=(1,2),
                                max_df = 0.5, 
                                min_df = 20)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)

(895, 500)


### Fit Latent Dirichlet Allocation models

In [11]:
lda_tf = LatentDirichletAllocation(n_topics=8, random_state=0)
lda_tf.fit(dtm_tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=8, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

### Visualizing the models with pyLDAvis

In [12]:
pyLDAvis.show(pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer))


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [29/Mar/2017 20:15:37] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [29/Mar/2017 20:15:37] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [29/Mar/2017 20:15:37] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [29/Mar/2017 20:15:38] "GET /LDAvis.js HTTP/1.1" 200 -
127.0.0.1 - - [29/Mar/2017 20:15:38] code 404, message Not Found
127.0.0.1 - - [29/Mar/2017 20:15:38] "GET /favicon.ico HTTP/1.1" 404 -



stopping Server...


### Assign clusters

In [37]:
topic_word = lda_tf.components_

In [38]:
doc_topic = lda_tf.transform(dtm_tf)

In [39]:
df_topics = pd.DataFrame(doc_topic)

In [40]:
df_topics['topic'] = df_topics.idxmax(axis=1)

In [41]:
df_topics['neighborhood'] = df_neighborhoods['neighborhood']
df_topics['city'] = df_neighborhoods['city']

In [42]:
df_topics = df_topics[['neighborhood', 'city', 'topic']]

In [43]:
with open('../data/interim/06_topics.pkl', 'wb') as picklefile:
    pickle.dump(df_topics, picklefile)

In [69]:
#csv for d3 visualization of stemplots
df_lda_topic_word = pd.DataFrame(topic_word).T

vocab = tf_vectorizer.vocabulary_
sorted_lda_words = sorted(vocab.items(), key=operator.itemgetter(1))
indices = [x[0].encode('utf-8') for x in sorted_lda_words]

df_lda_topic_word['word'] = indices

#topic_word importances
df_lda_topic_word.to_csv('../reports/viz/data/lda_topic_word.csv', index=False)

In [63]:




#words
df_lda_words = pd.DataFrame([x[0].encode('utf-8') for x in sorted(vocab.items(), key=operator.itemgetter(1))])
df_lda_words.to_csv('../reports/viz/data/lda_words.csv', index=False)

In [67]:
df_lda_topic_word.head()

Unnamed: 0,0,1,2,3,4,5,6,7,words
0,9.988002,0.218289,0.193449,1.750816,280.576392,0.252007,135.838667,0.194037,academy
1,119.744616,131.378484,56.79335,187.59779,82.608184,37.274778,150.976403,137.087976,accessible
2,42.219797,120.563459,133.977459,109.96235,7.920506,67.003289,85.735213,9.752491,action
3,30.998187,46.641979,35.575133,102.580729,20.093226,64.636555,57.916566,32.668698,active
4,86.123627,150.474835,78.066619,154.675887,137.556536,389.597858,143.409593,56.983045,activity
