# Latent Dirichlet Allocation (LDA) Topic Modeling
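This notebook fits a Latent Dirichlet Allocation (LDA) topic model to the stemmed corpus with scikit-learn, prints the top words of each topic, and visualizes the resulting topics with pyLDAvis.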

In [12]:
import pickle
import pyLDAvis, pyLDAvis.sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import SnowballStemmer

from IPython.display import display

In [18]:
def print_topic_top_words(model, cv, n_top_words=10):
    """Print the highest-weighted vocabulary terms of every topic.

    For each row of ``model.components_`` a ``Topic <k>:`` header is printed
    (topics are numbered from 1), followed by a single line holding the
    ``n_top_words`` terms with the largest weights, heaviest first and
    separated by spaces.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterable of per-topic weight vectors; each vector is paired
        position-by-position with the vectorizer's feature names.
    cv : object with ``get_feature_names_out()`` or ``get_feature_names()``
        Fitted vectorizer that supplies the vocabulary terms.
    n_top_words : int, default 10
        How many terms to print per topic.

    Returns
    -------
    None
        The function only writes to standard output.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever accessor this vectorizer offers so
    # the helper works on both old and new releases.
    names_getter = getattr(cv, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = cv.get_feature_names
    feature_names = names_getter()

    for topic_num, topic_words in enumerate(model.components_, start=1):
        print('Topic {}:'.format(topic_num))

        # Sort (weight, term) pairs in descending order; equal weights are
        # ordered by term, also descending, because tuples compare
        # element-wise.
        topic_values = sorted(zip(topic_words, feature_names),
                              reverse=True)[:n_top_words]

        print(' '.join(term for _, term in topic_values))


def get_topics_all(corpus, n_topics=5, n_iter=10, n_top_words=None,
                   random_state=None):
    """Stem every document in ``corpus``, fit an LDA model and print its topics.

    All documents from all corpus entries are pooled, stemmed with the English
    Snowball stemmer, turned into a bag-of-words matrix with
    ``CountVectorizer`` and used to fit ``LatentDirichletAllocation``. The top
    words of each fitted topic are printed via ``print_topic_top_words``.

    Parameters
    ----------
    corpus : dict
        Mapping whose values are iterables of documents, each document being
        an iterable of token strings. The keys are not used.
    n_topics : int, default 5
        Number of LDA topics to fit.
    n_iter : int, default 10
        Passed to ``LatentDirichletAllocation`` as ``max_iter``.
    n_top_words : int or None, default None
        Number of words printed per topic. When ``None`` the function falls
        back to a module-level ``n_top_words`` variable if one exists (the
        behaviour earlier revisions relied on implicitly), otherwise 10.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``LatentDirichletAllocation``; pass an int to make the
        fitted topics reproducible across runs.

    Returns
    -------
    tuple
        ``(count_vectorizer, X, lda)``: the fitted vectorizer, the
        document-term matrix it produced, and the fitted LDA model.
    """
    if n_top_words is None:
        # Backward compatibility: this value used to be read from a notebook
        # global, which broke on a fresh kernel if the global was not defined
        # before the call.
        n_top_words = globals().get('n_top_words', 10)

    # The stemmer is stateless with respect to the corpus entry being
    # processed, so build it once rather than once per entry.
    stemmer = SnowballStemmer('english')

    all_text = []
    for docs in corpus.values():
        for tokens in docs:
            all_text.append(" ".join(stemmer.stem(token) for token in tokens))

    # NOTE(review): stop-word filtering runs on already-stemmed text, so stop
    # words whose stem differs from the surface form are not removed (the
    # recorded output contains "mani", presumably from "many"). Left as-is to
    # keep results unchanged -- confirm whether this is intended.
    count_vectorizer = CountVectorizer(stop_words='english',
                                       token_pattern="\\b[a-z][a-z]+\\b")

    X = count_vectorizer.fit_transform(all_text)

    lda_kwargs = dict(max_iter=n_iter, random_state=random_state)
    try:
        # `n_topics` was renamed to `n_components` in scikit-learn 0.19 and
        # the old keyword was later removed.
        lda = LatentDirichletAllocation(n_components=n_topics, **lda_kwargs)
    except TypeError:
        # Very old scikit-learn releases only know the original keyword.
        lda = LatentDirichletAllocation(n_topics=n_topics, **lda_kwargs)

    lda.fit(X)

    print_topic_top_words(lda, count_vectorizer, n_top_words)

    return (count_vectorizer, X, lda)

### Load in the corpus

In [3]:
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until garbage collection.
with open('../data/pickle_jar/corpus_ads_1.p', 'rb') as corpus_file:
    corpus_ads = pickle.load(corpus_file)

### Build dictionary of stems by year

In [5]:
# For every corpus entry, keep only the first element of each item in each
# document (presumably the token of a (token, tag) pair -- TODO confirm
# against the code that produced the pickle).
ad_text = {}
for k, v in corpus_ads.items():
    ad_text[k] = [[x[0] for x in v[i]] for i in range(len(v))]

### Perform 3-topic LDA reduction

In [19]:
# Number of top-weighted words to print per topic; get_topics_all picks this
# up from the notebook namespace.
n_top_words = 20

# Candidate topic counts to fit; this range currently contains only 3.
topic_counts = range(3, 4)

for n in topic_counts:
    header = 'Topics: {}'.format(n)
    print(header)
    # Keep the artifacts of the most recent fit for the visualization cell.
    cv, X, lda = get_topics_all(ad_text, n_topics=n)

Topics: 3




Topic 1:
old mani young long littl good best great black high later alway realli away right real big small final white
Topic 2:
isra islam palestinian militari mani laden israel qaeda al sever recent ist polit british arab prime intern foreign west afghanistan
Topic 3:
militari nation mani polit public feder white general intern bush republican long foreign right legal recent democrat clear nuclear far


### Visualize topics

In [21]:
# Put pyLDAvis into notebook mode so the prepared visualization can be
# rendered inside the notebook output.
pyLDAvis.enable_notebook()

# Build the visualization from the artifacts of the last get_topics_all call:
# the fitted LDA model (lda), the document-term matrix (X) and the fitted
# CountVectorizer (cv).
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.lda_model` instead of `pyLDAvis.sklearn` -- confirm against the
# installed version.
vis = pyLDAvis.sklearn.prepare(lda, X, cv)

# Render the prepared visualization as this cell's output.
display(vis)