In [41]:
import pickle
import pandas as pd
from spacy.en import English, STOPWORDS
import re
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import WordNetLemmatizer 
import numpy as np

#### 1. Load tweets

In [2]:
# Pickle data is binary, so the file is opened in 'rb' mode: text mode breaks
# on Python 3 and can corrupt the stream on platforms that translate newlines.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from
# a trusted source.
with open('tweets/kanye_tweets.pkl', 'rb') as picklefile:
    kanye = pickle.load(picklefile)

In [5]:
# Keep only the tweet id and text. The explicit .copy() makes kanye_df an
# independent frame, so adding columns to it later is unambiguous and does not
# raise pandas' SettingWithCopyWarning.
keep = ['id', 'text']
kanye_df = pd.DataFrame(kanye)[keep].copy()

In [53]:
# Inspect the raw tweet text column.
kanye_df.text

0               I'm going to steal Demna from Balenciaga
1      You won't always agree with me but I'm gon alw...
2      The world needs a guy like me. The world needs...
3      I love being a voice of freedom when so many p...
4                     I represent what people can't say.
5                           New album coming this summer
6      I'm not even gon lie to you. I love me so much...
7      And yes I've talked to Adidas and we gon hook ...
8      That is one of the reasons I respect the paps ...
9      Respect to everyone working hard to keep a roo...
10     What is your definition of true freedom? There...
11     I just had to start with shoes so that I could...
12     You guys will see my heart. You guys will feel...
13     It's a beautiful time.  I love my friends. I l...
14          Free from being held back by public opinion.
15     I'm happy and free and proud and confident. I'...
16     I'm proud as an entrepreneur that I gave every...
17     I wrote Saint Pablo afte

#### 2. Get tokens

In [8]:
# Build the spaCy English pipeline once; get_tokens() calls it for every tweet.
nlp = English()

In [9]:
def get_tokens(text):
    """Return the lemmatized tokens of ``text`` joined by single spaces.

    The text is cleaned before tokenization: anything matching ``http\\S+``
    is removed, every character that is not an ASCII letter is replaced by a
    space, and the result is lower-cased with runs of whitespace collapsed.
    The cleaned string is then passed to the module-level ``nlp`` pipeline
    and each token's ``lemma_`` is collected.

    Parameters
    ----------
    text : str
        A single document (here: one tweet).

    Returns
    -------
    str
        Space-separated lemmas; whitespace-only lemmas are dropped.
    """
    text = re.sub(r"http\S+", "", text)
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = ' '.join(letters_only.lower().split())

    # On Python 2 a plain ``str`` is a byte string and must be decoded before
    # it is handed to the pipeline. Checking the type explicitly (instead of a
    # bare ``except:`` around the call) lets genuine tokenizer errors surface.
    # At this point ``words`` holds only ASCII letters and spaces, so the
    # decode cannot fail.
    if isinstance(words, bytes):
        words = words.decode('utf8')

    lemmas = (token.lemma_ for token in nlp(words))
    # Drop empty / whitespace-only lemmas ('', ' ', '\n', '\n\n', ...).
    return ' '.join(lemma for lemma in lemmas if lemma.strip())

In [12]:
# Add a 'tokens' column holding the lemmatized form of every tweet.
kanye_df['tokens'] = kanye_df['text'].apply(get_tokens)

#### 3. Create LDA model / prep pyLDAvis inputs
**Requirements**
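* topic-term distributions (`topic_term_dists`)
* document-topic distributions (`doc_topic_dists`)
* document lengths (`doc_lengths`)
* list of words in the corpus vocabulary (`vocab`)
* term frequencies (`term_frequency`)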

In [60]:
def _row_normalize(data):
    """Return ``data`` as a DataFrame in which every row is divided by its sum."""
    frame = pd.DataFrame(data)
    return frame.div(frame.sum(axis=1), axis=0)


def prep_pylda(docs, n_topics = 5):
    """Fit an LDA topic model on ``docs`` and build the pyLDAvis inputs.

    The documents are vectorized with TF-IDF over 1- to 3-grams, an online
    LatentDirichletAllocation model is fitted on the resulting matrix, and
    the pieces ``pyLDAvis.prepare`` asks for are assembled.

    Parameters
    ----------
    docs : iterable of str
        One pre-tokenized, space-separated string per document (for example
        a pandas Series). It is iterated more than once, so it must not be a
        one-shot generator.
    n_topics : int, default 5
        Number of topics to fit.

    Returns
    -------
    The object returned by ``pyLDAvis.prepare``, ready for
    ``pyLDAvis.display`` or ``pyLDAvis.save_html``.
    """
    vect = TfidfVectorizer(max_df = 0.5, max_features = 10000,
                                 min_df = 5, stop_words = STOPWORDS,
                                 use_idf = True, tokenizer = None, ngram_range=(1, 3))
    matrix = vect.fit_transform(docs)
    vocab = vect.get_feature_names()

    # fit transform lda
    # NOTE(review): `n_topics` is the keyword this notebook was written
    # against; newer scikit-learn releases appear to call it `n_components`
    # -- confirm against the installed version before upgrading.
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                learning_method='online', learning_offset=50.,
                random_state=0, doc_topic_prior = .001)
    doc_topic_dists = lda.fit_transform(matrix)

    # Document length = number of tokens per document. The character count of
    # the joined token string would over-weight documents with long words.
    # A plain list (rather than a Series) avoids any index alignment against
    # the freshly indexed doc-topic frame when `docs` has a non-default index.
    doc_lengths = [len(doc.split()) for doc in docs]

    # Summed weight of each vocabulary term over all documents.
    term_frequency = np.asarray(matrix.sum(axis = 0)).ravel().tolist()

    # prepare pyLDAvis stuff; both distributions are normalized so that each
    # row sums to 1.
    prepared = pyLDAvis.prepare(
            doc_lengths = doc_lengths,
            vocab = vocab,
            term_frequency = term_frequency,
            topic_term_dists = _row_normalize(lda.components_),
            doc_topic_dists = _row_normalize(doc_topic_dists))

    return prepared

In [75]:
# Fit a three-topic model on the lemmatized tweets.
prepared = prep_pylda(docs=kanye_df['tokens'], n_topics=3)

#### 4. Create pyLDAvis
* how prevalent is each topic?
* how do topics relate to one another?
* what is the meaning of each topic? what words commonly occur?

In [76]:
# Render the prepared visualization inline in the notebook.
pyLDAvis.display(prepared)

In [74]:
# Export the prepared visualization to the HTML file 'kanye.html'.
pyLDAvis.save_html(prepared, 'kanye.html')

#### 5. More info

* https://pyldavis.readthedocs.io
* http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
* https://cran.r-project.org/web/packages/LDAvis/vignettes/details.pdf