In [11]:
import nltk
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
import re

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [12]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [13]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [14]:
def prepare_text_for_lda(text):
    text = re.sub('i.e.', 'ie', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r"""
                   [,;@#?!&$\"\'\[\]]+  # Accept one or more copies of punctuation
                   \ *           # plus zero or more copies of a space,
                   """,
                   " ",          # and replace it with a single space
                   text, flags=re.VERBOSE)
    text = re.sub(' +', ' ', text)
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [15]:
import random
text_data = []
with open('../data/test4.txt') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        text_data.append(tokens)
    

In [16]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [17]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [18]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [19]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [20]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.057*"discourse" + 0.049*"state" + 0.049*"attentional" + 0.042*"segment"')
(1, '0.038*"document" + 0.026*"discuss" + 0.026*"look" + 0.026*"dirichlet"')
(2, '0.076*"discourse" + 0.053*"structure" + 0.035*"segment" + 0.030*"intention"')
(3, '0.006*"discourse" + 0.006*"segment" + 0.006*"intention" + 0.006*"structure"')
(4, '0.108*"topic" + 0.075*"document" + 0.058*"threads" + 0.042*"connect"')


In [21]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.039*"discourse" + 0.033*"attentional" + 0.033*"state" + 0.032*"segment"')
(1, '0.067*"threads" + 0.065*"document" + 0.062*"topic" + 0.039*"connect"')
(2, '0.053*"discourse" + 0.034*"structure" + 0.034*"topic" + 0.023*"segment"')


In [22]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.006*"distribution" + 0.006*"look" + 0.006*"behind" + 0.006*"different"')
(1, '0.104*"relationship" + 0.081*"stack" + 0.042*"space" + 0.042*"push"')
(2, '0.006*"level" + 0.006*"topic" + 0.006*"threads" + 0.006*"document"')
(3, '0.057*"relate" + 0.057*"add" + 0.006*"topic" + 0.005*"stack"')
(4, '0.006*"level" + 0.006*"threads" + 0.006*"topic" + 0.006*"document"')
(5, '0.087*"discourse" + 0.051*"segment" + 0.044*"intention" + 0.044*"state"')
(6, '0.068*"discourse" + 0.068*"structure" + 0.035*"theory" + 0.035*"research"')
(7, '0.074*"tutorial" + 0.039*"threads" + 0.039*"analysis" + 0.039*"include"')
(8, '0.102*"topic" + 0.085*"document" + 0.060*"threads" + 0.052*"connect"')
(9, '0.031*"structure" + 0.031*"discourse" + 0.031*"segment" + 0.031*"document"')


In [23]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))

In [24]:
import pyLDAvis.gensim

In [25]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [26]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
