In [1]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [6]:
import nltk

In [7]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [8]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [9]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [11]:
import random
text_data = []
with open('../data/dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['mrsioncase', 'glasses', 'mix', 'reality', 'showcase', 'surround', 'multiple', 'viewers']
['interactive', 'lighting', 'effects', 'using', 'point', 'cloud']
['record', 'subtype', 'facility', 'database', 'system']
['towards', 'theory', 'model', 'product', 'search']
['automate', 'synthesis', 'executable', 'service', 'composition', 'bpel4ws', 'process']
['locating', 'sensor', 'forest', 'study', 'greenorbs']
['spatio', 'temporal', 'access', 'method', 'timestamp', 'interval', 'query']
['network', 'performance', 'anomaly', 'detection', 'localization']
['design', 'reliable', '2&times;vdd', '3&times;vdd', 'series', 'parallel', 'charge', 'pump', 'nanoscale']
['novel', 'structure', 'design', 'cascade', 'continuous', 'delta', 'sigma', 'modulators']
['improvement', 'proportionate', 'algorithm']
['bifurcation', 'frequency', 'control', 'resonant', 'converter']
['structure', 'broad', 'topic']
['query', 'segmentation', 'search']
['stage', 'series', 'base', 'neural', 'network', 'approach', 'nonlinear',

In [12]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [13]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [14]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

### Try 5 topics

In [15]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [16]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.029*"architecture" + 0.029*"database" + 0.016*"client" + 0.016*"management"')
(1, '0.029*"method" + 0.029*"design" + 0.029*"multi" + 0.016*"domain"')
(2, '0.026*"query" + 0.026*"series" + 0.026*"base" + 0.026*"access"')
(3, '0.036*"design" + 0.020*"search" + 0.020*"structure" + 0.020*"performance"')
(4, '0.022*"service" + 0.022*"point" + 0.022*"using" + 0.022*"bpel4ws"')


In [17]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(64, 1)]
[(0, 0.10002429), (1, 0.5998906), (2, 0.100021824), (3, 0.10003004), (4, 0.10003324)]


In [18]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.026*"search" + 0.026*"network" + 0.018*"query" + 0.018*"approach"')
(1, '0.021*"design" + 0.021*"method" + 0.020*"client" + 0.020*"control"')
(2, '0.021*"sigma" + 0.021*"delta" + 0.021*"database" + 0.012*"structure"')


In [19]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.064*"sensor" + 0.033*"query" + 0.033*"performance" + 0.033*"method"')
(1, '0.032*"multi" + 0.032*"optimize" + 0.032*"asynchronous" + 0.032*"neighbor"')
(2, '0.031*"method" + 0.031*"design" + 0.031*"dynamics" + 0.031*"glasses"')
(3, '0.058*"structure" + 0.058*"broad" + 0.058*"topic" + 0.005*"query"')
(4, '0.050*"service" + 0.026*"approach" + 0.026*"filter" + 0.026*"noise"')
(5, '0.027*"digital" + 0.027*"fractional" + 0.027*"variable" + 0.027*"delay"')
(6, '0.066*"search" + 0.045*"query" + 0.023*"network" + 0.023*"component"')
(7, '0.030*"design" + 0.030*"series" + 0.030*"database" + 0.030*"reliable"')
(8, '0.038*"control" + 0.038*"using" + 0.038*"management" + 0.038*"client"')
(9, '0.058*"sigma" + 0.058*"delta" + 0.031*"point" + 0.031*"design"')


### pyLDAvis

In [22]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [23]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [24]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [25]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
