In [1]:
import spacy
from spacy.lang.en import English

# The previous `spacy.load('en')` call discarded its return value, so it only
# cost load time (and fails outright where the 'en' shortcut is unavailable).
# tokenize() reads token-level attributes only (orth_, like_url, lower_), so
# the language class is instantiated directly instead of loading a model.
parser = English()
def tokenize(text):
    """Split ``text`` into a list of normalised token strings.

    Whitespace-only tokens are dropped, URL-like tokens become the
    placeholder ``'URL'``, tokens starting with ``@`` become
    ``'SCREEN_NAME'``, and every other token is lower-cased.
    """
    def normalise(token):
        # Placeholders take priority over the plain lower-cased form.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]


In [3]:
import nltk
# Download the WordNet package into the local nltk_data directory; the `wn`
# corpus reader imported below is what get_lemma() queries via wn.morphy().
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to /Users/nando/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Download the stopword lists, then keep the English ones in a set so that
# membership checks in prepare_text_for_lda() are constant-time.
nltk.download('stopwords')
en_stop = {
    stop_word
    for stop_word in nltk.corpus.stopwords.words('english')
}

[nltk_data] Downloading package stopwords to /Users/nando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of tokens fed to the LDA model.

    The text is tokenized with ``tokenize``; only tokens longer than four
    characters that are not in ``en_stop`` are kept, and each survivor is
    replaced by its lemma from ``get_lemma``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [9]:
import random

# A line is kept only when random.random() exceeds this value, i.e. roughly
# 1% of the file ends up in text_data.
# NOTE(review): the generator is not seeded here, so the sample (and every
# model trained from it) differs between runs.
SAMPLE_THRESHOLD = .99

text_data = []
with open('./data/dataset.csv') as f:
    for line in f:
        # Draw before tokenizing: exactly one random() call per line is still
        # made, so the same lines are selected, but the tokenize/lemmatize
        # work is skipped for the lines that are thrown away.
        if random.random() <= SAMPLE_THRESHOLD:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

['analysis', 'price', 'competition', 'slot', 'resource', 'allocation']
['reconfigurable', 'array', 'concurrent', 'support', 'multiple', 'radio', 'standard', 'flexible', 'mapping']
['highly', 'programmable', 'infrastructure', 'prototyping', 'developing', 'deploy', 'genomics', 'centric', 'application']
['converter', 'fix', 'switching', 'frequency', 'adaptive', 'multi', 'control', 'scheme']
['acquisition', 'large', 'scale', 'surface', 'light', 'fields']
['smooth', 'polyhedron', 'using', 'implicit', 'algebraic', 'spline']
['truthful', 'incentive', 'mechanism', 'anonymity', 'location', 'privacy']
['dynamic', 'partial', 'reconfigurable', 'pruning', 'base', 'cognitive', 'radio']
['protdb', 'probabilistic']
['silicon', 'sapphire', 'voltage', 'temperature', 'sensor', 'energy', 'scavenger']
['wysiwyg', 'drawing', 'stroke', 'directly', 'model']
['complexity', 'reduction', 'polyphase', 'linear', 'phase', 'filter', 'symmetric', 'coefficient', 'implementation']
['streaming', 'code', 'channels', 'bur

In [16]:
from gensim import corpora
import pickle

# Build the token -> id mapping from the sampled documents, then encode each
# document as a bag-of-words using that mapping.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts for the later cells that reload them. The `with`
# block closes the file handle deterministically; the bare
# open() passed straight to pickle.dump() was never closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [17]:
import gensim

# Train an LDA model with NUM_TOPICS topics over the bag-of-words corpus,
# save it to disk, and print the four highest-weighted words of each topic.
NUM_TOPICS = 5
ldamodel = gensim.models.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model5.gensim')

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.041*"query" + 0.029*"radio" + 0.029*"analysis" + 0.028*"system"')
(1, '0.023*"energy" + 0.023*"sensor" + 0.023*"scavenger" + 0.023*"temperature"')
(2, '0.027*"delta" + 0.027*"modulator" + 0.027*"sigma" + 0.027*"base"')
(3, '0.035*"base" + 0.019*"code" + 0.019*"location" + 0.019*"using"')
(4, '0.015*"reduction" + 0.015*"learn" + 0.015*"element" + 0.015*"complexity"')


In [18]:
# Run an unseen title through the same preprocessing as the training data,
# encode it with the existing dictionary, and ask the current `ldamodel`
# for its topic distribution.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(84, 1)]
[(0, 0.100019604), (1, 0.100029156), (2, 0.10001868), (3, 0.5999141), (4, 0.100018464)]


In [19]:
# Retrain on the same corpus with 3 topics; this rebinds `ldamodel`, so any
# later use of that name refers to this model rather than the earlier one.
ldamodel = gensim.models.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model3.gensim')

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.012*"location" + 0.012*"centric" + 0.012*"application" + 0.012*"genomics"')
(1, '0.026*"base" + 0.018*"query" + 0.018*"radio" + 0.018*"system"')
(2, '0.020*"using" + 0.020*"order" + 0.020*"power" + 0.011*"reconfigurable"')


In [20]:
# Retrain on the same corpus with 10 topics; this rebinds `ldamodel` again
# and writes the result to its own file.
ldamodel = gensim.models.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model10.gensim')

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.056*"analysis" + 0.029*"explore" + 0.029*"route" + 0.029*"trade"')
(1, '0.049*"location" + 0.049*"anonymity" + 0.049*"truthful" + 0.049*"incentive"')
(2, '0.022*"multi" + 0.022*"control" + 0.022*"switching" + 0.022*"adaptive"')
(3, '0.049*"code" + 0.049*"isolate" + 0.049*"streaming" + 0.049*"erasure"')
(4, '0.042*"base" + 0.042*"radio" + 0.042*"reconfigurable" + 0.022*"linear"')
(5, '0.033*"model" + 0.033*"energy" + 0.033*"sensor" + 0.033*"system"')
(6, '0.006*"query" + 0.006*"language" + 0.006*"robot" + 0.006*"system"')
(7, '0.053*"sigma" + 0.053*"modulator" + 0.053*"delta" + 0.028*"order"')
(8, '0.053*"query" + 0.028*"using" + 0.028*"element" + 0.028*"clustering"')
(9, '0.049*"power" + 0.026*"line" + 0.026*"translation" + 0.026*"lookaside"')


In [22]:
import pyLDAvis.gensim

# Reload the artefacts saved by the earlier cells rather than relying on the
# in-memory objects, then build the interactive view of the 5-topic model.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# `with` closes the handle once the corpus is read; the bare open() passed
# straight to pickle.load() was never closed.
# NOTE: pickle.load executes arbitrary code from the file, so only load
# corpus.pkl files produced by this notebook.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [23]:
# Reload the saved 3-topic model and render its pyLDAvis view against the
# same corpus and dictionary.
lda3 = gensim.models.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(
    lda3,
    corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display3)

In [24]:
# Reload the saved 10-topic model and render its pyLDAvis view against the
# same corpus and dictionary.
lda10 = gensim.models.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(
    lda10,
    corpus,
    dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display10)