In [37]:
import warnings

# Install a blanket "ignore" filter so that no warning is shown in cell outputs.
warnings.filterwarnings(action='ignore')

In [6]:
import spacy
from spacy.lang.en import English

# A blank English pipeline is all that tokenize() needs (token text, URL
# detection, lower-casing). No `spacy.load(...)` call is made: its result was
# never used, and the 'en' shortcut name is not available in spaCy 3, so the
# call would fail on a fresh environment.
parser = English()

def tokenize(text):
    """Split ``text`` into normalized tokens using the module-level ``parser``.

    Whitespace-only tokens are dropped, URL-like tokens become ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is returned in lower case.

    Returns:
        list[str]: the normalized tokens in their original order.
    """
    def _normalize(tok):
        # URL detection takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalize(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [8]:
import nltk

# Fetch every NLTK resource this notebook reads so it also runs on a machine
# without pre-installed NLTK data: the stopwords corpus backs the
# `stopwords.words('english')` lookup, WordNet backs the lemmatizers.
# Packages that are already up to date are not downloaded again.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sony\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

def get_lemma(word):
    """Return the base form that ``wn.morphy`` finds for ``word``.

    When ``wn.morphy`` returns ``None`` the word is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [11]:
# Print each sample word next to the result of both lemmatizer helpers.
for word in ('dogs', 'ran', 'discouraged'):
    print(word, get_lemma(word), get_lemma2(word))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [14]:
from nltk.corpus import stopwords

# English stopwords collected into a set for fast membership tests.
en_stop = {*stopwords.words('english')}

In [15]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the token list used for topic modelling.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer and tokens present in ``en_stop`` are discarded; the remaining
    tokens are mapped through ``get_lemma``.

    Returns:
        list[str]: lemmatized tokens in their original order.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

In [20]:
import random

SAMPLE_FRACTION = 0.01  # share of input lines kept for modelling (about 1%)
RANDOM_SEED = 42        # fixed so a fresh run selects the same lines

random.seed(RANDOM_SEED)
text_data = []

# NOTE(review): only the sampled lines are added to text_data, so the models
# below are trained on roughly 1% of the file. Confirm this is intended.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used. Confirm the encoding of dataset.csv and pass it explicitly.
with open('../Misc/dataset.csv') as f:
    for line in f:
        # Decide first whether the line is sampled, so the tokenize/lemmatize
        # work is only spent on lines that are actually kept.
        if random.random() >= SAMPLE_FRACTION:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

['record', 'subtype', 'facility', 'database', 'system']
['reducing', 'spurious', 'tone', 'spectrum', 'sensing', 'architecture']
['cooperative', 'multihoming', 'user', 'access', 'point', '802.11', 'wlan']
['uncovering', 'locally', 'characterize', 'region', 'within', 'geotagged']
['shading', 'bicubic', 'patch']
['challenge', 'issue', 'result']
['efficient', 'decision', 'feedforward', 'equalizer', 'parallelizable', 'architecture']
['scalable', 'secret', 'generation', 'exploit', 'channel', 'phase', 'randomness', 'wireless', 'network']
['dynamic', 'cross', 'layer', 'association', '802.11-based', 'network']
['2-pin', 'input', 'multi', 'frequency', 'power', 'scavenge', 'wireless', 'sensor', 'node']
['class', 'reliable', 'base', 'transport', 'protocol', 'base', 'stochastic', 'approximation']
['bonding', 'triangular', 'spiral', 'inductor', 'switching', 'power', 'converter']
['bandwidth', 'allocation', 'virtual', 'path', 'investigation', 'performance', 'classical', 'constrain', 'genetic', 'algor

In [41]:
from gensim import corpora

# Build the token-to-id dictionary from the token lists in text_data.
dictionary = corpora.Dictionary(documents=text_data)

In [42]:
# Convert every token list into its bag-of-words form: (token id, count) pairs.
corpus = list(map(dictionary.doc2bow, text_data))

In [43]:
import pickle

# Persist the bag-of-words corpus and the dictionary for the cells that reload
# them. The context manager guarantees the pickle file is flushed and closed
# instead of leaving an open handle behind.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [45]:
import gensim

NUM_TOPICS = 5
# Train a 5-topic LDA model with 15 passes over the corpus. random_state pins
# the random initialisation so re-running the cell on the same corpus
# reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model5.gensim')

In [50]:
# List each topic together with its four highest-weighted words.
topics = ldamodel.print_topics(num_words=4)
for entry in topics:
    print(entry)

(0, '0.023*"network" + 0.023*"system" + 0.023*"region" + 0.023*"characterize"')
(1, '0.020*"decision" + 0.020*"investigation" + 0.020*"technique" + 0.020*"algorithm"')
(2, '0.035*"base" + 0.019*"architecture" + 0.019*"network" + 0.019*"transport"')
(3, '0.024*"architecture" + 0.024*"ratio" + 0.024*"oversampling" + 0.024*"double"')
(4, '0.042*"power" + 0.023*"system" + 0.023*"base" + 0.023*"modelling"')


In [51]:
# Preprocess an unseen title exactly like the training lines, convert it to
# bag-of-words with the existing dictionary, and print its topic distribution.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(70, 1), (92, 1)]
[(0, 0.40078756), (1, 0.3991186), (2, 0.06669347), (3, 0.06670118), (4, 0.06669918)]


In [52]:
# Retrain with 3 topics and save the model. random_state pins the random
# initialisation so the printed topics are reproducible for the same corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.029*"system" + 0.017*"decision" + 0.016*"behavioral" + 0.016*"implementation"')
(1, '0.026*"network" + 0.026*"base" + 0.015*"wireless" + 0.015*"platform"')
(2, '0.022*"architecture" + 0.022*"power" + 0.022*"base" + 0.012*"sensor"')


In [53]:
# Retrain with 10 topics and save the model. random_state pins the random
# initialisation so the printed topics are reproducible for the same corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.064*"filter" + 0.064*"expansion" + 0.064*"depth" + 0.064*"adaptive"')
(1, '0.039*"base" + 0.039*"power" + 0.039*"wireless" + 0.039*"multi"')
(2, '0.057*"spurious" + 0.057*"tone" + 0.057*"spectrum" + 0.057*"sensing"')
(3, '0.061*"network" + 0.032*"secret" + 0.032*"scalable" + 0.032*"generation"')
(4, '0.060*"system" + 0.060*"subtype" + 0.060*"database" + 0.060*"facility"')
(5, '0.023*"algorithm" + 0.023*"oversampling" + 0.023*"allocation" + 0.023*"classical"')
(6, '0.045*"headstage" + 0.045*"small" + 0.045*"animal" + 0.045*"issue"')
(7, '0.067*"base" + 0.035*"bonding" + 0.035*"spiral" + 0.035*"switching"')
(8, '0.031*"intake" + 0.031*"localize" + 0.031*"minimum" + 0.031*"macromodeling"')
(9, '0.095*"architecture" + 0.050*"decision" + 0.050*"feedforward" + 0.050*"metadata"')


In [54]:
# Reload the artifacts saved by the earlier cells of this notebook.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# Unpickling can execute arbitrary code: only load a corpus.pkl that this
# notebook wrote itself. The context manager closes the file after reading.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [57]:
import pyLDAvis.gensim

# Prepare and render the interactive view of the model loaded from
# 'model5.gensim'; sort_topics=False is passed through to pyLDAvis.
lda_display = pyLDAvis.gensim.prepare(
    topic_model=lda,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display)

In [58]:
# Load the saved 3-topic model and render its interactive view.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(
    topic_model=lda3,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display3)

In [59]:
# Load the saved 10-topic model and render its interactive view.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(
    topic_model=lda10,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display10)

Reference: "Topic Modelling in Python with NLTK and Gensim"
https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21