https://github.com/susanli2016/Machine-Learning-with-Python

In [2]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [3]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/daveyproctor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [5]:
for w in ['dogs', 'ran', 'discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [6]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daveyproctor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [8]:
import random
text_data = []
with open('data/dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['animate', 'history', 'building', 'bridge', 'discipline']
['porqpine', 'distribute', 'collaborative', 'search', 'engine']
['efficient', 'algorithm', 'mining', 'association', 'rule', 'large', 'database']
['sparql', 'sparql', 'continuous', 'query']
['gothic', 'glare', 'optimizer', 'dynamic', 'range', 'image', 'content', 'implementation', 'video']
['indexing', 'store', 'relational', 'database']
['energy', 'scalable', 'margin', 'propagation', 'base', 'analog', 'support', 'vector', 'machine']
['demonstration', 'scidb', 'science', 'orient']
['engineering', 'visual', 'software', 'circuit', 'board']
['parallel', 'architecture', 'tracing', 'embed', 'intersection', 'algorithm']
['sampling', 'clock', 'jitter', 'estimation', 'compensation', 'circuit']
['joint', 'relevance', 'freshness', 'learning', 'clickthroughs', 'search']
['empirical', 'evaluation', 'technique', 'measuring', 'available', 'bandwidth']
['supporting', 'ontology', 'base', 'semantic', 'match', 'rdbms']
['motion', 'compensation', 's

In [29]:
import pandas as pd

In [30]:
tweetsDF = pd.read_csv("data/tweets.csv")

In [52]:
rawTweets = tweetsDF.loc[:,"full_text"]

## Or, choosing to treat tweet sets as documents

In [53]:
rawTweets = tweetsDF.groupby('twitter_account')['full_text'].apply(lambda x: "%s" % ' --\n '.join(x))

In [54]:
preppedTweets = [prepare_text_for_lda(text) for text in rawTweets]

In [64]:
text_data = preppedTweets

In [65]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [66]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [67]:
import pickle
pickle.dump(corpus, open('data/corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

### Try 5 topics

In [70]:
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model5.gensim')

In [72]:
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.029*"SCREEN_NAME" + 0.009*"people" + 0.007*"support" + 0.007*"congress" + 0.006*"hawaii"')
(1, '0.001*"SCREEN_NAME" + 0.000*"today" + 0.000*"american" + 0.000*"house" + 0.000*"family"')
(2, '0.000*"SCREEN_NAME" + 0.000*"today" + 0.000*"american" + 0.000*"president" + 0.000*"people"')
(3, '0.000*"SCREEN_NAME" + 0.000*"today" + 0.000*"american" + 0.000*"trump" + 0.000*"great"')
(4, '0.000*"SCREEN_NAME" + 0.000*"today" + 0.000*"american" + 0.000*"community" + 0.000*"support"')
(5, '0.000*"SCREEN_NAME" + 0.000*"today" + 0.000*"american" + 0.000*"family" + 0.000*"great"')
(6, '0.000*"SCREEN_NAME" + 0.000*"american" + 0.000*"great" + 0.000*"today" + 0.000*"family"')
(7, '0.064*"SCREEN_NAME" + 0.008*"today" + 0.008*"american" + 0.006*"congress" + 0.006*"trump"')
(8, '0.017*"SCREEN_NAME" + 0.011*"right" + 0.011*"goodtrouble--" + 0.010*"never" + 0.009*"people"')
(9, '0.039*"SCREEN_NAME" + 0.011*"today" + 0.008*"american" + 0.007*"president" + 0.005*"family"')
(10, '0.000*"SCREEN_NAME" + 

In [43]:
corpus[1]

[(1, 1),
 (2, 1),
 (3, 1),
 (4, 2),
 (5, 3),
 (6, 1),
 (7, 1),
 (8, 2),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1)]

In [44]:
ldamodel.get_document_topics(corpus[1])

[(0, 0.010312753),
 (1, 0.010149187),
 (2, 0.010219812),
 (3, 0.7818724),
 (4, 0.18744586)]

In [45]:
text_data[1]

['trumpshutdown',
 'federal',
 'worker',
 'safety',
 'delay',
 'airport',
 'return',
 'process',
 'delay',
 'refund',
 'check',
 'senior',
 'veteran',
 'apply',
 'assistance',
 'process',
 'delay',
 'check',
 'endshutdown']

In [46]:
len(text_data)

8209

In [47]:
tweetsDF.loc[1,"full_text"]

'#TrumpShutdown not just about federal workers, it’s about all of us.  If we fly, safety &amp; delays at airports; if we file tax return, no one to process &amp; delays in refund check; if a Senior or veteran applying for assistance,  no one to process &amp; delay in checks.  #EndShutdown https://t.co/g16aYNihPU'

In [14]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(10, 1), (35, 1), (63, 1)]
[(0, 0.050020237), (1, 0.050021715), (2, 0.5497505), (3, 0.051872116), (4, 0.2983354)]


In [15]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.031*"compensation" + 0.018*"frame" + 0.018*"motion" + 0.018*"decoding"')
(1, '0.035*"algorithm" + 0.035*"sparql" + 0.020*"search" + 0.020*"database"')
(2, '0.023*"circuit" + 0.023*"base" + 0.023*"threat" + 0.023*"eclipse"')


In [16]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('LDAModels/model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.071*"empirical" + 0.071*"available" + 0.071*"measuring" + 0.071*"evaluation"')
(1, '0.073*"base" + 0.038*"machine" + 0.038*"propagation" + 0.038*"energy"')
(2, '0.051*"database" + 0.051*"algorithm" + 0.051*"large" + 0.051*"association"')
(3, '0.071*"search" + 0.071*"joint" + 0.071*"clickthroughs" + 0.071*"relevance"')
(4, '0.047*"h.264/avc" + 0.047*"frame" + 0.047*"reference" + 0.047*"scheme"')
(5, '0.075*"engineering" + 0.075*"visual" + 0.075*"board" + 0.075*"software"')
(6, '0.037*"dynamic" + 0.037*"gothic" + 0.037*"content" + 0.037*"video"')
(7, '0.071*"compensation" + 0.071*"circuit" + 0.071*"jitter" + 0.071*"estimation"')
(8, '0.085*"sparql" + 0.045*"eclipse" + 0.045*"discipline" + 0.045*"network"')
(9, '0.010*"search" + 0.010*"algorithm" + 0.010*"query" + 0.010*"sparql"')


### pyLDAvis

In [17]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('data/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('LDAModels/model5.gensim')

In [19]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

exception calling callback for <Future at 0x126e1b240 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/parallel.py", line 309, in __call__
    self.parallel.dispatch_next()
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/parallel.py", line 731, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/parallel.py", line 716, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/dav

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {EXIT(1)}

In [20]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

exception calling callback for <Future at 0x126ec5b38 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/parallel.py", line 309, in __call__
    self.parallel.dispatch_next()
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/parallel.py", line 731, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/daveyproctor/.virtualenvs/PolySpeech/lib/python3.7/site-packages/joblib/parallel.py", line 716, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/dav

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {EXIT(1)}

In [25]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
