In [77]:
!pip install nltk
!pip install tomotopy



In [45]:
import tomotopy
import nltk

## Access XML data for topic model

In [46]:
import xml.etree.cElementTree as et

In [47]:
article_tree = et.parse("data/articles-training-byarticle-20181122.xml")
publisher_tree = et.parse("data/articles-training-bypublisher-20181122.xml")

article_root = article_tree.getroot()
publisher_root = publisher_tree.getroot()

In [74]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stoplist = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fwright/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
print(stoplist)

{'further', 'but', 'having', 'very', 't', 'or', 'isn', 'y', 'had', 'can', "wasn't", 'after', 'wouldn', 'before', 'itself', 'each', 'haven', 'do', "hadn't", 'yours', 'just', 'nor', 'and', 'will', 've', 'she', 'then', 's', 'both', 'him', 'there', 'how', 'ain', 'all', 'because', 'mightn', 'again', 'i', 'ours', 'over', 'we', 'off', 'you', 'being', 'weren', 'didn', 'which', 'no', 'this', 'on', 'only', 'he', 'same', "don't", 'with', 'hasn', 'at', "shouldn't", 'll', 'be', 'our', 'needn', 'ourselves', 'shouldn', 'aren', 'does', 'until', 'themselves', "isn't", 'so', "didn't", 'of', 'them', 'mustn', 'doing', 'to', "aren't", 'in', 'when', 'about', 'theirs', 'these', 'too', "you're", 'should', 'it', 'now', 'as', 'an', 'up', "that'll", 'more', 'your', 'few', 'that', 'ma', 'has', 'was', "you've", "you'd", 'm', "weren't", "should've", 'above', 'its', "hasn't", 'd', 'yourselves', "she's", 'herself', "couldn't", 'their', 'were', 'own', 'during', 'doesn', 'by', "needn't", 'am', 'some', 'any', 'into', 'm

In the cell below, we get the text out of each of our articles to pass into the topic model. We apply lowercase to all of our words so that words with capital letters are treated the same as their lowercase counterparts, and so that words with capital letters will also be checked against the stoplist.

In [76]:
article_list = []

for article in article_root:
    article_text = []
    for paragraph in article:
        if paragraph.text is not None:
            article_text += [word for word in paragraph.text.lower().split() if word not in stoplist and word.isalpha()]
    article_list += [article_text]

## Topic Modeling

In [79]:
# movie reviews dataset, TODO: remove
nltk.download('movie_reviews')
nltk.download('punkt')
from nltk.corpus import movie_reviews
file_ids = movie_reviews.fileids()

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/fwright/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /Users/fwright/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [80]:
# 10-topic model populated with documents from our reviews (with stopwords removed)
n_topics = 10
n_docs = len(article_list)

mdl = tomotopy.LDAModel(k=n_topics)
for article in article_list:
  mdl.add_doc(article)

In [81]:
# Most converging will happen fast, but we'll run for 1000 iterations just in case
# (this will take a minute)
iters_per_check = 50
for i in range(0, 1000, iters_per_check):
    mdl.train(iters_per_check)
    print('Iteration: {}\tLog-likelihood: {}'.format(i+iters_per_check, mdl.ll_per_word))

Iteration: 50	Log-likelihood: -9.032468042409862
Iteration: 100	Log-likelihood: -8.948392593473928
Iteration: 150	Log-likelihood: -8.906582864824466
Iteration: 200	Log-likelihood: -8.899380021034613
Iteration: 250	Log-likelihood: -8.888851858546053
Iteration: 300	Log-likelihood: -8.872216646944098
Iteration: 350	Log-likelihood: -8.86752147324241
Iteration: 400	Log-likelihood: -8.859647285779737
Iteration: 450	Log-likelihood: -8.862539261893579
Iteration: 500	Log-likelihood: -8.861383730908267
Iteration: 550	Log-likelihood: -8.856491308112238
Iteration: 600	Log-likelihood: -8.855941623347977
Iteration: 650	Log-likelihood: -8.852751379981433
Iteration: 700	Log-likelihood: -8.853367578415355
Iteration: 750	Log-likelihood: -8.85422254582576
Iteration: 800	Log-likelihood: -8.844701276642022
Iteration: 850	Log-likelihood: -8.84463768760105
Iteration: 900	Log-likelihood: -8.845150533087159
Iteration: 950	Log-likelihood: -8.841082747211596
Iteration: 1000	Log-likelihood: -8.849710062506954


In [82]:
# Print top 25 words of each topic
print("Top 25 words by topic")
for k in range(n_topics):
    print('#{}: {}'.format(k, ' '.join([w for (w, prop) in mdl.get_topic_words(k, top_n=10)])))

Top 25 words by topic
#0: hurricane global florida warming still gold also climate data natural
#1: people like think one know going get time want say
#2: one would us even also media government political many article
#3: money illegal law federal country must million immigration border state
#4: trump president said donald clinton obama hillary white new house
#5: police said people daily free man officers message according las
#6: white black america left people american class liberal violence world
#7: california los nfl players national israel anthem many stand protest
#8: clinton fbi news investigation department former said comey hillary emails
#9: north press attack war military united news muslim korea august


In [14]:
# Survey stats. Notably, I'm using all the data as training data - you can also
# generate metrics using the mdl.infer() method if you want to use held-out
# data to see if the topic model generalizes well.
# See https://bab2min.github.io/tomotopy/v0.4.1/en/#tomotopy.LDAModel.infer
mdl.summary()

<Basic Info>
| LDAModel (current version: 0.12.7)
| 2000 docs, 701543 words
| Total Vocabs: 38738, Used Vocabs: 38738
| Entropy of words: 8.65166
| Entropy of term-weighted words: 8.65166
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 1000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -9.09719
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 10 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 1247601995 (random seed)
| trained in version 0.12.7
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  [0.5152