In [1]:
!pip install nltk
!pip install tomotopy



In [2]:
import tomotopy
import nltk

## Access XML data for topic model

In [3]:
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
# ElementTree picks up the C accelerator automatically, so the alias `et`
# keeps working unchanged for the cells below.
import xml.etree.ElementTree as et

In [4]:
# Load the two training corpora and keep both the parsed tree and its root
# element; the roots are what the tokenisation cells iterate over.
article_tree = et.parse("data/articles-training-byarticle-20181122.xml")
article_root = article_tree.getroot()

publisher_tree = et.parse("data/articles-training-bypublisher-20181122.xml")
publisher_root = publisher_tree.getroot()

In [5]:
# Fetch NLTK's stopword corpus (a no-op when it is already up to date) and
# keep the English entries in a set for fast membership tests.
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
stoplist = set(english_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fwright/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Sanity check: show the stopwords that will be filtered out of every article.
# `stoplist` is a set, so the printed order is arbitrary.
print(stoplist)

{'aren', 'about', 'all', 'by', 'against', 'such', "aren't", 'under', 'while', 'where', 'does', 'not', 's', 'hers', "weren't", 'are', 'before', 'don', 'what', 'has', 'on', 'own', 'your', 'each', "hasn't", 'after', 'further', 'during', 'here', "haven't", 'if', 'shan', "you'd", 'above', "you're", 'wouldn', 'until', 'wasn', 'for', 'yours', 'some', 'was', "couldn't", 'very', 'a', 'he', 'do', "she's", 'were', 'ourselves', 'no', 'then', 'shouldn', 'ours', 'but', 'their', 'or', "don't", 'that', 'to', 'up', 'than', 'll', 'those', 'whom', 'our', 'when', "you'll", 'why', 'will', 'been', 'at', 'theirs', 'did', 'who', 'just', 've', 'me', 'themselves', "should've", 'with', 'you', 'its', 'herself', 'again', 'ain', 'over', 'any', 'same', 'm', 'doesn', 'ma', 'they', "wouldn't", 'most', "mustn't", 'them', 'won', 're', 'hasn', 'mustn', 'into', 'how', 'it', 'both', 'few', 'weren', 'only', 'him', "wasn't", 'couldn', 'd', 'these', 'because', 'and', "shouldn't", 'can', 'so', 'having', "needn't", 'yourselves'

In the cell below, we extract the text of each article to pass into the topic model. We lowercase every word so that capitalized words are treated the same as their lowercase counterparts and are also checked against the stoplist. Tokens are produced by splitting on whitespace; stopwords and any token that is not purely alphabetic (for example, a word with attached punctuation, or a number) are dropped.

In [12]:
# Build one token list per child element of article_root.
# NOTE(review): only `paragraph.text` is read, and tokens that are not purely
# alphabetic (e.g. words with attached punctuation) are discarded - confirm
# this loss is intended.
article_list = []

for article in article_root:
    tokens = []
    for paragraph in article:
        text = paragraph.text
        if text is None:
            continue
        tokens.extend(
            word
            for word in text.lower().split()
            if word.isalpha() and word not in stoplist
        )
    article_list.append(tokens)
print(len(article_list))

645


In [13]:
# Tokenise the by-publisher corpus the same way and append it to article_list.
# NOTE(review): this cell extends the existing article_list in place, so
# running it a second time appends the publisher articles again.
for article in publisher_root:
    article_text = []
    for paragraph in article:
        if paragraph.text is None:
            continue
        lowered_words = paragraph.text.lower().split()
        article_text.extend(
            w for w in lowered_words if w.isalpha() and w not in stoplist
        )
    article_list.append(article_text)
print(len(article_list))

600645


## Topic Modeling

In [14]:
n_topics = 10
n_docs = len(article_list)

# Fix the random seed so the stochastic LDA training is repeatable. The value
# is the seed reported by the recorded model summary for the original run.
# NOTE(review): exact reproducibility may also depend on platform and worker
# count - confirm against the tomotopy documentation.
lda_seed = 3315092819

mdl = tomotopy.LDAModel(k=n_topics, seed=lda_seed)
# NOTE(review): the recorded summary reports fewer docs than len(article_list);
# presumably token lists that are empty are skipped by add_doc - confirm.
for article in article_list:
    mdl.add_doc(article)

In [15]:
# Train in fixed-size chunks and report the per-word log-likelihood after each
# chunk so convergence can be watched.
iters_per_check = 50
n_checks = 1000 // iters_per_check
for check in range(1, n_checks + 1):
    mdl.train(iters_per_check)
    iterations_done = check * iters_per_check
    print('Iteration: {}\tLog-likelihood: {}'.format(iterations_done, mdl.ll_per_word))

Iteration: 50	Log-likelihood: -9.205508485789876
Iteration: 100	Log-likelihood: -9.176238038682477
Iteration: 150	Log-likelihood: -9.166043252812388
Iteration: 200	Log-likelihood: -9.160990202818681
Iteration: 250	Log-likelihood: -9.151298462028565
Iteration: 300	Log-likelihood: -9.145877663961532
Iteration: 350	Log-likelihood: -9.142571228333345
Iteration: 400	Log-likelihood: -9.139582995243835
Iteration: 450	Log-likelihood: -9.137798502337274
Iteration: 500	Log-likelihood: -9.136645595088805
Iteration: 550	Log-likelihood: -9.135171346654491
Iteration: 600	Log-likelihood: -9.133254441994875
Iteration: 650	Log-likelihood: -9.13181960538584
Iteration: 700	Log-likelihood: -9.130851064191335
Iteration: 750	Log-likelihood: -9.13033692686749
Iteration: 800	Log-likelihood: -9.129721959549837
Iteration: 850	Log-likelihood: -9.129472683605906
Iteration: 900	Log-likelihood: -9.129412680815614
Iteration: 950	Log-likelihood: -9.12939344539318
Iteration: 1000	Log-likelihood: -9.129210351095743


In [16]:
# Print the top words of each topic. A single constant drives both the header
# and the query, so the label can no longer disagree with the number of words
# actually shown (the header previously said 25 while 10 were requested).
n_top_words = 10
print("Top {} words by topic".format(n_top_words))
for k in range(n_topics):
    top_words = [word for (word, _prob) in mdl.get_topic_words(k, top_n=n_top_words)]
    print('#{}: {}'.format(k, ' '.join(top_words)))

Top 25 words by topic
#0: first game last two team points second said coach three
#1: would tax health new percent million could money people pay
#2: like one people get think know going time would really
#3: us one people even would american political many war like
#4: said police court two state told law according say one
#5: said new water energy also oil could climate one would
#6: company said percent stock market continue new last billion million
#7: said military government united president would also security international north
#8: trump president said would house obama republican white campaign new
#9: new school said students city church mexico also schools university


In [17]:
# Survey stats. Notably, I'm using all the data as training data - you can also
# generate metrics using the mdl.infer() method if you want to use held-out
# data to see if the topic model generalizes well.
# See https://bab2min.github.io/tomotopy/v0.4.1/en/#tomotopy.LDAModel.infer
# NOTE(review): the link above targets the v0.4.1 docs, while the summary
# output reports tomotopy 0.12.7 - confirm infer() behaves as described there.
mdl.summary()

<Basic Info>
| LDAModel (current version: 0.12.7)
| 597652 docs, 145742375 words
| Total Vocabs: 429180, Used Vocabs: 429180
| Entropy of words: 8.87935
| Entropy of term-weighted words: 8.87935
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 1000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -9.12921
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 10 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 3315092819 (random seed)
| trained in version 0.12.7
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  