In [1]:
!pip install nltk
!pip install tomotopy



In [1]:
import tomotopy
import nltk

## Access XML data for topic model

In [2]:
import xml.etree.cElementTree as et

In [13]:
article_training_tree = et.parse("data/articles-training-byarticle-20181122.xml")
publisher_training_tree = et.parse("data/articles-training-bypublisher-20181122.xml")
publisher_validation_tree = et.parse("data/articles-validation-bypublisher-20181122.xml")

article_training_root = article_training_tree.getroot()
publisher_training_root = publisher_training_tree.getroot()
publisher_validation_root = publisher_validation_tree.getroot()

In [14]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stoplist = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fwright/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
print(stoplist)

{'herself', 'having', 'too', 'should', 'doing', 'about', 'weren', 'ain', 'off', 'him', 'we', 'and', "you'd", 'itself', 'ours', 'under', 'between', 'have', 'to', "it's", 'during', 'how', "mightn't", 'be', 'again', 'yourselves', 'myself', 'which', 'she', 'all', 'do', 'no', 'his', 'had', 'some', 'below', 'only', 'same', "aren't", 'where', 'before', 'nor', 'who', 'if', 'until', 'for', 'what', 'both', 'by', 'above', 'himself', 'other', 'once', 'over', "she's", "you'll", 'or', 'whom', 'while', "weren't", 'my', 'her', 'but', 'on', 'don', 'here', "needn't", 'this', "shouldn't", 'i', 'hers', 'an', 'does', 'own', 'just', 've', 'why', 'with', "don't", 'me', "won't", 'few', 'been', 'wasn', 'doesn', 'very', 'your', 'yourself', 'll', 'then', 'in', 'the', 'will', 'aren', 'hadn', 'being', 'shan', 'after', 'down', "haven't", 'a', 'am', 'so', 'hasn', "you're", 'shouldn', 'them', 'its', 'is', 'from', 'more', 'that', 'can', "should've", 'most', 'mightn', 'ourselves', 'there', 'when', "wouldn't", 'are', 'h

In the cell below, we get the text out of each of our articles to pass into the topic model. We apply lowercase to all of our words so that words with capital letters are treated the same as their lowercase counterparts, and so that words with capital letters will also be checked against the stoplist.

In [16]:
article_list = []

for article in article_training_root:
    article_text = []
    for paragraph in article:
        if paragraph.text is not None:
            article_text += [word for word in paragraph.text.lower().split() if word not in stoplist and word.isalpha()]
    article_list += [article_text]
print(len(article_list))

645


In [17]:
for article in publisher_training_root:
    article_text = []
    for paragraph in article:
        if paragraph.text is not None:
            article_text += [word for word in paragraph.text.lower().split() if word not in stoplist and word.isalpha()]
    article_list += [article_text]
print(len(article_list))

600645


In [18]:
for article in publisher_validation_root:
    article_text = []
    for paragraph in article:
        if paragraph.text is not None:
            article_text += [word for word in paragraph.text.lower().split() if word not in stoplist and word.isalpha()]
    article_list += [article_text]
print(len(article_list))

750645


## Topic Modeling

In [19]:
n_topics = 10
n_docs = len(article_list)

mdl = tomotopy.LDAModel(k=n_topics)
for article in article_list:
  mdl.add_doc(article)

In [20]:
iters_per_check = 50
for i in range(0, 1000, iters_per_check):
    mdl.train(iters_per_check)
    print('Iteration: {}\tLog-likelihood: {}'.format(i+iters_per_check, mdl.ll_per_word))

Iteration: 50	Log-likelihood: -9.191219393310083
Iteration: 100	Log-likelihood: -9.165878648728706
Iteration: 150	Log-likelihood: -9.15899475218593
Iteration: 200	Log-likelihood: -9.15526140210349
Iteration: 250	Log-likelihood: -9.153324367351884
Iteration: 300	Log-likelihood: -9.152337779219758
Iteration: 350	Log-likelihood: -9.15157365945748
Iteration: 400	Log-likelihood: -9.151642561490215
Iteration: 450	Log-likelihood: -9.15105437756308
Iteration: 500	Log-likelihood: -9.150963935020132
Iteration: 550	Log-likelihood: -9.151049332226332
Iteration: 600	Log-likelihood: -9.150794449728673
Iteration: 650	Log-likelihood: -9.150846074384429
Iteration: 700	Log-likelihood: -9.150788928092108
Iteration: 750	Log-likelihood: -9.150897518997581
Iteration: 800	Log-likelihood: -9.15086211854652
Iteration: 850	Log-likelihood: -9.150927204039723
Iteration: 900	Log-likelihood: -9.150922063091924
Iteration: 950	Log-likelihood: -9.151005327765473
Iteration: 1000	Log-likelihood: -9.150957556647533


In [21]:
# Print top 25 words of each topic
print("Top 25 words by topic")
for k in range(n_topics):
    print('#{}: {}'.format(k, ' '.join([w for (w, prop) in mdl.get_topic_words(k, top_n=10)])))

Top 25 words by topic
#0: tax would percent economic money financial government health pay could
#1: trump president said obama republican would house campaign white republicans
#2: new one like first said also people says two time
#3: first game two last team points said second one coach
#4: said new water energy climate could also one oil people
#5: said police told two according one say people man also
#6: united military government president war would said states us international
#7: people one like would even think us many get know
#8: state said new would school public law court federal also
#9: company said percent stock new continue million last sales market


In [22]:
# Survey stats. Notably, I'm using all the data as training data - you can also
# generate metrics using the mdl.infer() method if you want to use held-out
# data to see if the topic model generalizes well.
# See https://bab2min.github.io/tomotopy/v0.4.1/en/#tomotopy.LDAModel.infer
mdl.summary()

<Basic Info>
| LDAModel (current version: 0.12.7)
| 746557 docs, 187266574 words
| Total Vocabs: 477650, Used Vocabs: 477650
| Entropy of words: 8.86260
| Entropy of term-weighted words: 8.86260
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 1000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -9.15096
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 10 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 2643793627 (random seed)
| trained in version 0.12.7
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  