# Topic Modeling of Historical Organic Periodicals

In [1]:
from __future__ import print_function
import pickle
import subprocess
import warnings
# Ignore every warning raised from here on so that the cell outputs below
# are not cluttered with warning messages.
warnings.filterwarnings("ignore")

In [2]:
import pyLDAvis
import pyLDAvis.sklearn
# Switch pyLDAvis into notebook mode. Presumably this is what lets the
# pyLDAvis.sklearn.prepare(...) results in later cells render inline --
# confirm against the pyLDAvis documentation.
pyLDAvis.enable_notebook()

## Learn topic models

In [3]:
# Run the perysis topic-extraction module as a child process.
# The command is given as an argument list and executed without a shell, so
# no shell quoting or word-splitting rules apply to the paths and values.
# check_call raises subprocess.CalledProcessError on a non-zero exit status
# and returns 0 otherwise.
# NOTE(review): "python" is resolved through PATH and may not be the same
# interpreter as the notebook kernel -- confirm perysis is installed there.
cmd = [
    "python", "-m", "perysis.extract_topics",
    "-input_dir", "../documents_processed",
    "-output_dir", "../models",
    "-num_features", "2000",
    "-num_topics", "20",
]
subprocess.check_call(cmd)

0

## Load document-term matrices (DTM)

In [4]:
# Directory holding the pickled document-term matrices and models.
input_dir = "../models"

# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only load files from this directory if they were produced by a trusted run.

# Each file stores two consecutive pickles: the vectorizer first, then the
# matrix built with it. The `with` blocks close the files once both objects
# have been read (the handles were previously left open).

# TF
with open("%s/dtm_tf.pkl" % input_dir, "rb") as dtm_file:
    tf_vectorizer = pickle.load(dtm_file)
    dtm_tf = pickle.load(dtm_file)

# TF-IDF
with open("%s/dtm_tfidf.pkl" % input_dir, "rb") as dtm_file:
    tfidf_vectorizer = pickle.load(dtm_file)
    dtm_tfidf = pickle.load(dtm_file)

### Visualization of top words for each topic

In [5]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    One line is printed per row of ``model.components_``, labelled with a
    1-based topic number, and a single blank line is printed at the end.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a weight's position to its word.
        n_top_words: Number of words to list per topic, heaviest first.
    """
    for number, weights in enumerate(model.components_, start=1):
        # Ascending sort, flipped to descending, then cut to the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[pos] for pos in ranked]
        print("Topic #%d: " % number + " ".join(words))
    print()

## Load learned topic models

### Non-negative Matrix Factorization (NMF) model with Frobenius norm

In [6]:
# The file stores two consecutive pickles; the second one is the fitted model.
# The `with` block closes the file after both have been read.
# SECURITY NOTE: pickle.load can execute arbitrary code -- trusted files only.
with open("%s/nmf_frobenius.pkl" % input_dir, "rb") as model_file:
    W = pickle.load(model_file)  # presumably the document-topic matrix; unused below
    model = pickle.load(model_file)

# Indices of topics whose component weights sum to (almost) zero, i.e. topics
# with effectively no words; an empty list means there are none.
print([idx for idx, component in enumerate(model.components_)
       if component.sum() < 1e-5])

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); kept as-is to match the environment this
# notebook was written for.
print_top_words(model, tfidf_vectorizer.get_feature_names(), 30)

# Must stay the last expression so the notebook displays the visualization.
pyLDAvis.sklearn.prepare(model, dtm_tfidf, tfidf_vectorizer)

[]
Topic #0: gm organic soybean say seed corn soy crop said company grain genetically source ingredient feed eu bean email oil consumer wheat testing variety labeling industry european report rice non natural
Topic #1: milk dairy price class milkweed cheese nonfat usda cwt month butter federal dry producer cheddar pound cost cow order january coop supply commodity sale powder nation iii industry higher report
Topic #2: organic crop soil farming agriculture plant research standard broadcaster use study sustainable water pesticide said producer page natural genetically program genetic news national brief livestock garden available conventional manure spring
Topic #3: crop gene engineered plant virus epa resistance genetically insect bacteria scientist herbicide risk cotton weed control pest agriculture toxin field biotechnology vaccine usda squash company environmental organism genetic wild agency
Topic #4: seed variety plant patent hybrid broadcaster breeder seedling company breeding pl

### NMF model with Kullback-Leibler (KL) divergence

In [7]:
# The file stores two consecutive pickles; the second one is the fitted model.
# The `with` block closes the file after both have been read.
# SECURITY NOTE: pickle.load can execute arbitrary code -- trusted files only.
with open("%s/nmf_kl.pkl" % input_dir, "rb") as model_file:
    W = pickle.load(model_file)  # presumably the document-topic matrix; unused below
    model = pickle.load(model_file)

# Indices of topics whose component weights sum to (almost) zero, i.e. topics
# with effectively no words; an empty list means there are none.
print([idx for idx, component in enumerate(model.components_)
       if component.sum() < 1e-5])

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); kept as-is to match the environment this
# notebook was written for.
print_top_words(model, tfidf_vectorizer.get_feature_names(), 30)

# Must stay the last expression so the notebook displays the visualization.
pyLDAvis.sklearn.prepare(model, dtm_tfidf, tfidf_vectorizer)

[]
Topic #0: able address addition absence advantage milling area bean minnesota best natural better national import believe lion ingredient marketing missouri major nearly nature american institute nation planted based adopt agency introduced
Topic #1: milk way milkweed cheese week western cheddar weather exchange weekly change example summer washington marketer virtually processor supply madison nonfat meeting set sup na month foreign major volume demand synthetic
Topic #2: organic volume broadcaster brief farming environmental plant genetically soil used change act number research spring business engineered resource sure page control pesticide newsletter allowed crop pest agricultural animal early insect
Topic #3: academy director newsletter management page organism office approved earlier organization nation paper biotechnology president concern modified executive force better reach calling document blood avoid called agriculture stated european end released
Topic #4: like net feed

### Latent Dirichlet Allocation (LDA) model

In [8]:
# The file stores two consecutive pickles; the second one is the fitted model.
# The `with` block closes the file after both have been read.
# SECURITY NOTE: pickle.load can execute arbitrary code -- trusted files only.
with open("%s/lda_tf.pkl" % input_dir, "rb") as model_file:
    W = pickle.load(model_file)  # presumably the document-topic matrix; unused below
    model = pickle.load(model_file)

# This model is paired with the raw term-frequency vectorizer and matrix
# rather than the TF-IDF ones.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); kept as-is to match the environment this
# notebook was written for.
print_top_words(model, tf_vectorizer.get_feature_names(), 30)

# Must stay the last expression so the notebook displays the visualization.
pyLDAvis.sklearn.prepare(model, dtm_tf, tf_vectorizer)

Topic #0: fda milk cow human hormone dairy growth animal health research safety test data level drug cancer bovine study treated consumer claim science public use blood article labeling group increased processor
Topic #1: milk dairy price usda class milkweed cow producer cost federal dry cheese alfalfa pound heifer nation month order supply nonfat global january cheddar sale cwt dean million iii protein import
Topic #2: percent syrup cost gm soy wheat say premium debate higher corn soybean austria safety protein government ip house according want sugar european price sweetener quality process brazil high crop country
Topic #3: ge crop genetically engineered labeling biotech california organic page said soil regulation contamination wheat country report modified variety world grow export require ballot mexico import nation eu government usda press
Topic #4: creamery dairy butter testing consumer feed grain april test industry equipment meeting corn water month yogurt current grass recen