# Topic Modeling of Historical Organic Periodicals

In [1]:
from __future__ import print_function
import glob
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

## Parameters

In [4]:
# Upper bound on the vocabulary size kept by the vectorizers
# (the effective size is also bounded by the corpus vocabulary).
n_features = 2000

# Number of topics extracted by each NMF / LDA model.
n_components = 20

## Load periodicals dataset

First, the periodicals dataset is loaded.

In [5]:
# Read every pre-processed periodical into memory, one string per file.
#
# The paths are sorted because glob.glob() returns matches in an order that
# depends on the file system. A fixed order makes the rows of `docs_raw`
# (and of every document-term matrix built from it) line up with the same
# files on every machine and on every run.
input_dir = "../documents_processed"
paths = sorted(glob.glob("%s/*.txt" % input_dir))

docs_raw = []
for document_filename in paths:
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used when decoding -- confirm the encoding of the corpus files.
    with open(document_filename) as document_file:
        docs_raw.append(document_file.read())

# Number of documents loaded.
print(len(docs_raw))

251


## Convert to document-term matrix

Next, the raw documents are converted into a document-term matrix (DTM), both as raw term counts and in TF-IDF-weighted form.

In [6]:
# Raw term-count document-term matrix. Reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
tf_vectorizer = CountVectorizer(
    stop_words='english',     # drop scikit-learn's built-in English stop words
    max_features=n_features,  # cap the vocabulary size
    max_df=0.95,              # skip terms found in more than 95% of documents
    min_df=2,                 # skip terms found in fewer than 2 documents
)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)

# (number of documents, number of terms)
print(dtm_tf.shape)

(251, 2000)


In [7]:
# TF-IDF-weighted document-term matrix. Reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
#
# The count vectorizer's parameters are reused so that both vectorizers are
# configured identically (stop words, vocabulary cap, document-frequency cuts).
shared_vectorizer_params = tf_vectorizer.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_vectorizer_params)
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)

# (number of documents, number of terms)
print(dtm_tfidf.shape)

(251, 2000)


## Non-negative Matrix Factorization (NMF) model with Frobenius norm

In [8]:
# Fit NMF on the TF-IDF matrix; no beta_loss is passed, so the estimator's
# default loss is used. alpha / l1_ratio configure the regularisation.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf_frobenius = NMF(
    n_components=n_components,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
).fit(dtm_tfidf)

# NOTE(review): the model is fit on dtm_tfidf but visualised against the
# raw-count matrix dtm_tf and its vectorizer -- confirm this is intended.
pyLDAvis.sklearn.prepare(nmf_frobenius, dtm_tf, tf_vectorizer)

## NMF model with Kullback-Leibler (KL) divergence

In [9]:
# Fit NMF on the TF-IDF matrix with the Kullback-Leibler divergence as the
# loss, using the multiplicative-update ('mu') solver and a raised
# iteration limit.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf_kl = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
).fit(dtm_tfidf)

# NOTE(review): the model is fit on dtm_tfidf but visualised against the
# raw-count matrix dtm_tf and its vectorizer -- confirm this is intended.
pyLDAvis.sklearn.prepare(nmf_kl, dtm_tf, tf_vectorizer)

## Latent Dirichlet Allocation

In [10]:
# LDA fitted on the raw term-count DTM (Document-Term Matrix). Reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
lda_tf = LatentDirichletAllocation(
    n_components=n_components,
    random_state=0,
    learning_method="batch",
).fit(dtm_tf)

# Interactive topic visualisation built from the same count matrix and
# vectorizer the model was fitted with.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [11]:
# Variant: LDA fitted on the TF-IDF DTM instead of the raw-count DTM (disabled)
# lda_tfidf = LatentDirichletAllocation(n_components=n_components, random_state=0, learning_method="batch")
# lda_tfidf.fit(dtm_tfidf)
# pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

### Using different Multi-Dimensional Scaling (MDS) functions

With `sklearn` installed, other MDS functions, such as MMDS and t-SNE, can be used for plotting if the default Principal Coordinate Analysis (PCoA) layout is not satisfactory.

In [12]:
# Re-render the visualisation of the fitted LDA model, this time passing
# mds='mmds' to select the MMDS layout instead of the default.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')

In [13]:
# Re-render the visualisation of the fitted LDA model, this time passing
# mds='tsne' to select the t-SNE layout instead of the default.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')