# Topic Modeling of Historical Organic Periodicals

In [1]:
from __future__ import print_function
import glob
import warnings
# Silence every warning for the rest of the session (presumably to keep the
# notebook output uncluttered). NOTE(review): this also hides deprecation
# warnings raised by the libraries imported below.
warnings.filterwarnings("ignore")

In [2]:
import pyLDAvis
import pyLDAvis.sklearn
# Turn on pyLDAvis notebook integration so that a prepared visualization
# returned as the last expression of a cell is displayed inline.
pyLDAvis.enable_notebook()

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

## Parameters

In [4]:
# Upper bound on the vocabulary size kept by the vectorizers; the effective
# number of features is smaller when the corpus vocabulary is smaller.
n_features = 2000

# Number of topics every model in this notebook is asked to extract.
n_components = 20

## Load periodicals dataset

First, the periodicals dataset is loaded.

In [5]:
from perysis import load_documents

# Location of the pre-processed periodicals, relative to this notebook.
input_dir = "../documents_processed"

# Load every document, then keep only each document's `data` attribute
# (presumably its text), which is what the vectorizers below are fitted on.
documents = load_documents.load_documents(input_dir)
data = [doc.data for doc in documents]

## Convert to document-term matrix

Next, the raw documents are converted into a document-term matrix, either as raw term counts or in TF-IDF form.

In [6]:
# Term-frequency (raw count) document-term matrix.
# Reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    max_df=0.95,  # ignore terms found in more than 95% of the documents
    min_df=2,     # ignore terms found in fewer than 2 documents
)
dtm_tf = tf_vectorizer.fit_transform(data)

# Shape is (number of documents, number of vocabulary terms).
print(dtm_tf.shape)

(251, 2000)


In [None]:
# TF-IDF document-term matrix, built with exactly the same settings as the
# count vectorizer so that both matrices share the same configuration.
# Reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
shared_params = tf_vectorizer.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_params)
dtm_tfidf = tfidf_vectorizer.fit_transform(data)

# Shape is (number of documents, number of vocabulary terms).
print(dtm_tfidf.shape)

## Non-negative Matrix Factorization (NMF) model with Frobenius norm

In [None]:
# NMF on the TF-IDF matrix using the estimator's default loss (Frobenius norm).
nmf_frobenius = NMF(
    n_components=n_components,
    random_state=1,
    alpha=.1,
    l1_ratio=.3,
)

# W holds the per-document topic weights returned by the fit.
W = nmf_frobenius.fit_transform(dtm_tfidf)

# Print the indices of topics whose term weights sum to (almost) zero,
# i.e. topics that ended up effectively empty.
print([
    topic_idx
    for topic_idx, term_weights in enumerate(nmf_frobenius.components_)
    if term_weights.sum() < 1e-5
])

# Last expression of the cell: the interactive topic visualization.
pyLDAvis.sklearn.prepare(nmf_frobenius, dtm_tfidf, tfidf_vectorizer)

## NMF model with Kullback-Leibler (KL) divergence

In [None]:
# NMF on the TF-IDF matrix, minimizing the generalized Kullback-Leibler
# divergence. In scikit-learn this loss is only available with the
# multiplicative-update ('mu') solver.
nmf_kl = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=100,
    alpha=.1,
    l1_ratio=.3,
)

# W holds the per-document topic weights returned by the fit.
W = nmf_kl.fit_transform(dtm_tfidf)

# Print the indices of topics whose term weights sum to (almost) zero.
# This must inspect the KL model fitted in this cell; the check previously
# read nmf_frobenius.components_ and so reported on the wrong model.
print([idx for idx in range(n_components) if nmf_kl.components_[idx].sum() < 1e-5])

# Last expression of the cell: the interactive topic visualization.
pyLDAvis.sklearn.prepare(nmf_kl, dtm_tfidf, tfidf_vectorizer)

## Latent Dirichlet Allocation

In [None]:
# LDA fitted on the raw-count (TF) document-term matrix.
# Reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
lda_tf = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=100,
    random_state=1,
    learning_method="batch",
)

# W holds the per-document topic distribution returned by the fit.
W = lda_tf.fit_transform(dtm_tf)

# Last expression of the cell: the interactive topic visualization.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [None]:
# for TFIDF DTM (disabled)
# lda_tfidf = LatentDirichletAllocation(n_components=n_components, random_state=0, learning_method="batch")
# lda_tfidf.fit(dtm_tfidf)
# pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

### Using different Multi-Dimensional Scaling (MDS) functions

With `sklearn` installed, other MDS functions, such as MMDS and TSNE, can be used for plotting if the default Principal Coordinate Analysis (PCoA) is not satisfactory.

In [None]:
# Same LDA model, with topics laid out by metric MDS instead of the default.
pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
    mds='mmds',
)

In [None]:
# Same LDA model, with topics laid out by t-SNE instead of the default.
pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
    mds='tsne',
)