In [9]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import *
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [27]:
filename_list = os.listdir()[0:300]

In [11]:
# Just making sure these file names are correct:
filename_list[0:5]

# Looks good!

['0301116', '0304232', '0303017', '0303225', '0302131']

Now we can read in all the files from the `filename_list`.

In [28]:
corpus = []

for i in range(len(filename_list)):
    
    filename = filename_list[i]
    
    with open(filename, 'r', errors='ignore') as file:
            file_contents = file.read()
    
    # Add document to corpus
    corpus.append(file_contents)

In [30]:
len(corpus)

300

We have exactly 300 documents to work with. Now we can turn our corpus into a matrix of TF-IDF features using `sklearn`'s `TfidfVectorizer()`.

In [31]:
vectorizer = TfidfVectorizer(decode_error='ignore', stop_words='english', max_df=0.9, max_features = 10000)

X = vectorizer.fit_transform(corpus)
X

<300x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 286922 stored elements in Compressed Sparse Row format>

Let's take a look at the vocabulary that was learned by the vectorizer.

In [34]:
vectorizer.vocabulary_

{'latex': 5619,
 'file': 4047,
 'paper': 6902,
 'documentstyle': 3345,
 'preprint': 7259,
 'eqsecnum': 3722,
 'aps': 1542,
 'epsfig': 3685,
 'revtex': 7926,
 '12pt': 218,
 'article': 1578,
 'twocolumn': 9322,
 'def': 3033,
 'tt': 9295,
 'backslash': 1723,
 'bibtex': 1896,
 'rm': 7980,
 'sc': 8147,
 'ib': 4880,
 'tex': 9053,
 'newcommand': 6431,
 'cs': 2836,
 'ch': 2270,
 'beq': 1834,
 'eeq': 3521,
 'displaymath': 3294,
 'beqa': 1835,
 'eqnarray': 3714,
 'eeqa': 3522,
 'partial': 6937,
 'stackrel': 8615,
 'leftrightarrow': 5670,
 'lra': 5856,
 'ms': 6253,
 'm_': 5883,
 'renewcommand': 7813,
 'thefootnote': 9079,
 'fnsymbol': 4107,
 'footnote': 4130,
 'setcounter': 8272,
 'alph': 1379,
 'theequation': 9078,
 'arabic': 1544,
 'nn': 6479,
 'nonumber': 6519,
 'hbox': 4659,
 '0pt': 165,
 'hss': 4807,
 'original': 6780,
 'se': 8204,
 'sa': 8093,
 'newcounter': 6433,
 'saveeqn': 8134,
 'value': 9501,
 'stepcounter': 8662,
 'mbox': 6053,
 'draft': 3395,
 'flushright': 4101,
 '02': 48,
 '03': 11

As we can see, a -lot- of the vocabulary learned by the vectorizer has to do with LaTeX formatting for these papers. 

In [35]:
# Initialize NMF
nmf_model = NMF(n_components = 50, solver = 'mu')

# Create variable to make it easy to retrieve topics
idx_to_word = np.array(vectorizer.get_feature_names())

In [36]:
nmf_model.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='mu',
  tol=0.0001, verbose=0)

In [37]:
nmf_components = nmf_model.components_

In [45]:
for i, topic in enumerate(nmf_components):
    print("Topic {}: {}".format(i + 1, ", ".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: label, a_, alpha, rho, eqnarray, gauge, partial_, frac, nu, mu
Topic 2: gauge, phi, nolimits, cr, string, hbox, hep, rm, lref, def
Topic 3: columncolor, sigma, pm, mid, cr, delta, partial_, wznw, chi, bar
Topic 4: quad, pm, tau, _1, ast, hbar, label, array, frac, psi
Topic 5: quad, phi_, sitter, ref, gamma, ee, fuzzy, 2k, tilde, hat
Topic 6: temperature, xi, beta, noncommutative, eff, bm, ref, infty, pi, frac
Topic 7: gamma_1, gamma, eqn, psi, gamma_, cr, cdot, pm, underline, vec
Topic 8: spacetime, ref, frac, metric, hole, ds, r_, black, ell, horizon
Topic 9: radim, wedge, bibitem, star, theta, cite, label, ref, ee, eq
Topic 10: alpha, epsilon, pi, cal, arxiv, eqalign, superpotential, cr, lref, eqn
Topic 11: action, supergravity, d3, sqrt, label, gamma, bulk, frac, branes, brane
Topic 12: radiation, cite, born, infeld, dot, bibitem, universe, density, rho_, inflation
Topic 13: textrm, lambda, waldmann, frac, textbf, star, mathbb, emph, newblock, mathcal
Topic 14: rangle, spin

Sadly, these topics are not very coherent. We can spot a few that do seem to form a cohesive group, such as topic 14, which corresponds to figure-and-display-related LaTeX terms, and 23, which is similar. Quite a few individual topics seem to correspond to symbols and Greek letters. One question I have following this is why some of these would not simply be in the same category. It's possible that 50 components is too many, considering we've only read in 25 papers, and that we need to collapse the 50 NMF components into fewer (maybe, for example, 10). 

Let's try that next.

In [39]:
nmf_model_10 = NMF(n_components = 10, solver = 'mu')
nmf_model_10.fit(X)
nmf_components_10 = nmf_model_10.components_

In [44]:
for i, topic in enumerate(nmf_components_10):
    print("Topic {}: {}".format(i + 1, ", ".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: alpha, hep, phi, bibitem, brane, ref, label, mu, cite, frac
Topic 2: arxiv, hbox, gauge, branes, rm, citation, lref, string, hep, def
Topic 3: f_, tilde, ab, psi, a_, partial_, bar, alpha, nu, mu
Topic 4: tau, array, sigma, gamma, tilde, pm, frac, theta, psi, bar
Topic 5: ref, quad, gamma, sphere, 2k, ee, fuzzy, tilde, big, hat
Topic 6: p_, varphi, noncommutative, infty, ref, beta, prime, lambda, pi, frac
Topic 7: a_, big, cdot, gamma_, cr, eqn, pm, underline, psi, vec
Topic 8: qc, area, entropy, ell, r_, holes, ee, horizon, hole, black
Topic 9: bibitem, eq, newcommand, label, cite, rm, ee, ref, mathcal, cal
Topic 10: wedge, superpotential, cr, cal, lambda, big, lref, beta, alpha, eqn


Hmm. Reducing the number of components (and experimenting with changing the solver) did not seem to help. Now our components are even less informative, as they are now overtaken completely by the LaTeX specifications.

Let's turn to LDA and see if it can do any better here.

In [41]:
lda_model = LatentDirichletAllocation(max_iter = 5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

lda_model.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [43]:
for i, topic in enumerate(lda_model.components_):
    print("Topic {}: {}".format(i + 1, ", ".join([str(x) for x in idx_to_word[topic.argsort()[-20:]]])))

Topic 1: martelli, gather, varepsilon, fluids, frac, phi, financial, lambda, bachas, polarizations, vardi, mu, citation, took, need, bibitem, massive, balasubramanian, rev, eq
Topic 2: footnotesize, suffers, mathcal, 562, 0211198, kl, psi_l, collaboration, 64, 353, 20pt, 570, label, m_d, chebyshev, zf, frac, wetterich, d2, mu
Topic 3: nu, bcirc_, fluxes, irp, 0i, subject, trap, cite, brane, cal, succeed, l_, ref, mu, def, rm, label, bar, theta, frac
Topic 4: mu, observe, cascade, tri, relationships, statistics, lambda_m, favor, oint_, n_c, antighost, 387, electronic, partial, enjoys, bdd, quantisation, bootstrap, alice, evolving
Topic 5: xm, schemes, doteqdot, worldsheet, 9711200, 633, distinction, u_r, 9905221, dissipation, symmetrized, 543, louis, eqno, d19, limited, metr, serve, sign, matrix
Topic 6: uni, bea, i_0, holography, tb, forms, 2001af, 8cm, renormalizability, n_g, lattice, gh, 9711200, textit, tqft, toroidal, nonumber, frac, mu, prd
Topic 7: ee, psi, string, pi, lambda, rm