In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import *
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [4]:
#filename_list = os.listdir()[0:49]

# Reduce to 25 files because 50 was too memory-intensive
filename_list = os.listdir()[0:24]

In [5]:
# Just making sure these file names are correct:
filename_list[0:5]

# Looks good!

['0301116', '0304232', '0303017', '0303225', '0302131']

Now we can read in all 50 files from the `filename_list`.

In [6]:
corpus = []

for i in range(len(filename_list)):
    
    filename = filename_list[i]
    
    with open(filename, 'rb') as file:
        for line in file:
            line = line.strip()
            corpus.append(line)

In [7]:
len(corpus)

33780

Is this a reasonable number of lines in our corpus? As a sanity check of whether this length makes sense, we can repeat the reading-in-and-appending process with just one file for comparison:

In [39]:
corpus = []

with open('0301180', 'rb') as file:
        for line in file:
            line = line.strip()
            corpus.append(line)

(We use `'rb'` here because opening the file with the default mode `'r'` fails due to a `UnicodeDecodeError`. Note also that it is not necessary to call `file.close()` since using `with` closes the file automatically.)

In [34]:
len(corpus)

1710

Great! This is clearly smaller than reading in the entire `filename_list`, so the process worked correctly. We'll go ahead and redo reading in the `filename_list` below.

In [40]:
corpus = []

for i in range(len(filename_list)):
    
    filename = filename_list[i]
    
    with open(filename, 'rb') as file:
        for line in file:
            line = line.strip()
            corpus.append(line)

Now we can turn our corpus into a matrix of TF-IDF features using `sklearn`'s `TfidfVectorizer()`.

In [56]:
vectorizer = TfidfVectorizer(decode_error='ignore', stop_words='english', max_df=0.9, max_features = 10000)

X = vectorizer.fit_transform(corpus)
X

<33780x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 124867 stored elements in Compressed Sparse Row format>

Let's take a look at the vocabulary that was learned by the vectorizer.

In [37]:
vectorizer.vocabulary_

{'latex': 6010,
 'file': 4576,
 'paper': 6986,
 'documentstyle': 3853,
 'preprint': 7269,
 'eqsecnum': 4246,
 'aps': 2282,
 'epsfig': 4197,
 'revtex': 7920,
 '12pt': 527,
 'article': 2314,
 'tightenlines': 9155,
 'twocolumn': 9344,
 'def': 3531,
 'tt': 9314,
 'backslash': 2532,
 'bibtex': 2687,
 'rm': 7977,
 'sc': 8154,
 'ib': 5304,
 'tex': 9079,
 'newcommand': 6703,
 'cs': 3360,
 'ch': 2949,
 'beq': 2624,
 'begin': 2597,
 'equation': 4253,
 'eeq': 4035,
 'end': 4132,
 'bdm': 2590,
 'displaymath': 3777,
 'edm': 4029,
 'beqa': 2625,
 'eqnarray': 4241,
 'eeqa': 4036,
 'beqab': 2626,
 'eeqab': 4037,
 'partial': 7008,
 'dlr': 3835,
 'stackrel': 8635,
 'leftrightarrow': 6051,
 'lra': 6237,
 'ms': 6559,
 'm_': 6255,
 'symbolfootnote': 8936,
 'renewcommand': 7814,
 'thefootnote': 9107,
 'fnsymbol': 4636,
 'footnote': 4652,
 'alphfootnote': 2131,
 'setcounter': 8297,
 'sevenrm': 8308,
 'alph': 2120,
 'theequation': 9105,
 'arabic': 2285,
 'section': 8238,
 'nn': 6723,
 'nonumber': 6745,
 'make

As we can see, a -lot- of the vocabulary learned by the vectorizer has to do with LaTeX formatting for these papers. 

Just to show what the vectorizer is picking up, here's a glance of the first 25 lines of the first document:

In [49]:
corpus[1:25]

[b'',
 b'%******************LATEX FILE OF THE PAPER***********************',
 b'%',
 b'%',
 b'',
 b'%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%',
 b'\\documentstyle[preprint,eqsecnum,aps,epsfig]{revtex}',
 b'%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%',
 b'',
 b'%\\documentstyle[12pt,epsfig]{article}',
 b'',
 b'%%%%%%%%%%%%%%%%',
 b'\\tightenlines',
 b'%%%%%%%%%%%%%%%',
 b'',
 b'',
 b'%\\documentstyle[eqsecnum,twocolumn,aps]{revtex}',
 b'\\def\\btt#1{{\\tt$\\backslash$#1}}',
 b'\\def\\BibTeX{\\rm B{\\sc ib}\\TeX}',
 b'',
 b"\\newcommand{\\cs}{\\'{c}}",
 b'\\newcommand{\\ch}{\\v{c}}',
 b'\\newcommand{\\s}{\\v{s}}',
 b'']

In [61]:
# Initialize NMF
nmf_model = NMF(n_components = 50, init = 'random')

# Create variable to make it easy to retrieve topics
idx_to_word = np.array(vectorizer.get_feature_names())

In [62]:
nmf_model.fit(X)

NMF(alpha=0.0, beta_loss='frobenius', init='random', l1_ratio=0.0,
  max_iter=200, n_components=50, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [63]:
nmf_components = nmf_model.components_

In [64]:
for i, topic in enumerate(nmf_components):
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: stresstail,fluctuations,constitutive,tij,rho,relation,stress,energy,density,eq
Topic 2: sin,qquad,infty,enthalpy,quad,ldots,sim,12,sqrt,frac
Topic 3: mp,ll,cccc,clcr,eqno,ccl,rcl,quad,cc,array
Topic 4: end,state,differential,motion,renewcommand,dirac,theequation,arabic,setcounter,equation
Topic 5: kl,mathbb,ket,mbox,volume,half,rm,rangle,langle,newcommand
Topic 6: tau_,phi_,oint,oscillations,qr,stackrel,ta,approx,quad,phi
Topic 7: sbody12,rbrace,vec,lbrace,q_i,ab,half,exp,langle,left
Topic 8: limit,matter,inflaton,gauge,spinor,theories,equations,strength,scalar,field
Topic 9: s_,eq3,anfx,rw2,ea,ba,beqa,eeqa,newcommand,eqnarray
Topic 10: open,super,effective,supersymmetric,yang,mills,superstring,gauge,string,theory
Topic 11: iota,holomorphic,int_,bundle,supersymmetric,infty,qh,tr,cp,cal
Topic 12: f_r,mu,forall,d_0,limr,int,eqq,psio,qr,ee
Topic 13: currentone,branetransl,1stal,dw,dbi,ncsolution,id,solution,eqn,bea
Topic 14: 10,fig,5cm,width,epsfig,case,eps,tb,footnote,figure
Top

Sadly, these topics are not very coherent. We can spot a few that do seem to form a cohesive group, such as topic 14, which corresponds to figure-and-display-related LaTeX terms, and 23, which is similar. Quite a few individual topics seem to correspond to symbols and Greek letters. One question I have following this is why some of these would not simply be in the same category. It's possible that 50 components is too many, considering we've only read in 25 papers, and that we need to collapse the 50 NMF components into fewer (maybe, for example, 10). 

Let's try that next.

In [65]:
nmf_model_10 = NMF(n_components = 10, solver = 'mu')
nmf_model_10.fit(X)
nmf_components_10 = nmf_model_10.components_

In [66]:
for i, topic in enumerate(nmf_components_10):
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: displaymath,thebibliography,document,center,setcounter,figure,array,split,equation,end
Topic 2: align,abstract,displaymath,document,center,eq,split,label,equation,begin
Topic 3: sigma,tilde,ab,gamma,nonumber,nu,array,mu,left,right
Topic 4: boostedmetric,sourceaction,minipage,eqn,thebibliography,figure,array,begin,end,align
Topic 5: bf,href,abs,http,org,tt,arxiv,citation,hep,th
Topic 6: gamma,sigma,cal,nu,p_,mu,bar,dot,beta,alpha
Topic 7: partial,delta,mbox,gamma,lambda,omega,rm,bf,cal,def
Topic 8: operator,a_,forall,int,limr,eqq,psio,qr,psi,ee
Topic 9: pm,tilde,partial,eta,phi,bar,sqrt,psi,pi,frac
Topic 10: center,thebibliography,bea,figure,newcommand,eea,array,begin,end,eqnarray


Hmm. Reducing the number of components (and experimenting with changing the solver) did not seem to help. Now our components are even less informative, as they are now overtaken completely by the LaTeX specifications.

Let's turn to LDA and see if it can do any better here.

In [59]:
lda_model = LatentDirichletAllocation(max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

lda_model.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [60]:
for i, topic in enumerate(lda_model.components_):
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in idx_to_word[topic.argsort()[-20:]]])))

Topic 1: ref,following,section,spinors,possible,fluctuations,relations,does,limit,parameters,relation,topological,inflation,displaymath,condition,form,model,cite,boundary,bibitem
Topic 2: orbifold,formula,study,introduction,nc,vskip,gamma_i,s_2,usepackage,symmetric,cq,24,5mm,vspace,1mm,hspace,l_3,group,bigskip,section
Topic 3: delta,infty,theta,tau,sigma,gamma,nonumber,partial,chi,nu,psi,bar,cal,pi,tilde,phi,mu,left,right,frac
Topic 4: invariant,let,wave,symmetry,scalar,background,potential,space,equations,supersymmetric,times,gauge,eq,string,time,branes,field,ref,theory,label
Topic 5: tr,modes,psio,parameter,dx,terms,spectrum,small,deformation,su,corresponding,matter,ap,half,wedge,order,defined,density,charge,zero
Topic 6: integral,mode,fig,spacetime,ea,scale,d_0,bea,dirac,algebra,eea,figure,vec,ee,array,align,eqnarray,begin,end,equation
Topic 7: given,term,langle,consider,dimensions,theory,epsilon,omega,states,ket,state,energy,brane,non,overline,fields,dot,beta,def,alpha
Topic 8: mpc

Interesting! LDA gives us at least a few topics that are actually about physics, such as 4, 5, 7, and 8. The rest are still mostly LaTeX.