In [1]:
%matplotlib inline


# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


This is an example of applying `sklearn.decomposition.NMF` and
`sklearn.decomposition.LatentDirichletAllocation` to a corpus
of documents in order to extract additive models of the topic structure of the
corpus. The output is a list of topics, each represented as a list of
terms (weights are not shown).

Non-negative Matrix Factorization is applied with two different objective
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.

The default parameters (n_samples / n_features / n_components) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is polynomial in NMF. In LDA, the time complexity is
proportional to (n_samples * iterations).




In [1]:
from os import listdir
from os.path import isfile, join
from os import environ
import os
import logging
import _pickle as pkl

from nltk.tokenize import TreebankWordTokenizer, word_tokenize
import re
from metadata import metadata
from time import time
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import langid
import numpy as np

from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
import gensim

import datetime

# Randomness: seed NumPy's global generator and also keep a dedicated,
# independently seeded RandomState object around.
np.random.seed(18101995)
rng = np.random.RandomState(10102016)

# Logging: the root logger gets exactly one stream handler and emits
# records at INFO level and above.
logger = logging.getLogger()
logger.handlers = [logging.StreamHandler()]
logger.setLevel(logging.INFO)

def dehyphenate(s):
    """Join words split by a hyphen at a line break and lower-case the text.

    Every occurrence of a hyphen immediately followed by a newline is
    removed; all other hyphens and newlines are left untouched.
    """
    pieces = s.split('-\n')
    rejoined = ''.join(pieces)
    return rejoined.lower()

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    model          -- object whose ``components_`` yields one weight vector
                      per topic (each vector must support ``argsort()``).
    feature_names  -- sequence mapping a column index to its term.
    n_top_words    -- how many terms to list per topic, heaviest first.

    One line is printed per topic, followed by a single empty line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[term_idx] for term_idx in ranked)
        print("Topic #%d: " % topic_idx + top_terms)
    print()

def psave(ob, filename, timestamp = False, directory = "../models/"):
    """Pickle ``ob`` to ``<directory>/<filename>[<timestamp>]``.

    ob         -- any picklable object.
    filename   -- base name of the output file (no directory part).
    timestamp  -- when true, the current local time is appended to the file
                  name as ``YYYY-MM-DD HH_MM_SS``.
    directory  -- target directory; created if it does not exist yet.
                  Defaults to the ``../models/`` folder used by this notebook.

    Returns None.
    """
    timenow = ""
    if timestamp:
        # Underscores instead of colons: ':' is not a legal file-name
        # character on every platform.
        timenow = '{:%Y-%m-%d %H_%M_%S}'.format(datetime.datetime.now())
    # A fresh checkout may not have the output directory yet.
    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, filename + timenow)
    with open(target, "wb") as f:
        pkl.dump(ob, f)

INFO:summa.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


In [2]:
# Load four sample papers from the AAN corpus. Words that were hyphenated
# across a line break are glued back together; case is left unchanged.
def _read_dehyphenated(paper_file):
    """Return the text of ``$AAN_DIR/papers_text/<paper_file>`` with every
    hyphen-plus-newline sequence removed."""
    paper_path = os.path.join(os.environ["AAN_DIR"], "papers_text/{0}".format(paper_file))
    with open(paper_path) as handle:
        return handle.read().replace('-\n', '')

good_text = _read_dehyphenated("W04-0203.txt")
good_text2 = _read_dehyphenated("W04-0207.txt")
bad_text = _read_dehyphenated("W04-0205.txt")
bad_text2 = _read_dehyphenated("J79-1011.txt")
    

In [3]:
# ACL_metadata is imported from the `metadata` package (presumably
# project-local). The instance created here is read by later cells through
# its `meta_df` and `train_files` attributes.
from metadata.metadata import ACL_metadata
acl = ACL_metadata()

In [4]:
# Show the per-paper metadata table: authors, genders, title, venue and year,
# indexed by paper id.
acl.meta_df

Unnamed: 0_level_0,authors,genders,title,venue,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E03-1001,"[Oard,Douglasw]",[Gender.male],Multilingual Access To Large Spoken Archives (...,EACL,2003
E03-1002,"[Henderson,Jamesb]",[Gender.male],Neural Network Probability Estimation For Broa...,EACL,2003
E03-1003,"[Burstein,Jill, Wolska,Magdalena]","[Gender.female, Gender.female]",Toward Evaluation Of Writing Style: Overly Rep...,EACL,2003
E03-1004,"[Cmejrek,Martin, Curin,Jan, Havelka,Jiri]","[Gender.male, Gender.male, Gender.male]",Czech-English Dependency Tree-Based Machine Tr...,EACL,2003
E03-1005,"[Bod,Rens]",[Gender.male],An Efficient Implementation Of A New DOP Model,EACL,2003
E03-1006,"[Smets,Martine, Gamon,Michael, Corstonoliver,S...","[Gender.female, Gender.male, Gender.male, Gend...",French Amalgam: A Quick Adaptation Of A Senten...,EACL,2003
E03-1007,"[Ueffing,Nicola, Ney,Hermann]","[Gender.female, Gender.male]",Using POS Information For SMT Into Morphologic...,EACL,2003
E03-1008,"[Steedman,Mark, Osborne,Miles, Sarkar,Anoop, C...","[Gender.male, Gender.male, Gender.male, Gender...",Bootstrapping Statistical Parsers From Small D...,EACL,2003
E03-1009,"[Clark,Alexander]",[Gender.male],Combining Distributional And Morphological Inf...,EACL,2003
E03-1010,"[Yasuda,Keiji, Sugaya,Fumiaki, Takezawa,Toshiy...","[Gender.male, Gender.male, Gender.male, Gender...",Automatic Evaluation For A Palpable Measure Of...,EACL,2003


In [16]:
# Number of training files; this is the collection handed to the
# vectorizer's fit_transform in the feature-extraction cell.
len(acl.train_files)

23595

In [4]:
# Experiment sizes.
#   n_features   -- vocabulary cap passed to CountVectorizer(max_features=...)
#                   and appended to the names of the saved artifacts.
#   n_components -- presumably the number of topics; not referenced by the
#                   cells shown here (TODO confirm).
#   n_top_words  -- presumably the number of words listed per topic; not
#                   referenced by the cells shown here (TODO confirm).
n_features, n_components, n_top_words = 100, 10, 20

In [5]:
# --- 1. Term-frequency features -------------------------------------------
# input='filename' makes the vectorizer open and read each entry of
# acl.train_files itself. Tokens are purely alphabetic and at least three
# letters long; `dehyphenate` replaces the default preprocessing step (it
# rejoins words split at line breaks and lower-cases the text). Terms that
# occur in more than 85% of the documents or in fewer than 7 documents are
# dropped, and the vocabulary is capped at n_features terms.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.85,
                                input='filename',
                                min_df=7,
                                max_features=n_features,
                                stop_words='english',
                                token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z][a-zA-Z]+\b",
                                preprocessor=dehyphenate
                                #tokenizer=TreebankWordTokenizer().tokenize
                                )
t0 = time()
tf = tf_vectorizer.fit_transform(acl.train_files)
print("done in %0.3fs." % (time() - t0))
print()

# File-name prefixes are kept exactly as they were (spelling included) so
# that artifacts from earlier runs keep the same naming scheme.
psave(tf_vectorizer,"tf_vectozier" + str(n_features), True)
psave(tf,"tf" + str(n_features),True)

# --- 2. Convert to gensim structures --------------------------------------
# Rows of `tf` are documents, hence documents_columns=False.
corpus = gensim.matutils.Sparse2Corpus(tf, documents_columns=False)

# scikit-learn stores term -> column index; gensim wants column index -> term.
vocabulary_gensim = {idx: term for term, idx in tf_vectorizer.vocabulary_.items()}

# Build the gensim Dictionary from the corpus together with the id -> term
# map, so that its token ids are exactly the column indices of `tf`.
# (`tf_vectorizer.vocabulary` is the constructor argument, which is None
# here, and would yield an empty Dictionary; the fitted mapping lives in
# `vocabulary_`.)
dic = Dictionary.from_corpus(corpus, id2word=vocabulary_gensim)

# --- 3. Persist the gensim artifacts --------------------------------------
psave(vocabulary_gensim,"vocabulary" + str(n_features),True)
psave(corpus,"corpus" + str(n_features),True)
psave(dic, "dic", True)

Extracting tf features for LDA...
done in 174.031s.



### Preprocessing for 20 newsgroups. 
For the 20 newsgroups dataset, we download the articles using
the scikit-learn interface, without removing headers, footers or quotes. We parse the text using
spaCy and convert all characters to lower case. Optionally, we then exclude stopwords using the list
of standard stopwords in Mallet. We then keep the 2000 words which appear in the largest number of
documents.

### Preprocessing for NIPS.
For this dataset, we use the same processing as above, except that we use
a vocabulary size of 10,000 words, and we exclude all tokens which involve any symbols other than
alphabetic characters, and drop all tokens of length less than 3, in order to avoid ambiguous tokens
like section numbers and mathematical symbols.

In [7]:
# Train the gensim LDA topic model on the bag-of-words corpus, then pickle
# it with a timestamped file name.
# NOTE(review): num_topics is hard-coded to 100 although the configuration
# cell defines n_components = 10 — confirm which value is intended.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=100,
    id2word=vocabulary_gensim,
    passes=50,
)
psave(lda_model, "ldamodel-", timestamp=True)

using symmetric alpha at 0.1
using symmetric eta at 0.01
using serial LDA version on this node
running online (single-pass) LDA training, 10 topics, 1 passes over the supplied corpus of 23595 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
PROGRESS: pass 0, at document #2000/23595
merging changes from 2000 documents into a model of 23595 documents
topic #3 (0.100): 0.050*"semantic" + 0.026*"lexical" + 0.023*"tree" + 0.023*"corpus" + 0.022*"word" + 0.020*"words" + 0.020*"model" + 0.020*"parsing" + 0.017*"proceedings" + 0.017*"table"
topic #4 (0.100): 0.037*"data" + 0.029*"semantic" + 0.026*"phrase" + 0.020*"systems" + 0.020*"model" + 0.020*"structure" + 0.018*"table" + 0.018*"evaluation" + 0.017*"task" + 0.017*"text"
topic #0 (0.100): 0.034*"feature" + 0.034*"text" + 0.027*"translation" + 0.024*"sentence" + 0.022*"features" + 0.020*"model" + 0.019*"grammar" + 0.018*"structure" + 0.01

topic #1 (0.100): 0.105*"translation" + 0.046*"english" + 0.040*"word" + 0.036*"model" + 0.036*"source" + 0.034*"machine" + 0.030*"data" + 0.030*"phrase" + 0.029*"target" + 0.027*"words"
topic #0 (0.100): 0.083*"text" + 0.071*"feature" + 0.065*"features" + 0.044*"knowledge" + 0.032*"linguistic" + 0.027*"level" + 0.024*"lexical" + 0.020*"structure" + 0.019*"analysis" + 0.018*"type"
topic #2 (0.100): 0.083*"word" + 0.070*"words" + 0.055*"similarity" + 0.053*"sense" + 0.034*"corpus" + 0.031*"context" + 0.022*"method" + 0.021*"target" + 0.021*"pairs" + 0.020*"lexical"
topic #8 (0.100): 0.080*"discourse" + 0.073*"relations" + 0.073*"relation" + 0.032*"text" + 0.027*"structure" + 0.024*"type" + 0.020*"sentence" + 0.019*"time" + 0.018*"knowledge" + 0.017*"corpus"
topic diff=0.055122, rho=0.353553
PROGRESS: pass 0, at document #18000/23595
merging changes from 2000 documents into a model of 23595 documents
topic #5 (0.100): 0.136*"word" + 0.092*"words" + 0.068*"speech" + 0.045*"model" + 0.025*

In [20]:
# Optional shortcut: reload previously pickled artifacts from ../models/
# instead of re-running the vectorisation and training cells. The file names
# carry the timestamp of the run that saved them, so adjust them as needed.
# NOTE: only unpickle files you created yourself — loading a pickle can
# execute arbitrary code.

# with open("../models/ldamodel2017-11-04 03_49_52","rb") as f:
#     lda_model = pkl.load(f)

# with open("../models/corpus600002017-11-03 22_37_14","rb") as f:
#     corpus = pkl.load(f)

# with open("../models/dic2017-11-03 22_37_15","rb") as f:
#     dic = pkl.load(f)

In [8]:
# Inspect up to ten topics, each as a (topic_id, "weight*word + ...") pair
# listing its ten highest-weighted words.
lda_model.show_topics(num_topics=10, num_words=10)

[(0,
  '0.081*"text" + 0.071*"feature" + 0.069*"features" + 0.048*"knowledge" + 0.037*"linguistic" + 0.029*"level" + 0.026*"lexical" + 0.024*"type" + 0.022*"analysis" + 0.021*"domain"'),
 (1,
  '0.117*"translation" + 0.058*"english" + 0.040*"source" + 0.037*"machine" + 0.037*"word" + 0.037*"model" + 0.033*"phrase" + 0.033*"target" + 0.027*"data" + 0.024*"statistical"'),
 (2,
  '0.086*"word" + 0.074*"words" + 0.059*"similarity" + 0.054*"sense" + 0.036*"corpus" + 0.031*"context" + 0.029*"method" + 0.024*"pairs" + 0.021*"algorithm" + 0.020*"table"'),
 (3,
  '0.151*"semantic" + 0.057*"syntactic" + 0.053*"verb" + 0.046*"lexical" + 0.037*"noun" + 0.022*"parser" + 0.020*"terms" + 0.018*"corpus" + 0.018*"parsing" + 0.016*"proceedings"'),
 (4,
  '0.053*"data" + 0.043*"user" + 0.039*"document" + 0.036*"systems" + 0.036*"task" + 0.029*"text" + 0.029*"evaluation" + 0.024*"proceedings" + 0.022*"figure" + 0.022*"pages"'),
 (5,
  '0.149*"word" + 0.103*"words" + 0.069*"speech" + 0.050*"model" + 0.025*

In [9]:
# pyLDAvis' gensim adapter, used below to prepare the topic visualisation.
import pyLDAvis.gensim
# (gensim itself is already imported in the setup cell at the top.)
import gensim
# Switch pyLDAvis to notebook mode so prepared visualisations can be
# rendered in the notebook — presumably inline; see the pyLDAvis docs.
pyLDAvis.enable_notebook()


In [None]:
# NOTE(review): a bare `%matplotlib` selects the default GUI backend (the
# recorded output reports Qt5Agg), overriding the `%matplotlib inline` from
# the first cell — confirm this is intended.
%matplotlib

# Build the pyLDAvis visualisation data from the trained model, the
# bag-of-words corpus and the gensim dictionary. The result is only assigned
# here; this cell does not display it.
data = pyLDAvis.gensim.prepare(lda_model, corpus, dic)


Using matplotlib backend: Qt5Agg
