In [11]:
%matplotlib inline


# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


This is an example of applying :class:`sklearn.decomposition.NMF` and
:class:`sklearn.decomposition.LatentDirichletAllocation` to a corpus
of documents to extract additive models of the topic structure of the
corpus.  The output is a list of topics, each represented as a list of
terms (weights are not shown).

Non-negative Matrix Factorization is applied with two different objective
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.

The default parameters (n_samples / n_features / n_components) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is polynomial in NMF. In LDA, the time complexity is
proportional to (n_samples * iterations).




In [12]:
from os import listdir
from os.path import isfile, join
from os import environ
import os
import logging
import _pickle as pkl

from nltk.tokenize import TreebankWordTokenizer, word_tokenize
import re
from metadata import metadata
from time import time
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import langid
import numpy as np

from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
import gensim

# Dedicated, explicitly seeded NumPy generator (legacy RandomState API).
rng = np.random.RandomState(10102016)
# Separately seed NumPy's global generator; note the seed differs from `rng`'s.
np.random.seed(18101995)
# Configure the root logger (getLogger() with no name) to pass INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Replace whatever handlers were attached with a single stream handler, so
# re-running this cell does not stack up duplicate handlers.
logger.handlers = [logging.StreamHandler()]
import datetime

def dehyphenate(s):
    """Undo line-break hyphenation and lowercase the text.

    Every hyphen that is immediately followed by a newline is removed together
    with that newline, so a word split across two lines is joined back into a
    single token. Hyphens elsewhere are left alone.

    Args:
        s: The text to normalise.

    Returns:
        The rejoined text, converted to lower case.
    """
    rejoined = "".join(s.split('-\n'))
    return rejoined.lower()

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            vector per topic; each vector must support ``argsort()``.
        feature_names: Sequence mapping a feature index to its term.
        n_top_words: How many terms to print per topic.

    One line is printed per topic ("Topic #<idx>: term term ..."), with terms
    ordered from highest to lowest weight, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it before taking the leading terms.
        ranked_ids = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_id] for term_id in ranked_ids]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

def psave(ob, filename, timestamp = False):
    """Pickle ``ob`` into the ``../models/`` directory.

    Args:
        ob: Any picklable object.
        filename: Base name of the output file (no directory part).
        timestamp: When true, the current local time is appended to the file
            name as ``YYYY-MM-DD HH_MM_SS`` so repeated runs do not overwrite
            each other.

    Raises:
        OSError: If ``../models/`` (relative to the current working directory)
            does not exist or is not writable.
    """
    timenow = ""
    if timestamp:
        # '_' instead of ':' between the time fields: ':' is not a legal
        # file-name character on every platform, and the reload cell of this
        # notebook refers to saved artefacts with the "HH_MM_SS" spelling.
        timenow = '{:%Y-%m-%d %H_%M_%S}'.format(datetime.datetime.now())
    with open("../models/" + filename + timenow, "wb") as f:
        pkl.dump(ob, f)

In [13]:
#sent_text = nltk.sent_tokenize(text) # this gives us a list of sentences
def _load_paper_text(paper_file):
    """Read one paper from $AAN_DIR/papers_text and strip line-break hyphenation.

    Every hyphen directly followed by a newline is removed (together with the
    newline); the text is otherwise returned unchanged.
    """
    paper_path = os.path.join(os.environ["AAN_DIR"], "papers_text/{0}".format(paper_file))
    with open(paper_path) as handle:
        raw_text = handle.read()
    return raw_text.replace('-\n','')

# Load four individual papers by file name.
good_text = _load_paper_text("W04-0203.txt")
good_text2 = _load_paper_text("W04-0207.txt")
bad_text = _load_paper_text("W04-0205.txt")
bad_text2 = _load_paper_text("J79-1011.txt")
    

In [14]:
from metadata.metadata import ACL_metadata
# Project-local metadata helper; the following cells read its `meta_df` and
# `train_files` attributes.
acl = ACL_metadata()

In [15]:
# Display the metadata table (authors, genders, title, venue, year per paper id).
acl.meta_df

Unnamed: 0_level_0,authors,genders,title,venue,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E03-1001,"[Oard,Douglasw]",[Gender.male],Multilingual Access To Large Spoken Archives (...,EACL,2003
E03-1002,"[Henderson,Jamesb]",[Gender.male],Neural Network Probability Estimation For Broa...,EACL,2003
E03-1003,"[Burstein,Jill, Wolska,Magdalena]","[Gender.female, Gender.female]",Toward Evaluation Of Writing Style: Overly Rep...,EACL,2003
E03-1004,"[Cmejrek,Martin, Curin,Jan, Havelka,Jiri]","[Gender.male, Gender.male, Gender.male]",Czech-English Dependency Tree-Based Machine Tr...,EACL,2003
E03-1005,"[Bod,Rens]",[Gender.male],An Efficient Implementation Of A New DOP Model,EACL,2003
E03-1006,"[Smets,Martine, Gamon,Michael, Corstonoliver,S...","[Gender.female, Gender.male, Gender.male, Gend...",French Amalgam: A Quick Adaptation Of A Senten...,EACL,2003
E03-1007,"[Ueffing,Nicola, Ney,Hermann]","[Gender.female, Gender.male]",Using POS Information For SMT Into Morphologic...,EACL,2003
E03-1008,"[Steedman,Mark, Osborne,Miles, Sarkar,Anoop, C...","[Gender.male, Gender.male, Gender.male, Gender...",Bootstrapping Statistical Parsers From Small D...,EACL,2003
E03-1009,"[Clark,Alexander]",[Gender.male],Combining Distributional And Morphological Inf...,EACL,2003
E03-1010,"[Yasuda,Keiji, Sugaya,Fumiaki, Takezawa,Toshiy...","[Gender.male, Gender.male, Gender.male, Gender...",Automatic Evaluation For A Palpable Measure Of...,EACL,2003


In [16]:
# Number of entries in `train_files`, the collection later handed to the vectorizer.
len(acl.train_files)

23595

In [17]:
# Cap on vocabulary size: passed to CountVectorizer as max_features and also
# embedded in the names of the artefacts saved after vectorization.
n_features = 60000
# NOTE(review): not referenced by any cell visible here; the LDA training call
# hard-codes num_topics=100 instead. Confirm whether this is still needed.
n_components = 10
# NOTE(review): no visible cell reads this global; print_top_words takes its
# own n_top_words argument. Confirm whether this is still needed.
n_top_words = 20

In [None]:
# Build the term-frequency (bag-of-words) matrix LDA is trained on, then derive
# the gensim-side objects (corpus, id->term mapping, Dictionary) from it and
# pickle every artefact with a timestamp.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.85,
                                input='filename',  # items of train_files are paths to read
                                min_df=7,
                                max_features=n_features,
                                stop_words='english',
                                # alphabetic tokens of at least two letters
                                token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b",
                                preprocessor = dehyphenate
                                #tokenizer=TreebankWordTokenizer().tokenize
                                )
t0 = time()
tf = tf_vectorizer.fit_transform(acl.train_files)
print("done in %0.3fs." % (time() - t0))
print()

# NOTE(review): the "tf_vectozier" spelling is kept byte-for-byte so the file
# names stay consistent with artefacts written by earlier runs.
psave(tf_vectorizer,"tf_vectozier" + str(n_features), True)
psave(tf,"tf" + str(n_features),True)

# Wrap the sparse matrix as a gensim corpus. scikit-learn puts documents in
# rows, hence documents_columns=False.
corpus = gensim.matutils.Sparse2Corpus(tf, documents_columns=False)

# scikit-learn's fitted vocabulary_ maps term -> column index; gensim wants the
# inverse (id -> term).
vocabulary_gensim = {index: term for term, index in tf_vectorizer.vocabulary_.items()}

# Build the Dictionary from the corpus plus the id -> term mapping.
# `tf_vectorizer.vocabulary` (no trailing underscore) is the constructor
# argument, which is None here, so Dictionary(tf_vectorizer.vocabulary) would
# produce an empty Dictionary that does not describe this corpus.
dic = Dictionary.from_corpus(corpus, id2word=vocabulary_gensim)

psave(vocabulary_gensim,"vocabulary" + str(n_features),True)
psave(corpus,"corpus" + str(n_features),True)
psave(dic, "dic", True)

Extracting tf features for LDA...
done in 64.694s.



In [None]:
# Train a 100-topic gensim LDA model with 50 passes over the corpus; id2word
# lets the model report topics as terms instead of integer ids.
# NOTE(review): num_topics is hard-coded rather than taken from n_components;
# confirm which value is intended.
lda_model = gensim.models.LdaModel(corpus, num_topics=100, id2word=vocabulary_gensim, passes=50)
# Pickle the trained model under a timestamped "ldamodel-" file name.
psave(lda_model, "ldamodel-", True)

using symmetric alpha at 0.01
using symmetric eta at 1.66666666667e-05
using serial LDA version on this node


In [None]:
# Optional shortcut (disabled): reload previously pickled artefacts instead of
# re-running vectorization and training. The timestamped file names below
# presumably come from one specific earlier run; adjust them to files that
# exist locally. Only unpickle files you created yourself, because pickle can
# execute arbitrary code on load.

# with open("../models/ldamodel2017-11-04 03_49_52","rb") as f:
#     lda_model = pkl.load(f)

# with open("../models/corpus600002017-11-03 22_37_14","rb") as f:
#     corpus = pkl.load(f)

# with open("../models/dic2017-11-03 22_37_15","rb") as f:
#     dic = pkl.load(f)

In [None]:
# Show 10 topics, each summarised by 10 words.
lda_model.show_topics(num_topics=10, num_words=10)

In [None]:
import pyLDAvis.gensim
import gensim
# Put pyLDAvis into notebook mode so prepared visualisations render in output cells.
pyLDAvis.enable_notebook()


In [None]:
%matplotlib
# Prepare the pyLDAvis visualisation data from the trained model, the gensim
# corpus and its Dictionary. A failure is treated as non-fatal: the error is
# printed and execution continues (in that case `data` is left unassigned).
try:
    data = pyLDAvis.gensim.prepare(lda_model, corpus, dic)
except Exception as e:
    # `except e:` would look up the undefined name `e` while handling the
    # error and raise NameError instead; bind the exception with `as`.
    print(e)