In [1]:
# Ask IPython to render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline


# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


This is an example of applying :class:`sklearn.decomposition.NMF` and
:class:`sklearn.decomposition.LatentDirichletAllocation` to a corpus
of documents in order to extract additive models of the topic structure
of the corpus.  The output is a list of topics, each represented as a
list of terms (weights are not shown).

Non-negative Matrix Factorization is applied with two different objective
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.

The default parameters (n_samples / n_features / n_components) should make
the example runnable in a few tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity of NMF is polynomial. For LDA, the time complexity is
proportional to (n_samples * iterations).




In [40]:
from os import listdir
from os.path import isfile, join
from os import environ
import os
import logging
import _pickle as pkl

from nltk.tokenize import TreebankWordTokenizer, word_tokenize
import re
import indexes
from time import time
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import langid
import numpy as np

from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
import gensim

rng = np.random.RandomState(10102016)
np.random.seed(18101995)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [logging.StreamHandler()]
import datetime

def dehyphenate(s):
    """Undo end-of-line hyphenation and lower-case the text.

    Every hyphen that is immediately followed by a newline is removed
    together with that newline, which glues the two halves of a word that
    was broken across lines back together. The result is returned in
    lower case.
    """
    rejoined = "".join(s.split('-\n'))
    return rejoined.lower()

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    One line is printed per row of ``model.components_``, in the form
    ``Topic #<row index>: <term> <term> ...``, followed by a single blank
    line after the last topic.

    model          -- object exposing ``components_``; each row must support
                      ``argsort()`` (e.g. a NumPy array of term weights).
    feature_names  -- sequence mapping a column index to its term.
    n_top_words    -- how many terms to list per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so take the last n_top_words positions in
        # reverse to get the indices ordered from heaviest to lightest.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

def psave(ob, filename, timestamp = False):
    """Pickle ``ob`` to a file below the ``models/`` directory.

    ob         -- any picklable object.
    filename   -- name of the file, relative to ``models/``.
    timestamp  -- when true, the current local time formatted as
                  ``YYYY-MM-DD HH:MM:SS`` is appended to the file name so
                  that successive saves do not overwrite each other.

    The target directory is created if it does not exist yet, so the
    function also works on a fresh checkout. Returns None.

    Note: the timestamp suffix contains a space and colons; the format is
    kept unchanged so previously written files keep matching, but such
    names are not valid on every filesystem.
    """
    suffix = ""
    if timestamp:
        suffix = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
    path = "models/" + filename + suffix
    # open(..., "wb") does not create missing parent directories itself.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pkl.dump(ob, f)

In [2]:
def _read_paper(paper_name):
    """Return the text of one file from $AAN_DIR/papers_text, with every
    hyphen-plus-newline sequence removed."""
    paper_path = os.path.join(os.environ["AAN_DIR"], "papers_text/{0}".format(paper_name))
    with open(paper_path) as f:
        return f.read().replace('-\n','')

# Four hand-picked papers kept around for manual inspection.
# NOTE(review): the good/bad naming presumably refers to the quality of the
# extracted text — confirm with the author.
good_text = _read_paper("W04-0203.txt")
good_text2 = _read_paper("W04-0207.txt")
bad_text = _read_paper("W04-0205.txt")
bad_text2 = _read_paper("J79-1011.txt")
    

In [3]:
# ACL_fulltext is provided by the project-local `indexes` module (its implementation is not shown here).
from indexes import ACL_fulltext
# Build the index object once; later cells read `acl.df` and `acl.train_files` from it.
acl = ACL_fulltext()

In [4]:
# Display the index's `df` attribute; the output below shows per-paper metadata (authors, genders, title, venue, year) keyed by id.
acl.df

Unnamed: 0_level_0,authors,genders,title,venue,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E03-1062,"[Piwek, Paul]",[Gender.male],A Flexible Pragmatics-Driven Language Generato...,EACL,2003
E06-1001,"[McConville, Mark]",[Gender.male],Inheritance And The CCG Lexicon,EACL,2006
E06-1002,"[Bunescu, Razvan C., Pa&scedil;ca, Marius]","[Gender.male, Gender.male]",Using Encyclopedic Knowledge For Named Entity ...,EACL,2006
E06-1003,"[Tanev, Hristo, Magnini, Bernardo]","[Gender.male, Gender.male]",Weakly Supervised Approaches For Ontology Popu...,EACL,2006
E06-1004,"[Udupa, Raghavendra, Maji, Hemanta K.]","[Gender.male, Gender.male]",Computational Complexity Of Statistical Machin...,EACL,2006
E06-1005,"[Matusov, Evgeny, Ueffing, Nicola, Ney, Hermann]","[Gender.male, Gender.female, Gender.male]",Computing Consensus Translation For Multiple M...,EACL,2006
E06-1006,"[Yang, Mei, Kirchhoff, Katrin]","[Gender.female, Gender.female]",Phrase-Based Backoff Models For Machine Transl...,EACL,2006
E06-1007,"[M&uuml;ller, Christof E.]",[Gender.male],Automatic Detection Of Nonreferential It In Sp...,EACL,2006
E06-1008,"[Jonson, Rebecca]",[Gender.female],Generating Statistical Language Models From In...,EACL,2006
E06-1009,"[Demberg, Vera, Moore, Johanna D.]","[Gender.female, Gender.female]",Information Presentation In Spoken Dialogue Sy...,EACL,2006


In [5]:
# Number of entries in `acl.train_files`, the collection that is later fed to the vectorizer.
len(acl.train_files)

22405

In [6]:
# Tunable sizes for the topic-model experiment.
n_top_words = 20     # number of terms to list per topic
# NOTE(review): the gensim training cell hard-codes num_topics=100 rather
# than reading n_components — confirm which value is intended.
n_components = 10    # number of components (topics)
n_features = 60000   # passed to CountVectorizer as max_features (vocabulary cap)

In [43]:
# --- Bag-of-words extraction -------------------------------------------------
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    input='filename',          # items of acl.train_files are opened and read by the vectorizer
    preprocessor=dehyphenate,  # a custom preprocessor replaces the built-in lower-casing step, so dehyphenate lower-cases itself
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b",  # purely alphabetic tokens of at least two letters
    stop_words='english',
    max_df=0.85,
    min_df=7,
    max_features=n_features,
)
t0 = time()
tf = tf_vectorizer.fit_transform(acl.train_files)
print("done in %0.3fs." % (time() - t0))
print()

# NOTE: the "tf_vectozier" spelling is kept as-is so the file name stays
# compatible with artefacts that were already written under that name.
psave(tf_vectorizer,"tf_vectozier" + str(n_features), True)
psave(tf,"tf" + str(n_features),True)

# --- Conversion to gensim structures -----------------------------------------
# scikit-learn yields a (documents x terms) sparse matrix, hence
# documents_columns=False.
corpus = gensim.matutils.Sparse2Corpus(tf, documents_columns=False)

# scikit-learn's fitted vocabulary_ maps term -> column index; gensim wants
# the inverse mapping (id -> term).
vocabulary_gensim = {idx: term for term, idx in tf_vectorizer.vocabulary_.items()}

# Build the gensim Dictionary from the fitted data. The previous code passed
# `tf_vectorizer.vocabulary`, which is the constructor argument (None here),
# not the fitted `vocabulary_`, and therefore produced an empty Dictionary.
dic = Dictionary.from_corpus(corpus, id2word=vocabulary_gensim)

# Persist the id -> term mapping. The previous code referenced an undefined
# name `vocabulary`, which raises NameError on a fresh kernel.
psave(vocabulary_gensim,"vocabulary" + str(n_features),True)
psave(corpus,"corpus" + str(n_features),True)
psave(dic, "dic", True)

Extracting tf features for LDA...
done in 52.216s.



In [45]:
# Train a 100-topic gensim LDA model with 50 passes over `corpus`, using the id -> term mapping built above;
# the `%time` line magic reports how long the call took.
# NOTE(review): num_topics is hard-coded here instead of using n_components — confirm that is intended.
%time lda_model = gensim.models.LdaModel(corpus, num_topics=100, id2word=vocabulary_gensim, passes=50)
# Pickle the trained model under models/ with a timestamp appended to the file name.
psave(lda_model, "ldamodel2", True)

using symmetric alpha at 0.01
using symmetric eta at 1.66666666667e-05
using serial LDA version on this node
running online (multi-pass) LDA training, 100 topics, 50 passes over the supplied corpus of 22405 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
PROGRESS: pass 0, at document #2000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #72 (0.010): 0.009*"word" + 0.009*"model" + 0.008*"translation" + 0.007*"words" + 0.005*"training" + 0.004*"models" + 0.004*"features" + 0.004*"sentence" + 0.004*"table" + 0.004*"proceedings"
topic #6 (0.010): 0.004*"words" + 0.004*"feature" + 0.004*"sentence" + 0.004*"corpus" + 0.004*"model" + 0.004*"domain" + 0.004*"word" + 0.004*"table" + 0.004*"translation" + 0.003*"english"
topic #45 (0.010): 0.004*"model" + 0.004*"word" + 0.004*"features" + 0.004*"words" + 0.004*"text" + 0.004*"sentence" + 0.003*"feature" + 0.003

merging changes from 2000 documents into a model of 22405 documents
topic #37 (0.010): 0.032*"evaluation" + 0.019*"metrics" + 0.015*"metric" + 0.015*"human" + 0.014*"systems" + 0.012*"scores" + 0.011*"task" + 0.010*"correlation" + 0.009*"automatic" + 0.009*"reference"
topic #60 (0.010): 0.019*"model" + 0.012*"probability" + 0.009*"probabilities" + 0.008*"training" + 0.007*"pp" + 0.007*"models" + 0.006*"corpus" + 0.006*"word" + 0.006*"sentence" + 0.005*"context"
topic #55 (0.010): 0.047*"error" + 0.041*"errors" + 0.021*"correction" + 0.012*"correct" + 0.010*"sentence" + 0.008*"word" + 0.006*"corrections" + 0.006*"sentences" + 0.006*"spelling" + 0.006*"corpus"
topic #69 (0.010): 0.052*"dependency" + 0.021*"parsing" + 0.011*"graph" + 0.010*"projective" + 0.008*"nivre" + 0.007*"parser" + 0.006*"non" + 0.006*"edge" + 0.006*"tree" + 0.006*"order"
topic #76 (0.010): 0.059*"sense" + 0.024*"senses" + 0.021*"word" + 0.019*"disambiguation" + 0.013*"semantic" + 0.013*"wsd" + 0.013*"words" + 0.007*

topic #69 (0.010): 0.082*"dependency" + 0.032*"parsing" + 0.011*"parser" + 0.009*"nivre" + 0.009*"projective" + 0.008*"head" + 0.008*"pages" + 0.007*"graph" + 0.007*"dependencies" + 0.007*"proceedings"
topic #80 (0.010): 0.050*"grammar" + 0.011*"grammars" + 0.011*"semantic" + 0.011*"unification" + 0.010*"hpsg" + 0.010*"structure" + 0.010*"lexical" + 0.009*"feature" + 0.008*"coverage" + 0.007*"structures"
topic #67 (0.010): 0.029*"text" + 0.011*"structure" + 0.008*"propositions" + 0.006*"knowledge" + 0.006*"rhetorical" + 0.005*"generation" + 0.005*"proposition" + 0.004*"texts" + 0.004*"discourse" + 0.004*"figure"
topic #37 (0.010): 0.040*"evaluation" + 0.020*"human" + 0.017*"metrics" + 0.015*"scores" + 0.015*"systems" + 0.014*"metric" + 0.011*"task" + 0.011*"correlation" + 0.011*"score" + 0.011*"reference"
topic diff=1.884609, rho=0.275215
PROGRESS: pass 1, at document #8000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #45 (0.010): 0.048*"generation" +

topic #47 (0.010): 0.032*"model" + 0.017*"models" + 0.017*"hmm" + 0.015*"sequence" + 0.011*"em" + 0.011*"training" + 0.011*"unsupervised" + 0.010*"word" + 0.010*"crf" + 0.009*"state"
topic #24 (0.010): 0.031*"sentence" + 0.027*"document" + 0.027*"summarization" + 0.027*"sentences" + 0.024*"summary" + 0.022*"summaries" + 0.012*"text" + 0.010*"documents" + 0.009*"content" + 0.006*"topic"
topic #30 (0.010): 0.072*"annotation" + 0.022*"corpus" + 0.021*"xml" + 0.020*"annotated" + 0.019*"id" + 0.017*"annotations" + 0.010*"gene" + 0.009*"format" + 0.008*"type" + 0.008*"text"
topic diff=0.954336, rho=0.275215
PROGRESS: pass 1, at document #22000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #57 (0.010): 0.091*"tree" + 0.047*"node" + 0.037*"trees" + 0.027*"nodes" + 0.012*"figure" + 0.011*"root" + 0.009*"tag" + 0.009*"structure" + 0.008*"derivation" + 0.007*"grammar"
topic #42 (0.010): 0.009*"systems" + 0.008*"processing" + 0.008*"text" + 0.007*"natural" + 0.007

topic diff=0.539801, rho=0.265349
PROGRESS: pass 2, at document #12000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #77 (0.010): 0.017*"languages" + 0.012*"word" + 0.012*"vowel" + 0.011*"phonological" + 0.010*"segmentation" + 0.010*"syllable" + 0.008*"phoneme" + 0.008*"phonetic" + 0.008*"consonant" + 0.007*"vowels"
topic #66 (0.010): 0.008*"participants" + 0.007*"social" + 0.006*"email" + 0.006*"linguistic" + 0.006*"study" + 0.005*"human" + 0.005*"conversation" + 0.005*"speakers" + 0.005*"pages" + 0.005*"studies"
topic #65 (0.010): 0.085*"features" + 0.048*"feature" + 0.021*"classification" + 0.019*"classifier" + 0.017*"training" + 0.012*"accuracy" + 0.012*"learning" + 0.011*"class" + 0.011*"table" + 0.011*"performance"
topic #94 (0.010): 0.041*"corpus" + 0.039*"text" + 0.025*"texts" + 0.014*"corpora" + 0.011*"words" + 0.008*"frequency" + 0.006*"genre" + 0.005*"length" + 0.005*"author" + 0.005*"grams"
topic #21 (0.010): 0.015*"model" + 0.012*"function

topic #39 (0.010): 0.054*"chinese" + 0.039*"word" + 0.028*"segmentation" + 0.015*"words" + 0.011*"chunk" + 0.010*"table" + 0.010*"pos" + 0.009*"corpus" + 0.008*"character" + 0.007*"korean"
topic #10 (0.010): 0.026*"languages" + 0.025*"arabic" + 0.014*"proceedings" + 0.010*"pages" + 0.009*"spanish" + 0.009*"english" + 0.008*"swedish" + 0.008*"shared" + 0.008*"en" + 0.007*"treebank"
topic #68 (0.010): 0.044*"kernel" + 0.026*"tree" + 0.019*"kernels" + 0.017*"syntactic" + 0.010*"semantic" + 0.008*"trees" + 0.008*"feature" + 0.008*"parse" + 0.008*"features" + 0.008*"learning"
topic #48 (0.010): 0.028*"plan" + 0.025*"agent" + 0.021*"model" + 0.019*"goal" + 0.014*"agents" + 0.013*"belief" + 0.011*"goals" + 0.010*"action" + 0.010*"speaker" + 0.009*"beliefs"
topic diff=0.329766, rho=0.256473
PROGRESS: pass 3, at document #4000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #97 (0.010): 0.028*"parser" + 0.026*"np" + 0.024*"parsing" + 0.017*"treebank" + 0.016*"par

topic #86 (0.010): 0.061*"speech" + 0.032*"recognition" + 0.012*"spoken" + 0.011*"training" + 0.010*"word" + 0.010*"speaker" + 0.009*"acoustic" + 0.009*"models" + 0.009*"error" + 0.009*"asr"
topic diff=0.257321, rho=0.256473
PROGRESS: pass 3, at document #18000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #84 (0.010): 0.064*"japanese" + 0.038*"method" + 0.022*"case" + 0.014*"sentences" + 0.013*"proposed" + 0.013*"sentence" + 0.013*"figure" + 0.011*"japan" + 0.010*"table" + 0.008*"ga"
topic #34 (0.010): 0.063*"web" + 0.056*"search" + 0.047*"query" + 0.027*"queries" + 0.019*"pages" + 0.013*"engine" + 0.011*"page" + 0.009*"google" + 0.007*"www" + 0.005*"http"
topic #96 (0.010): 0.066*"sentence" + 0.052*"sentences" + 0.029*"paraphrase" + 0.028*"paraphrases" + 0.012*"paraphrasing" + 0.010*"simplification" + 0.010*"text" + 0.009*"proceedings" + 0.009*"original" + 0.008*"substitution"
topic #83 (0.010): 0.280*"rules" + 0.223*"rule" + 0.014*"applied" + 0.008*

topic #90 (0.010): 0.039*"wordnet" + 0.027*"semantic" + 0.017*"lexical" + 0.015*"word" + 0.013*"words" + 0.009*"resources" + 0.008*"synsets" + 0.008*"synonyms" + 0.007*"proceedings" + 0.007*"synset"
topic #86 (0.010): 0.078*"speech" + 0.037*"recognition" + 0.016*"spoken" + 0.014*"asr" + 0.010*"acoustic" + 0.009*"word" + 0.009*"training" + 0.008*"rate" + 0.008*"error" + 0.008*"models"
topic #35 (0.010): 0.021*"agreement" + 0.020*"annotators" + 0.016*"annotation" + 0.012*"annotator" + 0.010*"resources" + 0.010*"task" + 0.009*"languages" + 0.008*"hindi" + 0.007*"annotations" + 0.007*"proceedings"
topic #24 (0.010): 0.033*"summarization" + 0.032*"sentence" + 0.029*"sentences" + 0.024*"summary" + 0.020*"document" + 0.019*"summaries" + 0.012*"text" + 0.010*"rouge" + 0.009*"content" + 0.007*"documents"
topic #48 (0.010): 0.028*"plan" + 0.023*"agent" + 0.017*"goal" + 0.017*"model" + 0.014*"speaker" + 0.012*"action" + 0.012*"belief" + 0.011*"goals" + 0.011*"act" + 0.010*"agents"
topic diff=0.17

topic #27 (0.010): 0.071*"temporal" + 0.049*"time" + 0.019*"tense" + 0.018*"expressions" + 0.017*"event" + 0.014*"events" + 0.012*"past" + 0.011*"aspect" + 0.009*"causal" + 0.009*"interval"
topic #80 (0.010): 0.069*"grammar" + 0.019*"feature" + 0.018*"grammars" + 0.015*"structure" + 0.012*"structures" + 0.012*"unification" + 0.011*"lexical" + 0.008*"type" + 0.008*"hpsg" + 0.007*"coverage"
topic #66 (0.010): 0.013*"participants" + 0.010*"study" + 0.008*"group" + 0.007*"linguistic" + 0.007*"social" + 0.006*"analysis" + 0.006*"studies" + 0.005*"differences" + 0.005*"speakers" + 0.005*"participant"
topic diff=0.188862, rho=0.248433
-8.186 per-word bound, 291.3 perplexity estimate based on a held-out corpus of 405 documents with 841110 words
PROGRESS: pass 4, at document #22405/22405
merging changes from 405 documents into a model of 22405 documents
topic #71 (0.010): 0.062*"coreference" + 0.041*"mentions" + 0.035*"mention" + 0.025*"resolution" + 0.016*"chain" + 0.015*"chains" + 0.011*"enti

topic #50 (0.010): 0.047*"argument" + 0.032*"predicate" + 0.028*"arguments" + 0.028*"role" + 0.025*"semantic" + 0.021*"task" + 0.015*"srl" + 0.014*"labeling" + 0.012*"predicates" + 0.011*"roles"
topic #34 (0.010): 0.067*"web" + 0.054*"search" + 0.052*"query" + 0.030*"queries" + 0.021*"pages" + 0.013*"page" + 0.012*"engine" + 0.010*"google" + 0.007*"www" + 0.006*"http"
topic diff=0.139418, rho=0.241104
PROGRESS: pass 5, at document #14000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #31 (0.010): 0.214*"word" + 0.174*"words" + 0.013*"dictionary" + 0.011*"text" + 0.009*"list" + 0.009*"algorithm" + 0.008*"unknown" + 0.007*"method" + 0.006*"speech" + 0.005*"correct"
topic #7 (0.010): 0.078*"noun" + 0.038*"phrase" + 0.037*"verb" + 0.027*"phrases" + 0.025*"nouns" + 0.015*"syntactic" + 0.015*"head" + 0.012*"adjective" + 0.012*"adjectives" + 0.010*"compound"
topic #47 (0.010): 0.040*"model" + 0.020*"models" + 0.018*"sequence" + 0.013*"crf" + 0.012*"hmm" + 0.01

topic #15 (0.010): 0.019*"object" + 0.016*"objects" + 0.007*"spatial" + 0.007*"point" + 0.005*"reference" + 0.005*"like" + 0.004*"way" + 0.004*"view" + 0.004*"description" + 0.004*"world"
topic diff=0.133656, rho=0.234388
PROGRESS: pass 6, at document #4000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #97 (0.010): 0.027*"parser" + 0.024*"parsing" + 0.020*"np" + 0.019*"treebank" + 0.019*"parse" + 0.010*"nn" + 0.010*"sentences" + 0.010*"syntactic" + 0.009*"penn" + 0.007*"constituent"
topic #10 (0.010): 0.065*"arabic" + 0.024*"languages" + 0.013*"proceedings" + 0.009*"habash" + 0.009*"pages" + 0.009*"resources" + 0.009*"shared" + 0.009*"english" + 0.008*"morphological" + 0.007*"swedish"
topic #96 (0.010): 0.125*"sentence" + 0.102*"sentences" + 0.020*"paraphrase" + 0.018*"paraphrases" + 0.012*"simplification" + 0.011*"text" + 0.010*"original" + 0.009*"proceedings" + 0.008*"simple" + 0.008*"paraphrasing"
topic #54 (0.010): 0.013*"interpretation" + 0.011*"t

merging changes from 2000 documents into a model of 22405 documents
topic #15 (0.010): 0.019*"object" + 0.013*"objects" + 0.006*"point" + 0.005*"reference" + 0.005*"spatial" + 0.005*"like" + 0.004*"description" + 0.004*"way" + 0.004*"understanding" + 0.004*"location"
topic #8 (0.010): 0.085*"network" + 0.038*"networks" + 0.036*"neural" + 0.029*"layer" + 0.014*"model" + 0.014*"hidden" + 0.013*"input" + 0.012*"representations" + 0.010*"representation" + 0.009*"output"
topic #78 (0.010): 0.047*"patterns" + 0.038*"extraction" + 0.032*"pattern" + 0.027*"precision" + 0.021*"extracted" + 0.019*"recall" + 0.016*"corpus" + 0.013*"method" + 0.011*"extract" + 0.011*"terms"
topic #74 (0.010): 0.023*"children" + 0.020*"acquisition" + 0.020*"child" + 0.015*"memory" + 0.014*"learning" + 0.014*"processing" + 0.013*"cognitive" + 0.012*"model" + 0.012*"learner" + 0.011*"syntactic"
topic #50 (0.010): 0.050*"argument" + 0.036*"predicate" + 0.030*"semantic" + 0.029*"arguments" + 0.029*"role" + 0.021*"task"

topic #64 (0.010): 0.082*"frame" + 0.045*"frames" + 0.032*"semantic" + 0.027*"framenet" + 0.021*"roles" + 0.021*"role" + 0.009*"fillmore" + 0.009*"lexical" + 0.008*"elements" + 0.007*"semantics"
topic #12 (0.010): 0.027*"logic" + 0.018*"type" + 0.016*"logical" + 0.015*"scope" + 0.015*"variables" + 0.013*"variable" + 0.013*"formula" + 0.012*"order" + 0.010*"proof" + 0.010*"form"
topic #26 (0.010): 0.072*"np" + 0.037*"category" + 0.027*"categories" + 0.026*"vp" + 0.014*"grammar" + 0.014*"lexical" + 0.013*"structure" + 0.012*"cat" + 0.010*"syntactic" + 0.010*"verb"
topic #24 (0.010): 0.036*"summarization" + 0.027*"summary" + 0.024*"sentence" + 0.023*"sentences" + 0.021*"summaries" + 0.021*"document" + 0.011*"text" + 0.011*"rouge" + 0.010*"content" + 0.007*"automatic"
topic diff=0.095265, rho=0.228203
PROGRESS: pass 7, at document #10000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #66 (0.010): 0.011*"study" + 0.009*"participants" + 0.008*"group" + 0.007*

topic #52 (0.010): 0.032*"expressions" + 0.021*"mwes" + 0.020*"mwe" + 0.018*"collocations" + 0.018*"multiword" + 0.016*"collocation" + 0.014*"expression" + 0.011*"literal" + 0.009*"light" + 0.008*"idiomatic"
topic #78 (0.010): 0.047*"patterns" + 0.041*"extraction" + 0.030*"pattern" + 0.028*"precision" + 0.022*"extracted" + 0.020*"recall" + 0.016*"corpus" + 0.013*"method" + 0.012*"extract" + 0.011*"terms"
topic #0 (0.010): 0.024*"czech" + 0.015*"languages" + 0.014*"russian" + 0.013*"lemma" + 0.009*"sr" + 0.009*"prague" + 0.008*"rdf" + 0.007*"case" + 0.007*"lemmas" + 0.007*"polish"
topic diff=0.124940, rho=0.228203
-8.163 per-word bound, 286.6 perplexity estimate based on a held-out corpus of 405 documents with 841110 words
PROGRESS: pass 7, at document #22405/22405
merging changes from 405 documents into a model of 22405 documents
topic #87 (0.010): 0.036*"learning" + 0.023*"state" + 0.022*"action" + 0.020*"actions" + 0.018*"robot" + 0.012*"game" + 0.011*"instructions" + 0.010*"policy" 

topic #47 (0.010): 0.040*"model" + 0.020*"models" + 0.019*"sequence" + 0.014*"crf" + 0.011*"hmm" + 0.011*"conditional" + 0.011*"unsupervised" + 0.010*"state" + 0.010*"training" + 0.009*"markov"
topic #58 (0.010): 0.037*"character" + 0.026*"characters" + 0.022*"transliteration" + 0.016*"english" + 0.015*"code" + 0.013*"letter" + 0.011*"names" + 0.009*"string" + 0.009*"table" + 0.009*"dictionary"
topic diff=0.093156, rho=0.222483
PROGRESS: pass 8, at document #14000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #48 (0.010): 0.024*"plan" + 0.019*"agent" + 0.016*"goal" + 0.015*"speaker" + 0.014*"model" + 0.013*"action" + 0.013*"belief" + 0.012*"act" + 0.010*"goals" + 0.010*"hearer"
topic #92 (0.010): 0.069*"dialogue" + 0.023*"utterance" + 0.021*"utterances" + 0.017*"dialog" + 0.012*"task" + 0.012*"speech" + 0.012*"spoken" + 0.012*"dialogues" + 0.011*"turn" + 0.010*"act"
topic #5 (0.010): 0.061*"pi" + 0.039*"van" + 0.037*"dutch" + 0.032*"nl" + 0.016*"ep" + 

topic #25 (0.010): 0.022*"string" + 0.021*"state" + 0.019*"finite" + 0.013*"strings" + 0.009*"let" + 0.009*"context" + 0.008*"symbol" + 0.008*"states" + 0.007*"symbols" + 0.007*"languages"
topic diff=0.095706, rho=0.217173
PROGRESS: pass 9, at document #4000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #97 (0.010): 0.028*"parser" + 0.025*"parsing" + 0.020*"parse" + 0.020*"treebank" + 0.016*"np" + 0.011*"nn" + 0.011*"sentences" + 0.010*"syntactic" + 0.009*"penn" + 0.007*"accuracy"
topic #78 (0.010): 0.044*"patterns" + 0.041*"extraction" + 0.028*"precision" + 0.028*"pattern" + 0.021*"extracted" + 0.020*"recall" + 0.016*"corpus" + 0.013*"method" + 0.012*"extract" + 0.011*"seed"
topic #36 (0.010): 0.068*"value" + 0.059*"attributes" + 0.057*"attribute" + 0.049*"values" + 0.043*"class" + 0.029*"type" + 0.021*"item" + 0.020*"items" + 0.019*"object" + 0.018*"schema"
topic #29 (0.010): 0.032*"vector" + 0.026*"vectors" + 0.022*"space" + 0.020*"context" + 0.018*

merging changes from 2000 documents into a model of 22405 documents
topic #71 (0.010): 0.062*"coreference" + 0.033*"mentions" + 0.032*"mention" + 0.025*"resolution" + 0.013*"entity" + 0.012*"chain" + 0.012*"chains" + 0.009*"muc" + 0.009*"ace" + 0.009*"ng"
topic #85 (0.010): 0.061*"knowledge" + 0.022*"ontology" + 0.021*"base" + 0.020*"database" + 0.014*"domain" + 0.012*"concepts" + 0.011*"natural" + 0.010*"concept" + 0.009*"expert" + 0.009*"medical"
topic #16 (0.010): 0.091*"relation" + 0.086*"relations" + 0.027*"entailment" + 0.017*"pairs" + 0.014*"textual" + 0.012*"inference" + 0.011*"pair" + 0.010*"text" + 0.009*"rte" + 0.007*"hypothesis"
topic #99 (0.010): 0.021*"biomedical" + 0.019*"protein" + 0.015*"scope" + 0.015*"gene" + 0.013*"negation" + 0.012*"medical" + 0.011*"clinical" + 0.011*"abstracts" + 0.010*"task" + 0.010*"cue"
topic #25 (0.010): 0.022*"state" + 0.021*"string" + 0.018*"finite" + 0.012*"strings" + 0.009*"context" + 0.009*"let" + 0.008*"states" + 0.008*"symbol" + 0.008*

topic #33 (0.010): 0.101*"user" + 0.029*"users" + 0.009*"interaction" + 0.009*"figure" + 0.008*"systems" + 0.008*"time" + 0.008*"interface" + 0.007*"interactive" + 0.006*"response" + 0.005*"task"
topic #25 (0.010): 0.022*"string" + 0.019*"state" + 0.016*"finite" + 0.013*"strings" + 0.010*"let" + 0.009*"context" + 0.008*"symbol" + 0.008*"languages" + 0.008*"symbols" + 0.007*"free"
topic #97 (0.010): 0.030*"parser" + 0.026*"parsing" + 0.021*"parse" + 0.020*"treebank" + 0.015*"np" + 0.011*"sentences" + 0.010*"nn" + 0.010*"syntactic" + 0.009*"penn" + 0.008*"accuracy"
topic #19 (0.010): 0.091*"alignment" + 0.034*"word" + 0.029*"alignments" + 0.023*"aligned" + 0.015*"pairs" + 0.014*"target" + 0.014*"source" + 0.013*"pair" + 0.013*"model" + 0.012*"sentence"
topic diff=0.074087, rho=0.212226
PROGRESS: pass 10, at document #10000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #86 (0.010): 0.083*"speech" + 0.040*"recognition" + 0.016*"spoken" + 0.014*"asr" + 0.01

topic #46 (0.010): 0.087*"graph" + 0.036*"nodes" + 0.033*"constraints" + 0.032*"edges" + 0.030*"edge" + 0.027*"node" + 0.027*"path" + 0.023*"graphs" + 0.018*"constraint" + 0.016*"algorithm"
topic #14 (0.010): 0.056*"training" + 0.048*"learning" + 0.019*"labeled" + 0.017*"supervised" + 0.016*"examples" + 0.015*"label" + 0.013*"labels" + 0.013*"instances" + 0.012*"performance" + 0.010*"algorithm"
topic #30 (0.010): 0.091*"annotation" + 0.031*"annotated" + 0.028*"annotations" + 0.018*"corpus" + 0.016*"xml" + 0.016*"scheme" + 0.013*"type" + 0.013*"id" + 0.010*"linguistic" + 0.009*"resources"
topic diff=0.103291, rho=0.212226
-8.150 per-word bound, 284.0 perplexity estimate based on a held-out corpus of 405 documents with 841110 words
PROGRESS: pass 10, at document #22405/22405
merging changes from 405 documents into a model of 22405 documents
topic #32 (0.010): 0.030*"grammar" + 0.029*"parsing" + 0.019*"algorithm" + 0.015*"grammars" + 0.013*"chart" + 0.011*"parse" + 0.010*"non" + 0.010*"le

topic #2 (0.010): 0.058*"segmentation" + 0.047*"segment" + 0.037*"segments" + 0.023*"boundaries" + 0.022*"text" + 0.015*"boundary" + 0.012*"cohesion" + 0.012*"structure" + 0.009*"algorithm" + 0.009*"lexical"
topic diff=0.075783, rho=0.207603
PROGRESS: pass 11, at document #14000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #90 (0.010): 0.048*"wordnet" + 0.033*"semantic" + 0.015*"lexical" + 0.010*"resources" + 0.010*"wikipedia" + 0.009*"synonyms" + 0.009*"synsets" + 0.009*"relations" + 0.008*"hypernym" + 0.008*"concepts"
topic #92 (0.010): 0.069*"dialogue" + 0.023*"utterance" + 0.021*"utterances" + 0.019*"dialog" + 0.012*"task" + 0.012*"spoken" + 0.012*"speech" + 0.012*"dialogues" + 0.011*"turn" + 0.009*"act"
topic #94 (0.010): 0.058*"corpus" + 0.049*"text" + 0.029*"texts" + 0.018*"corpora" + 0.007*"words" + 0.007*"analysis" + 0.006*"frequency" + 0.006*"genre" + 0.005*"author" + 0.005*"articles"
topic #59 (0.010): 0.039*"student" + 0.032*"students" + 0

topic diff=0.079011, rho=0.203268
PROGRESS: pass 12, at document #4000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #87 (0.010): 0.046*"action" + 0.035*"actions" + 0.034*"state" + 0.025*"learning" + 0.016*"game" + 0.013*"robot" + 0.012*"policy" + 0.012*"instructions" + 0.009*"environment" + 0.009*"states"
topic #0 (0.010): 0.022*"czech" + 0.017*"languages" + 0.014*"lemma" + 0.012*"swedish" + 0.012*"russian" + 0.011*"sr" + 0.008*"prague" + 0.007*"case" + 0.007*"lemmas" + 0.006*"rdf"
topic #88 (0.010): 0.091*"verb" + 0.069*"verbs" + 0.017*"classes" + 0.013*"class" + 0.011*"semantic" + 0.011*"metaphor" + 0.011*"object" + 0.010*"subject" + 0.009*"selectional" + 0.008*"syntactic"
topic #1 (0.010): 0.009*"news" + 0.006*"time" + 0.006*"year" + 0.006*"research" + 0.005*"people" + 0.005*"articles" + 0.004*"political" + 0.004*"government" + 0.004*"university" + 0.004*"science"
topic #69 (0.010): 0.098*"dependency" + 0.047*"parsing" + 0.022*"parser" + 0.014*"hea

topic #83 (0.010): 0.274*"rules" + 0.217*"rule" + 0.016*"applied" + 0.014*"transformation" + 0.010*"hand" + 0.009*"application" + 0.009*"apply" + 0.008*"transformations" + 0.007*"context" + 0.006*"applying"
topic #10 (0.010): 0.100*"arabic" + 0.012*"morphological" + 0.011*"languages" + 0.010*"habash" + 0.008*"basque" + 0.008*"proceedings" + 0.008*"resources" + 0.007*"standard" + 0.006*"dialectal" + 0.006*"dialects"
topic diff=0.074889, rho=0.203268
-8.084 per-word bound, 271.3 perplexity estimate based on a held-out corpus of 2000 documents with 4259383 words
PROGRESS: pass 12, at document #20000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #10 (0.010): 0.091*"arabic" + 0.014*"morphological" + 0.012*"languages" + 0.009*"basque" + 0.009*"habash" + 0.009*"proceedings" + 0.007*"resources" + 0.007*"standard" + 0.006*"gold" + 0.006*"lemma"
topic #37 (0.010): 0.042*"evaluation" + 0.022*"systems" + 0.019*"task" + 0.019*"scores" + 0.018*"score" + 0.017*"human

topic #36 (0.010): 0.092*"class" + 0.064*"value" + 0.047*"classes" + 0.046*"attribute" + 0.045*"values" + 0.045*"type" + 0.044*"attributes" + 0.024*"items" + 0.021*"item" + 0.016*"types"
topic diff=0.064034, rho=0.199195
PROGRESS: pass 13, at document #10000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #99 (0.010): 0.023*"biomedical" + 0.017*"protein" + 0.013*"gene" + 0.013*"task" + 0.012*"negation" + 0.011*"scope" + 0.011*"clinical" + 0.010*"abstracts" + 0.010*"medical" + 0.009*"extraction"
topic #12 (0.010): 0.029*"logic" + 0.018*"logical" + 0.015*"type" + 0.015*"variables" + 0.015*"scope" + 0.013*"variable" + 0.013*"formula" + 0.012*"order" + 0.011*"form" + 0.010*"proof"
topic #67 (0.010): 0.043*"text" + 0.019*"structure" + 0.009*"level" + 0.009*"knowledge" + 0.008*"generation" + 0.007*"content" + 0.007*"propositions" + 0.006*"process" + 0.006*"message" + 0.006*"texts"
topic #23 (0.010): 0.049*"morphological" + 0.024*"forms" + 0.022*"morphology" + 

PROGRESS: pass 13, at document #22405/22405
merging changes from 405 documents into a model of 22405 documents
topic #54 (0.010): 0.012*"interpretation" + 0.011*"theory" + 0.009*"john" + 0.008*"context" + 0.008*"case" + 0.007*"semantics" + 0.007*"fact" + 0.007*"like" + 0.006*"meaning" + 0.006*"possible"
topic #25 (0.010): 0.024*"string" + 0.022*"state" + 0.020*"finite" + 0.014*"strings" + 0.009*"let" + 0.009*"context" + 0.008*"states" + 0.008*"symbol" + 0.007*"symbols" + 0.007*"languages"
topic #87 (0.010): 0.050*"action" + 0.038*"actions" + 0.035*"state" + 0.024*"learning" + 0.018*"robot" + 0.015*"game" + 0.012*"instructions" + 0.011*"policy" + 0.009*"states" + 0.009*"virtual"
topic #40 (0.010): 0.081*"english" + 0.054*"languages" + 0.052*"translation" + 0.024*"parallel" + 0.022*"spanish" + 0.021*"bilingual" + 0.019*"corpora" + 0.019*"translations" + 0.014*"multilingual" + 0.012*"cross"
topic #63 (0.010): 0.056*"lexical" + 0.030*"lexicon" + 0.029*"dictionary" + 0.021*"concept" + 0.020

topic #40 (0.010): 0.085*"english" + 0.056*"translation" + 0.047*"languages" + 0.025*"bilingual" + 0.024*"parallel" + 0.020*"translations" + 0.019*"corpora" + 0.019*"spanish" + 0.013*"cross" + 0.013*"monolingual"
topic #31 (0.010): 0.270*"word" + 0.211*"words" + 0.009*"list" + 0.008*"context" + 0.007*"algorithm" + 0.007*"text" + 0.007*"unknown" + 0.007*"dictionary" + 0.006*"frequency" + 0.006*"vocabulary"
topic #90 (0.010): 0.049*"wordnet" + 0.034*"semantic" + 0.015*"lexical" + 0.011*"wikipedia" + 0.011*"resources" + 0.009*"synonyms" + 0.009*"synsets" + 0.009*"relations" + 0.009*"concepts" + 0.008*"hypernym"
topic #91 (0.010): 0.193*"model" + 0.082*"models" + 0.025*"gram" + 0.018*"training" + 0.014*"modeling" + 0.013*"word" + 0.010*"context" + 0.009*"wi" + 0.009*"lm" + 0.009*"bigram"
topic diff=0.106351, rho=0.195357
PROGRESS: pass 14, at document #16000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #93 (0.010): 0.102*"similarity" + 0.043*"pairs" + 0.0

topic #93 (0.010): 0.102*"similarity" + 0.042*"pairs" + 0.030*"measure" + 0.029*"distance" + 0.026*"measures" + 0.022*"pair" + 0.018*"similar" + 0.011*"method" + 0.009*"score" + 0.008*"sim"
topic #12 (0.010): 0.029*"logic" + 0.018*"logical" + 0.016*"type" + 0.015*"variables" + 0.014*"scope" + 0.013*"variable" + 0.011*"order" + 0.010*"form" + 0.010*"semantics" + 0.010*"st"
topic #84 (0.010): 0.058*"japanese" + 0.043*"method" + 0.017*"case" + 0.015*"proposed" + 0.014*"figure" + 0.011*"table" + 0.010*"japan" + 0.009*"jp" + 0.008*"corpus" + 0.008*"shows"
topic diff=0.096083, rho=0.191732
PROGRESS: pass 15, at document #6000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #90 (0.010): 0.047*"wordnet" + 0.035*"semantic" + 0.015*"lexical" + 0.013*"resources" + 0.012*"wikipedia" + 0.010*"synsets" + 0.009*"relations" + 0.008*"synset" + 0.008*"synonyms" + 0.008*"concepts"
topic #74 (0.010): 0.026*"learning" + 0.022*"model" + 0.021*"acquisition" + 0.018*"children" 

topic diff=0.067397, rho=0.191732
-8.081 per-word bound, 270.8 perplexity estimate based on a held-out corpus of 2000 documents with 4259383 words
PROGRESS: pass 15, at document #20000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #76 (0.010): 0.090*"sense" + 0.043*"senses" + 0.034*"word" + 0.034*"disambiguation" + 0.016*"wsd" + 0.016*"context" + 0.011*"target" + 0.010*"words" + 0.010*"corpus" + 0.009*"task"
topic #30 (0.010): 0.093*"annotation" + 0.030*"annotated" + 0.027*"annotations" + 0.017*"corpus" + 0.017*"xml" + 0.015*"scheme" + 0.013*"id" + 0.012*"type" + 0.011*"linguistic" + 0.009*"resources"
topic #17 (0.010): 0.189*"event" + 0.101*"events" + 0.030*"trigger" + 0.022*"story" + 0.013*"extraction" + 0.013*"theme" + 0.013*"type" + 0.012*"stories" + 0.011*"triggers" + 0.011*"types"
topic #92 (0.010): 0.067*"dialogue" + 0.023*"utterance" + 0.022*"utterances" + 0.018*"dialog" + 0.013*"spoken" + 0.012*"speech" + 0.012*"task" + 0.012*"dialogues" + 0.0

topic #35 (0.010): 0.036*"agreement" + 0.030*"annotators" + 0.019*"annotation" + 0.019*"annotator" + 0.011*"hindi" + 0.010*"annotations" + 0.008*"inter" + 0.008*"urdu" + 0.008*"task" + 0.008*"annotated"
topic #67 (0.010): 0.043*"text" + 0.020*"structure" + 0.010*"level" + 0.009*"knowledge" + 0.008*"generation" + 0.007*"content" + 0.007*"propositions" + 0.006*"process" + 0.006*"message" + 0.006*"texts"
topic #43 (0.010): 0.030*"german" + 0.029*"clause" + 0.020*"verb" + 0.017*"case" + 0.016*"order" + 0.015*"clauses" + 0.012*"subject" + 0.008*"syntactic" + 0.008*"structure" + 0.007*"sentence"
topic #10 (0.010): 0.096*"arabic" + 0.013*"morphological" + 0.011*"habash" + 0.009*"dialects" + 0.009*"languages" + 0.009*"da" + 0.009*"standard" + 0.008*"resources" + 0.008*"dialectal" + 0.008*"proceedings"
topic #86 (0.010): 0.084*"speech" + 0.041*"recognition" + 0.016*"spoken" + 0.014*"asr" + 0.010*"acoustic" + 0.009*"rate" + 0.009*"error" + 0.009*"training" + 0.008*"speaker" + 0.008*"performance"

topic #1 (0.010): 0.007*"news" + 0.006*"year" + 0.006*"time" + 0.006*"research" + 0.005*"people" + 0.005*"university" + 0.004*"government" + 0.004*"science" + 0.004*"political" + 0.004*"national"
topic #31 (0.010): 0.274*"word" + 0.218*"words" + 0.010*"list" + 0.009*"context" + 0.007*"algorithm" + 0.007*"frequency" + 0.007*"vocabulary" + 0.007*"dictionary" + 0.006*"text" + 0.006*"unknown"
topic #41 (0.010): 0.034*"translation" + 0.033*"french" + 0.019*"la" + 0.012*"mt" + 0.012*"le" + 0.011*"fr" + 0.010*"transfer" + 0.010*"machine" + 0.007*"des" + 0.007*"france"
topic #48 (0.010): 0.023*"plan" + 0.020*"agent" + 0.018*"speaker" + 0.015*"goal" + 0.013*"act" + 0.013*"model" + 0.013*"belief" + 0.011*"action" + 0.010*"agents" + 0.010*"goals"
topic diff=0.099949, rho=0.188303
PROGRESS: pass 17, at document #2000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #12 (0.010): 0.029*"logic" + 0.018*"logical" + 0.016*"type" + 0.015*"variables" + 0.013*"scope" + 0.013

topic #0 (0.010): 0.024*"czech" + 0.020*"languages" + 0.017*"lemma" + 0.013*"russian" + 0.013*"sr" + 0.012*"swedish" + 0.009*"prague" + 0.008*"case" + 0.007*"lemmas" + 0.006*"hungarian"
topic diff=0.098463, rho=0.185050
PROGRESS: pass 17, at document #16000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #0 (0.010): 0.029*"czech" + 0.018*"languages" + 0.017*"lemma" + 0.015*"russian" + 0.014*"sr" + 0.011*"swedish" + 0.009*"prague" + 0.008*"lemmas" + 0.008*"case" + 0.006*"hungarian"
topic #82 (0.010): 0.077*"pos" + 0.056*"tag" + 0.051*"tags" + 0.042*"tagging" + 0.028*"tagger" + 0.022*"speech" + 0.021*"corpus" + 0.016*"tagged" + 0.013*"chunk" + 0.012*"training"
topic #53 (0.010): 0.165*"discourse" + 0.042*"relations" + 0.031*"relation" + 0.025*"structure" + 0.024*"coherence" + 0.016*"rhetorical" + 0.014*"text" + 0.013*"connectives" + 0.011*"causal" + 0.010*"rst"
topic #44 (0.010): 0.012*"ll" + 0.010*"test" + 0.007*"frequency" + 0.007*"association" + 0.006*"

topic diff=0.089127, rho=0.181961
PROGRESS: pass 18, at document #6000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #33 (0.010): 0.101*"user" + 0.030*"users" + 0.011*"figure" + 0.009*"interaction" + 0.009*"interface" + 0.008*"time" + 0.007*"interactive" + 0.007*"systems" + 0.005*"response" + 0.004*"input"
topic #0 (0.010): 0.022*"czech" + 0.018*"languages" + 0.015*"lemma" + 0.012*"swedish" + 0.012*"russian" + 0.010*"sr" + 0.008*"prague" + 0.008*"case" + 0.007*"lemmas" + 0.006*"polish"
topic #32 (0.010): 0.032*"parsing" + 0.030*"grammar" + 0.018*"algorithm" + 0.015*"grammars" + 0.012*"chart" + 0.012*"parse" + 0.011*"non" + 0.010*"left" + 0.009*"derivation" + 0.009*"forest"
topic #37 (0.010): 0.045*"evaluation" + 0.022*"systems" + 0.021*"task" + 0.019*"human" + 0.019*"scores" + 0.017*"score" + 0.016*"test" + 0.014*"metrics" + 0.012*"performance" + 0.011*"quality"
topic #94 (0.010): 0.062*"corpus" + 0.051*"text" + 0.030*"texts" + 0.019*"corpora" + 0.008*

topic #83 (0.010): 0.271*"rules" + 0.212*"rule" + 0.015*"applied" + 0.014*"transformation" + 0.010*"hand" + 0.010*"application" + 0.009*"apply" + 0.009*"transformations" + 0.008*"context" + 0.006*"grammar"
topic #60 (0.010): 0.033*"probability" + 0.015*"probabilities" + 0.014*"distribution" + 0.009*"corpus" + 0.009*"statistical" + 0.008*"parameters" + 0.008*"probabilistic" + 0.008*"entropy" + 0.008*"estimate" + 0.008*"likelihood"
topic #59 (0.010): 0.046*"students" + 0.036*"student" + 0.019*"reading" + 0.016*"learning" + 0.013*"responses" + 0.012*"course" + 0.011*"level" + 0.011*"essay" + 0.010*"readability" + 0.010*"writing"
topic #26 (0.010): 0.117*"np" + 0.048*"category" + 0.041*"vp" + 0.034*"categories" + 0.017*"pp" + 0.013*"grammar" + 0.011*"structure" + 0.010*"cat" + 0.010*"phrase" + 0.010*"type"
topic diff=0.057251, rho=0.181961
PROGRESS: pass 18, at document #22000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #86 (0.010): 0.076*"speech" + 0.03

topic #51 (0.010): 0.074*"entity" + 0.046*"entities" + 0.034*"named" + 0.022*"names" + 0.018*"wikipedia" + 0.016*"person" + 0.016*"ne" + 0.014*"ner" + 0.011*"recognition" + 0.010*"extraction"
topic #98 (0.010): 0.070*"candidate" + 0.051*"score" + 0.042*"ranking" + 0.041*"best" + 0.039*"candidates" + 0.027*"rank" + 0.027*"list" + 0.022*"scores" + 0.015*"ranked" + 0.015*"method"
topic #88 (0.010): 0.103*"verb" + 0.073*"verbs" + 0.014*"classes" + 0.013*"object" + 0.012*"subject" + 0.010*"selectional" + 0.010*"semantic" + 0.010*"metaphor" + 0.009*"class" + 0.008*"syntactic"
topic diff=0.069392, rho=0.179022
PROGRESS: pass 19, at document #12000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #39 (0.010): 0.071*"chinese" + 0.016*"segmentation" + 0.016*"word" + 0.012*"table" + 0.011*"method" + 0.010*"character" + 0.009*"corpus" + 0.008*"performance" + 0.007*"korean" + 0.007*"china"
topic #61 (0.010): 0.027*"social" + 0.026*"tweets" + 0.023*"twitter" + 0.015*"m

topic #13 (0.010): 0.035*"speech" + 0.030*"prosodic" + 0.020*"cues" + 0.020*"cue" + 0.018*"pitch" + 0.014*"prosody" + 0.012*"speaker" + 0.012*"duration" + 0.012*"phrase" + 0.012*"pause"
topic #78 (0.010): 0.042*"patterns" + 0.040*"extraction" + 0.033*"precision" + 0.027*"pattern" + 0.024*"recall" + 0.022*"extracted" + 0.017*"corpus" + 0.013*"method" + 0.012*"extract" + 0.010*"seed"
topic diff=0.093851, rho=0.179022
PROGRESS: pass 20, at document #2000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #1 (0.010): 0.007*"news" + 0.006*"year" + 0.006*"time" + 0.006*"research" + 0.005*"people" + 0.005*"university" + 0.005*"government" + 0.004*"science" + 0.004*"political" + 0.004*"national"
topic #51 (0.010): 0.071*"entity" + 0.046*"entities" + 0.039*"named" + 0.026*"names" + 0.020*"ne" + 0.018*"person" + 0.017*"ner" + 0.016*"wikipedia" + 0.014*"recognition" + 0.010*"location"
topic #37 (0.010): 0.046*"evaluation" + 0.023*"task" + 0.023*"systems" + 0.019*"huma

topic #17 (0.010): 0.191*"event" + 0.100*"events" + 0.025*"trigger" + 0.023*"story" + 0.013*"stories" + 0.013*"extraction" + 0.012*"type" + 0.011*"detection" + 0.010*"theme" + 0.010*"types"
topic #60 (0.010): 0.033*"probability" + 0.015*"probabilities" + 0.014*"distribution" + 0.009*"corpus" + 0.009*"parameters" + 0.008*"statistical" + 0.008*"probabilistic" + 0.008*"estimate" + 0.008*"likelihood" + 0.008*"log"
topic #77 (0.010): 0.019*"vowel" + 0.017*"phonological" + 0.017*"phoneme" + 0.016*"syllable" + 0.015*"languages" + 0.014*"stress" + 0.012*"phonetic" + 0.010*"vowels" + 0.010*"consonant" + 0.010*"phonemes"
topic #56 (0.010): 0.027*"ii" + 0.023*"en" + 0.021*"es" + 0.020*"si" + 0.019*"ti" + 0.017*"el" + 0.017*"di" + 0.016*"er" + 0.014*"sp" + 0.013*"ct"
topic diff=0.077912, rho=0.176220
PROGRESS: pass 20, at document #18000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #71 (0.010): 0.062*"coreference" + 0.034*"mentions" + 0.033*"mention" + 0.027*"res

topic #76 (0.010): 0.094*"sense" + 0.044*"senses" + 0.037*"disambiguation" + 0.034*"word" + 0.018*"wsd" + 0.015*"context" + 0.010*"corpus" + 0.010*"target" + 0.010*"words" + 0.009*"ambiguous"
topic #18 (0.010): 0.034*"markers" + 0.031*"marker" + 0.027*"style" + 0.021*"hebrew" + 0.021*"coordination" + 0.018*"punctuation" + 0.017*"syntactic" + 0.015*"conjunction" + 0.015*"comparative" + 0.013*"conjunctions"
topic #77 (0.010): 0.017*"syllable" + 0.016*"vowel" + 0.016*"phonological" + 0.016*"languages" + 0.014*"phoneme" + 0.013*"stress" + 0.011*"phonetic" + 0.010*"dialect" + 0.010*"consonant" + 0.009*"vowels"
topic diff=0.087836, rho=0.173546
PROGRESS: pass 21, at document #8000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #28 (0.010): 0.008*"communication" + 0.007*"human" + 0.007*"sign" + 0.007*"people" + 0.007*"social" + 0.005*"mental" + 0.005*"life" + 0.005*"narrative" + 0.005*"depression" + 0.004*"person"
topic #21 (0.010): 0.021*"algorithm" + 0.017*"

topic #65 (0.010): 0.095*"features" + 0.054*"feature" + 0.024*"classification" + 0.021*"classifier" + 0.016*"training" + 0.014*"table" + 0.013*"accuracy" + 0.013*"task" + 0.012*"performance" + 0.011*"learning"
topic #56 (0.010): 0.033*"ii" + 0.022*"en" + 0.021*"es" + 0.021*"si" + 0.019*"ti" + 0.016*"di" + 0.016*"el" + 0.014*"er" + 0.014*"sp" + 0.013*"ct"
topic diff=0.053371, rho=0.173546
PROGRESS: pass 21, at document #22000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #69 (0.010): 0.102*"dependency" + 0.044*"parsing" + 0.022*"parser" + 0.015*"head" + 0.015*"dependencies" + 0.010*"nivre" + 0.010*"parsers" + 0.009*"projective" + 0.007*"non" + 0.007*"conll"
topic #35 (0.010): 0.039*"agreement" + 0.032*"annotators" + 0.021*"annotation" + 0.020*"annotator" + 0.011*"hindi" + 0.011*"annotations" + 0.009*"inter" + 0.008*"annotated" + 0.008*"task" + 0.007*"gold"
topic #68 (0.010): 0.046*"kernel" + 0.033*"svm" + 0.020*"kernels" + 0.014*"tree" + 0.013*"learning

topic diff=0.065245, rho=0.170990
PROGRESS: pass 22, at document #12000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #29 (0.010): 0.034*"vector" + 0.028*"vectors" + 0.026*"space" + 0.022*"matrix" + 0.021*"context" + 0.015*"distributional" + 0.014*"semantic" + 0.012*"models" + 0.010*"model" + 0.008*"lsa"
topic #57 (0.010): 0.131*"tree" + 0.061*"node" + 0.050*"trees" + 0.035*"nodes" + 0.017*"figure" + 0.016*"root" + 0.015*"structure" + 0.010*"tag" + 0.010*"structures" + 0.009*"derivation"
topic #58 (0.010): 0.040*"character" + 0.028*"characters" + 0.022*"transliteration" + 0.017*"english" + 0.017*"code" + 0.013*"letter" + 0.012*"names" + 0.009*"table" + 0.009*"dictionary" + 0.009*"string"
topic #38 (0.010): 0.155*"fl" + 0.141*"fi" + 0.103*"ff" + 0.069*"ffi" + 0.034*"citation" + 0.022*"papers" + 0.018*"scientific" + 0.013*"citations" + 0.008*"cited" + 0.007*"zone"
topic #69 (0.010): 0.100*"dependency" + 0.045*"parsing" + 0.021*"parser" + 0.015*"head" + 0

topic #63 (0.010): 0.079*"lexical" + 0.047*"lexicon" + 0.040*"dictionary" + 0.026*"entries" + 0.020*"entry" + 0.016*"dictionaries" + 0.012*"definition" + 0.010*"definitions" + 0.008*"lexicons" + 0.008*"lex"
topic #21 (0.010): 0.021*"algorithm" + 0.017*"function" + 0.011*"problem" + 0.009*"linear" + 0.009*"weight" + 0.008*"constraints" + 0.007*"algorithms" + 0.007*"weights" + 0.007*"xi" + 0.006*"optimization"
topic #20 (0.010): 0.030*"template" + 0.022*"slot" + 0.022*"templates" + 0.019*"muc" + 0.016*"text" + 0.014*"slots" + 0.008*"message" + 0.007*"systems" + 0.007*"type" + 0.007*"task"
topic #46 (0.010): 0.094*"graph" + 0.044*"nodes" + 0.036*"edges" + 0.033*"node" + 0.033*"edge" + 0.030*"path" + 0.025*"graphs" + 0.018*"figure" + 0.017*"algorithm" + 0.017*"links"
topic diff=0.054987, rho=0.168544
PROGRESS: pass 23, at document #4000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #92 (0.010): 0.067*"dialogue" + 0.023*"utterance" + 0.022*"utterances" + 0.

topic #59 (0.010): 0.038*"student" + 0.034*"students" + 0.020*"reading" + 0.017*"responses" + 0.017*"learning" + 0.015*"response" + 0.013*"readability" + 0.012*"essay" + 0.012*"tutor" + 0.011*"level"
topic #26 (0.010): 0.125*"np" + 0.046*"category" + 0.045*"vp" + 0.033*"categories" + 0.018*"pp" + 0.012*"grammar" + 0.011*"structure" + 0.010*"phrase" + 0.010*"type" + 0.009*"ccg"
topic diff=0.073560, rho=0.168544
PROGRESS: pass 23, at document #18000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #39 (0.010): 0.073*"chinese" + 0.017*"segmentation" + 0.015*"word" + 0.012*"table" + 0.011*"character" + 0.010*"method" + 0.009*"corpus" + 0.008*"performance" + 0.007*"korean" + 0.007*"processing"
topic #85 (0.010): 0.053*"knowledge" + 0.044*"concept" + 0.038*"concepts" + 0.022*"ontology" + 0.015*"domain" + 0.015*"base" + 0.012*"database" + 0.012*"conceptual" + 0.011*"medical" + 0.010*"terms"
topic #86 (0.010): 0.073*"speech" + 0.037*"recognition" + 0.013*"spoken"

topic #68 (0.010): 0.047*"kernel" + 0.029*"svm" + 0.020*"kernels" + 0.015*"tree" + 0.012*"learning" + 0.011*"vector" + 0.010*"support" + 0.008*"feature" + 0.008*"syntactic" + 0.008*"extraction"
topic diff=0.082788, rho=0.166200
PROGRESS: pass 24, at document #8000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #73 (0.010): 0.021*"tion" + 0.018*"ing" + 0.009*"pro" + 0.008*"ex" + 0.007*"com" + 0.007*"tions" + 0.006*"tile" + 0.005*"dis" + 0.004*"ap" + 0.004*"lan"
topic #33 (0.010): 0.104*"user" + 0.030*"users" + 0.011*"figure" + 0.009*"interface" + 0.009*"interaction" + 0.008*"time" + 0.008*"interactive" + 0.007*"systems" + 0.004*"feedback" + 0.004*"input"
topic #27 (0.010): 0.076*"temporal" + 0.057*"time" + 0.024*"tense" + 0.021*"event" + 0.016*"events" + 0.016*"expressions" + 0.014*"past" + 0.014*"aspect" + 0.009*"date" + 0.009*"interval"
topic #39 (0.010): 0.068*"chinese" + 0.015*"word" + 0.014*"segmentation" + 0.012*"table" + 0.011*"method" + 0.010*"ch

topic #88 (0.010): 0.101*"verb" + 0.071*"verbs" + 0.014*"object" + 0.013*"classes" + 0.013*"subject" + 0.012*"metaphor" + 0.009*"selectional" + 0.009*"semantic" + 0.008*"class" + 0.007*"syntactic"
topic #33 (0.010): 0.097*"user" + 0.028*"users" + 0.011*"figure" + 0.010*"interface" + 0.009*"interaction" + 0.008*"time" + 0.007*"systems" + 0.007*"interactive" + 0.004*"feedback" + 0.004*"response"
topic #6 (0.010): 0.256*"domain" + 0.128*"target" + 0.061*"source" + 0.057*"domains" + 0.045*"adaptation" + 0.027*"specific" + 0.010*"general" + 0.008*"transfer" + 0.006*"adapted" + 0.006*"cross"
topic #46 (0.010): 0.099*"graph" + 0.044*"nodes" + 0.036*"edges" + 0.035*"node" + 0.035*"edge" + 0.030*"path" + 0.025*"graphs" + 0.019*"figure" + 0.017*"algorithm" + 0.016*"paths"
topic diff=0.070381, rho=0.166200
-8.128 per-word bound, 279.8 perplexity estimate based on a held-out corpus of 405 documents with 841110 words
PROGRESS: pass 24, at document #22405/22405
merging changes from 405 documents int

topic #65 (0.010): 0.099*"features" + 0.054*"feature" + 0.024*"classification" + 0.021*"classifier" + 0.015*"training" + 0.014*"table" + 0.014*"accuracy" + 0.012*"performance" + 0.012*"task" + 0.011*"learning"
topic #51 (0.010): 0.075*"entity" + 0.046*"entities" + 0.035*"named" + 0.023*"names" + 0.019*"ne" + 0.017*"wikipedia" + 0.017*"person" + 0.016*"ner" + 0.011*"recognition" + 0.011*"extraction"
topic diff=0.050615, rho=0.163951
PROGRESS: pass 25, at document #14000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #25 (0.010): 0.023*"string" + 0.022*"state" + 0.018*"finite" + 0.013*"strings" + 0.009*"let" + 0.009*"context" + 0.009*"states" + 0.008*"symbol" + 0.008*"symbols" + 0.008*"languages"
topic #53 (0.010): 0.162*"discourse" + 0.042*"relations" + 0.032*"relation" + 0.024*"coherence" + 0.024*"structure" + 0.016*"rhetorical" + 0.014*"text" + 0.013*"connectives" + 0.013*"causal" + 0.010*"rst"
topic #51 (0.010): 0.075*"entity" + 0.046*"entities" + 0.0

topic #57 (0.010): 0.130*"tree" + 0.062*"node" + 0.050*"trees" + 0.036*"nodes" + 0.018*"figure" + 0.015*"root" + 0.015*"structure" + 0.011*"tag" + 0.010*"structures" + 0.008*"derivation"
topic diff=0.051492, rho=0.161791
PROGRESS: pass 26, at document #4000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #70 (0.010): 0.176*"semantic" + 0.049*"syntactic" + 0.037*"representation" + 0.029*"semantics" + 0.023*"meaning" + 0.016*"representations" + 0.016*"natural" + 0.012*"syntax" + 0.012*"parsing" + 0.011*"form"
topic #82 (0.010): 0.082*"pos" + 0.052*"tag" + 0.050*"tags" + 0.043*"tagging" + 0.028*"tagger" + 0.021*"speech" + 0.020*"corpus" + 0.015*"msa" + 0.015*"tagged" + 0.013*"chunk"
topic #24 (0.010): 0.037*"summarization" + 0.029*"summary" + 0.025*"summaries" + 0.021*"document" + 0.016*"sentences" + 0.016*"sentence" + 0.012*"rouge" + 0.012*"text" + 0.010*"compression" + 0.010*"content"
topic #44 (0.010): 0.011*"ll" + 0.009*"test" + 0.008*"association" + 0.

topic #68 (0.010): 0.046*"kernel" + 0.032*"svm" + 0.019*"kernels" + 0.014*"tree" + 0.013*"learning" + 0.011*"vector" + 0.010*"support" + 0.008*"svms" + 0.008*"feature" + 0.008*"syntactic"
topic #54 (0.010): 0.010*"theory" + 0.009*"interpretation" + 0.008*"case" + 0.008*"context" + 0.008*"john" + 0.007*"fact" + 0.007*"possible" + 0.007*"like" + 0.006*"way" + 0.006*"does"
topic #69 (0.010): 0.104*"dependency" + 0.044*"parsing" + 0.021*"parser" + 0.016*"head" + 0.014*"dependencies" + 0.010*"nivre" + 0.010*"parsers" + 0.010*"projective" + 0.008*"conll" + 0.007*"mcdonald"
topic #77 (0.010): 0.020*"vowel" + 0.018*"phoneme" + 0.018*"syllable" + 0.016*"phonological" + 0.015*"languages" + 0.014*"stress" + 0.012*"phonetic" + 0.011*"vowels" + 0.011*"consonant" + 0.010*"phonemes"
topic diff=0.052507, rho=0.161791
-8.076 per-word bound, 269.8 perplexity estimate based on a held-out corpus of 2000 documents with 4259383 words
PROGRESS: pass 26, at document #20000/22405
merging changes from 2000 docu

topic #78 (0.010): 0.041*"patterns" + 0.038*"extraction" + 0.033*"precision" + 0.027*"pattern" + 0.024*"recall" + 0.020*"extracted" + 0.017*"corpus" + 0.013*"method" + 0.011*"extract" + 0.010*"table"
topic #94 (0.010): 0.063*"corpus" + 0.053*"text" + 0.031*"texts" + 0.020*"corpora" + 0.008*"words" + 0.008*"articles" + 0.006*"analysis" + 0.005*"genre" + 0.005*"table" + 0.005*"frequency"
topic #12 (0.010): 0.027*"logic" + 0.021*"logical" + 0.014*"scope" + 0.013*"variables" + 0.012*"variable" + 0.012*"form" + 0.012*"semantics" + 0.011*"order" + 0.011*"formula" + 0.011*"type"
topic diff=0.046429, rho=0.159714
PROGRESS: pass 27, at document #10000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #85 (0.010): 0.051*"knowledge" + 0.047*"concept" + 0.041*"concepts" + 0.024*"ontology" + 0.014*"domain" + 0.013*"conceptual" + 0.013*"base" + 0.012*"medical" + 0.012*"terms" + 0.011*"database"
topic #77 (0.010): 0.018*"syllable" + 0.017*"phonological" + 0.017*"vowel" +

topic #80 (0.010): 0.067*"grammar" + 0.025*"feature" + 0.019*"structure" + 0.017*"grammars" + 0.016*"constraints" + 0.014*"unification" + 0.013*"structures" + 0.011*"lexical" + 0.010*"constraint" + 0.009*"head"
topic #90 (0.010): 0.050*"wordnet" + 0.038*"semantic" + 0.015*"lexical" + 0.012*"resources" + 0.011*"wikipedia" + 0.011*"synsets" + 0.010*"relations" + 0.009*"synset" + 0.009*"synonyms" + 0.008*"concepts"
topic diff=0.066948, rho=0.159714
-8.126 per-word bound, 279.4 perplexity estimate based on a held-out corpus of 405 documents with 841110 words
PROGRESS: pass 27, at document #22405/22405
merging changes from 405 documents into a model of 22405 documents
topic #87 (0.010): 0.051*"action" + 0.042*"state" + 0.039*"actions" + 0.021*"learning" + 0.017*"robot" + 0.016*"game" + 0.012*"policy" + 0.012*"instructions" + 0.012*"simulation" + 0.012*"states"
topic #31 (0.010): 0.286*"word" + 0.226*"words" + 0.012*"context" + 0.009*"list" + 0.007*"frequency" + 0.007*"vocabulary" + 0.006*"u

topic #32 (0.010): 0.031*"parsing" + 0.031*"grammar" + 0.018*"algorithm" + 0.016*"grammars" + 0.012*"chart" + 0.011*"parse" + 0.011*"non" + 0.011*"left" + 0.009*"forest" + 0.009*"derivation"
topic diff=0.048151, rho=0.157715
PROGRESS: pass 28, at document #14000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #97 (0.010): 0.032*"parser" + 0.028*"parsing" + 0.023*"parse" + 0.020*"treebank" + 0.011*"sentences" + 0.010*"syntactic" + 0.009*"nn" + 0.009*"penn" + 0.009*"parses" + 0.008*"accuracy"
topic #15 (0.010): 0.019*"object" + 0.012*"objects" + 0.006*"knowledge" + 0.005*"like" + 0.005*"world" + 0.005*"reference" + 0.005*"man" + 0.005*"description" + 0.005*"understanding" + 0.004*"point"
topic #51 (0.010): 0.075*"entity" + 0.046*"entities" + 0.036*"named" + 0.023*"names" + 0.020*"ne" + 0.017*"person" + 0.016*"wikipedia" + 0.016*"ner" + 0.012*"recognition" + 0.011*"extraction"
topic #37 (0.010): 0.048*"evaluation" + 0.025*"systems" + 0.023*"task" + 0.020*"h

PROGRESS: pass 29, at document #4000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #69 (0.010): 0.101*"dependency" + 0.047*"parsing" + 0.021*"parser" + 0.014*"head" + 0.014*"dependencies" + 0.011*"nivre" + 0.010*"parsers" + 0.009*"projective" + 0.008*"mcdonald" + 0.007*"pos"
topic #51 (0.010): 0.074*"entity" + 0.046*"entities" + 0.038*"named" + 0.023*"names" + 0.018*"ne" + 0.018*"wikipedia" + 0.017*"person" + 0.016*"ner" + 0.012*"recognition" + 0.011*"extraction"
topic #60 (0.010): 0.035*"probability" + 0.015*"probabilities" + 0.015*"distribution" + 0.009*"corpus" + 0.009*"parameters" + 0.009*"log" + 0.008*"probabilistic" + 0.008*"likelihood" + 0.008*"statistical" + 0.008*"estimate"
topic #81 (0.010): 0.040*"ion" + 0.023*"ing" + 0.014*"ly" + 0.013*"th" + 0.013*"le" + 0.012*"ic" + 0.012*"ions" + 0.010*"te" + 0.010*"fo" + 0.008*"ive"
topic #44 (0.010): 0.010*"ll" + 0.009*"test" + 0.008*"association" + 0.007*"associations" + 0.007*"cases" + 0.007*"large" 

topic #56 (0.010): 0.035*"ii" + 0.021*"ti" + 0.021*"si" + 0.019*"es" + 0.019*"en" + 0.016*"di" + 0.016*"el" + 0.015*"er" + 0.014*"sp" + 0.013*"ct"
topic #3 (0.010): 0.037*"multimodal" + 0.029*"map" + 0.024*"gesture" + 0.021*"cm" + 0.020*"modality" + 0.017*"modal" + 0.017*"stream" + 0.016*"gaze" + 0.016*"gestures" + 0.015*"input"
topic #21 (0.010): 0.021*"algorithm" + 0.017*"function" + 0.011*"problem" + 0.010*"constraints" + 0.009*"linear" + 0.008*"weight" + 0.007*"algorithms" + 0.007*"weights" + 0.007*"xi" + 0.007*"optimization"
topic #49 (0.010): 0.049*"image" + 0.034*"visual" + 0.026*"images" + 0.021*"scene" + 0.020*"video" + 0.020*"spatial" + 0.016*"objects" + 0.016*"object" + 0.015*"text" + 0.013*"region"
topic diff=0.050016, rho=0.155790
-8.075 per-word bound, 269.7 perplexity estimate based on a held-out corpus of 2000 documents with 4259383 words
PROGRESS: pass 29, at document #20000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #54 (0.010): 0.

topic #76 (0.010): 0.094*"sense" + 0.044*"senses" + 0.037*"disambiguation" + 0.034*"word" + 0.018*"wsd" + 0.015*"context" + 0.010*"target" + 0.010*"corpus" + 0.010*"words" + 0.009*"ambiguous"
topic #18 (0.010): 0.034*"markers" + 0.032*"marker" + 0.028*"style" + 0.023*"coordination" + 0.020*"syntactic" + 0.019*"hebrew" + 0.017*"punctuation" + 0.017*"clause" + 0.017*"conjunction" + 0.014*"comparative"
topic diff=0.044424, rho=0.153933
PROGRESS: pass 30, at document #10000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #59 (0.010): 0.036*"students" + 0.032*"student" + 0.021*"reading" + 0.017*"responses" + 0.017*"readability" + 0.016*"learning" + 0.014*"response" + 0.013*"essay" + 0.011*"level" + 0.011*"writing"
topic #36 (0.010): 0.129*"class" + 0.103*"type" + 0.074*"classes" + 0.051*"types" + 0.043*"value" + 0.039*"attributes" + 0.039*"attribute" + 0.033*"values" + 0.016*"instance" + 0.015*"instances"
topic #45 (0.010): 0.078*"generation" + 0.016*"generat

topic #54 (0.010): 0.009*"theory" + 0.009*"case" + 0.009*"interpretation" + 0.008*"context" + 0.008*"fact" + 0.007*"possible" + 0.007*"way" + 0.007*"like" + 0.007*"john" + 0.007*"does"
topic diff=0.064018, rho=0.153933
-8.124 per-word bound, 278.9 perplexity estimate based on a held-out corpus of 405 documents with 841110 words
PROGRESS: pass 30, at document #22405/22405
merging changes from 405 documents into a model of 22405 documents
topic #25 (0.010): 0.025*"string" + 0.024*"state" + 0.021*"finite" + 0.015*"strings" + 0.010*"let" + 0.009*"context" + 0.009*"states" + 0.008*"symbol" + 0.008*"regular" + 0.008*"symbols"
topic #96 (0.010): 0.206*"sentence" + 0.163*"sentences" + 0.019*"paraphrases" + 0.018*"paraphrase" + 0.012*"simplification" + 0.011*"text" + 0.010*"original" + 0.009*"simple" + 0.008*"paraphrasing" + 0.007*"syntactic"
topic #5 (0.010): 0.056*"pi" + 0.054*"dutch" + 0.049*"van" + 0.035*"nl" + 0.025*"ds" + 0.019*"ep" + 0.015*"lr" + 0.013*"daelemans" + 0.013*"memory" + 0.01

topic diff=0.046065, rho=0.152141
PROGRESS: pass 31, at document #14000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #84 (0.010): 0.058*"japanese" + 0.040*"method" + 0.021*"case" + 0.014*"proposed" + 0.013*"figure" + 0.011*"table" + 0.010*"japan" + 0.009*"ga" + 0.008*"analysis" + 0.008*"shows"
topic #87 (0.010): 0.051*"action" + 0.049*"state" + 0.036*"actions" + 0.021*"learning" + 0.017*"policy" + 0.016*"game" + 0.014*"robot" + 0.013*"states" + 0.011*"instructions" + 0.010*"simulation"
topic #85 (0.010): 0.051*"knowledge" + 0.047*"concept" + 0.041*"concepts" + 0.022*"ontology" + 0.014*"domain" + 0.014*"base" + 0.013*"conceptual" + 0.012*"medical" + 0.012*"terms" + 0.011*"database"
topic #92 (0.010): 0.068*"dialogue" + 0.022*"utterance" + 0.020*"utterances" + 0.020*"dialog" + 0.013*"spoken" + 0.012*"task" + 0.012*"dialogues" + 0.012*"speech" + 0.011*"turn" + 0.009*"act"
topic #71 (0.010): 0.056*"coreference" + 0.035*"mentions" + 0.034*"mention" + 0.027

merging changes from 2000 documents into a model of 22405 documents
topic #10 (0.010): 0.099*"arabic" + 0.013*"morphological" + 0.013*"habash" + 0.011*"dialects" + 0.010*"dialectal" + 0.009*"da" + 0.009*"standard" + 0.008*"resources" + 0.007*"proceedings" + 0.007*"languages"
topic #76 (0.010): 0.094*"sense" + 0.044*"senses" + 0.035*"disambiguation" + 0.034*"word" + 0.018*"wsd" + 0.015*"context" + 0.010*"corpus" + 0.010*"target" + 0.010*"words" + 0.009*"task"
topic #14 (0.010): 0.058*"training" + 0.051*"learning" + 0.020*"labeled" + 0.019*"supervised" + 0.016*"examples" + 0.014*"label" + 0.013*"performance" + 0.012*"unlabeled" + 0.012*"labels" + 0.012*"instances"
topic #63 (0.010): 0.089*"lexical" + 0.058*"lexicon" + 0.045*"dictionary" + 0.029*"entries" + 0.022*"entry" + 0.018*"dictionaries" + 0.012*"definition" + 0.011*"definitions" + 0.010*"lex" + 0.009*"lexicons"
topic #2 (0.010): 0.075*"segmentation" + 0.054*"segment" + 0.043*"segments" + 0.030*"units" + 0.030*"boundaries" + 0.027*"

topic #11 (0.010): 0.025*"resolution" + 0.023*"pronoun" + 0.021*"discourse" + 0.021*"anaphora" + 0.020*"antecedent" + 0.019*"pronouns" + 0.013*"reference" + 0.011*"definite" + 0.011*"anaphoric" + 0.009*"antecedents"
topic #43 (0.010): 0.032*"german" + 0.027*"clause" + 0.021*"verb" + 0.017*"case" + 0.016*"order" + 0.013*"clauses" + 0.013*"subject" + 0.008*"syntactic" + 0.008*"structure" + 0.007*"der"
topic #55 (0.010): 0.076*"error" + 0.072*"errors" + 0.030*"correction" + 0.021*"correct" + 0.013*"spelling" + 0.009*"edit" + 0.009*"detection" + 0.009*"incorrect" + 0.008*"english" + 0.007*"corrections"
topic #78 (0.010): 0.041*"patterns" + 0.036*"extraction" + 0.034*"precision" + 0.027*"pattern" + 0.025*"recall" + 0.020*"extracted" + 0.018*"corpus" + 0.012*"method" + 0.011*"extract" + 0.010*"table"
topic diff=0.047897, rho=0.150410
-8.074 per-word bound, 269.5 perplexity estimate based on a held-out corpus of 2000 documents with 4259383 words
PROGRESS: pass 32, at document #20000/22405
mer

topic #87 (0.010): 0.057*"action" + 0.044*"state" + 0.041*"actions" + 0.021*"learning" + 0.018*"game" + 0.012*"robot" + 0.012*"states" + 0.012*"policy" + 0.012*"instructions" + 0.011*"simulation"
topic #13 (0.010): 0.032*"speech" + 0.029*"prosodic" + 0.022*"cue" + 0.019*"pitch" + 0.019*"cues" + 0.014*"accent" + 0.014*"phrase" + 0.012*"prosody" + 0.012*"speaker" + 0.010*"tone"
topic diff=0.042738, rho=0.148737
PROGRESS: pass 33, at document #10000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #12 (0.010): 0.028*"logic" + 0.021*"logical" + 0.014*"scope" + 0.013*"variables" + 0.012*"semantics" + 0.012*"form" + 0.012*"variable" + 0.011*"order" + 0.011*"formula" + 0.010*"type"
topic #92 (0.010): 0.066*"dialogue" + 0.024*"utterance" + 0.023*"utterances" + 0.019*"dialog" + 0.013*"spoken" + 0.012*"dialogues" + 0.012*"speech" + 0.012*"task" + 0.010*"turn" + 0.010*"act"
topic #53 (0.010): 0.161*"discourse" + 0.045*"relations" + 0.031*"relation" + 0.022*"structur

topic diff=0.061470, rho=0.148737
-8.122 per-word bound, 278.6 perplexity estimate based on a held-out corpus of 405 documents with 841110 words
PROGRESS: pass 33, at document #22405/22405
merging changes from 405 documents into a model of 22405 documents
topic #83 (0.010): 0.270*"rules" + 0.209*"rule" + 0.015*"applied" + 0.014*"transformation" + 0.010*"hand" + 0.009*"grammar" + 0.009*"application" + 0.009*"apply" + 0.009*"transformations" + 0.008*"context"
topic #20 (0.010): 0.029*"template" + 0.022*"slot" + 0.021*"templates" + 0.019*"muc" + 0.016*"text" + 0.013*"slots" + 0.008*"message" + 0.007*"systems" + 0.007*"type" + 0.007*"task"
topic #75 (0.010): 0.029*"input" + 0.015*"time" + 0.015*"processing" + 0.012*"parser" + 0.012*"process" + 0.010*"output" + 0.009*"parsing" + 0.009*"case" + 0.007*"sentence" + 0.007*"current"
topic #1 (0.010): 0.006*"year" + 0.006*"time" + 0.006*"news" + 0.006*"research" + 0.005*"people" + 0.005*"university" + 0.005*"government" + 0.004*"science" + 0.004*

topic #0 (0.010): 0.024*"czech" + 0.022*"languages" + 0.017*"lemma" + 0.013*"russian" + 0.013*"sr" + 0.012*"swedish" + 0.009*"prague" + 0.007*"lemmas" + 0.007*"case" + 0.007*"hungarian"
topic #71 (0.010): 0.056*"coreference" + 0.035*"mentions" + 0.034*"mention" + 0.027*"resolution" + 0.013*"entity" + 0.011*"chain" + 0.011*"chains" + 0.010*"ng" + 0.009*"ace" + 0.009*"muc"
topic #40 (0.010): 0.087*"english" + 0.078*"translation" + 0.048*"languages" + 0.024*"bilingual" + 0.024*"parallel" + 0.022*"translations" + 0.020*"spanish" + 0.018*"corpora" + 0.014*"source" + 0.013*"cross"
topic #30 (0.010): 0.091*"annotation" + 0.031*"annotated" + 0.031*"annotations" + 0.016*"scheme" + 0.014*"corpus" + 0.013*"xml" + 0.011*"id" + 0.011*"linguistic" + 0.009*"resources" + 0.009*"tool"
topic diff=0.075010, rho=0.147118
PROGRESS: pass 34, at document #16000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #85 (0.010): 0.051*"knowledge" + 0.047*"concept" + 0.041*"concepts" +

topic #88 (0.010): 0.102*"verb" + 0.074*"verbs" + 0.013*"object" + 0.013*"subject" + 0.013*"classes" + 0.011*"metaphor" + 0.009*"selectional" + 0.009*"semantic" + 0.007*"class" + 0.007*"syntactic"
topic #37 (0.010): 0.050*"evaluation" + 0.027*"task" + 0.025*"systems" + 0.020*"human" + 0.016*"metrics" + 0.016*"test" + 0.016*"scores" + 0.013*"score" + 0.012*"metric" + 0.012*"quality"
topic #93 (0.010): 0.101*"similarity" + 0.042*"pairs" + 0.030*"measure" + 0.029*"distance" + 0.026*"measures" + 0.022*"pair" + 0.018*"similar" + 0.011*"method" + 0.008*"score" + 0.008*"methods"
topic diff=0.067932, rho=0.145552
PROGRESS: pass 35, at document #6000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #39 (0.010): 0.068*"chinese" + 0.014*"segmentation" + 0.014*"word" + 0.012*"table" + 0.011*"method" + 0.010*"character" + 0.009*"corpus" + 0.008*"korean" + 0.008*"performance" + 0.007*"proposed"
topic #51 (0.010): 0.075*"entity" + 0.046*"entities" + 0.038*"named" + 0.02

topic #90 (0.010): 0.052*"wordnet" + 0.037*"semantic" + 0.015*"lexical" + 0.012*"resources" + 0.011*"wikipedia" + 0.010*"synsets" + 0.010*"relations" + 0.010*"synset" + 0.009*"synonyms" + 0.008*"knowledge"
topic diff=0.046067, rho=0.145552
-8.074 per-word bound, 269.4 perplexity estimate based on a held-out corpus of 2000 documents with 4259383 words
PROGRESS: pass 35, at document #20000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #64 (0.010): 0.089*"frame" + 0.048*"frames" + 0.029*"semantic" + 0.025*"framenet" + 0.017*"roles" + 0.015*"role" + 0.011*"theme" + 0.009*"elements" + 0.009*"agent" + 0.008*"fillmore"
topic #47 (0.010): 0.039*"model" + 0.021*"models" + 0.019*"sequence" + 0.016*"crf" + 0.013*"conditional" + 0.012*"hmm" + 0.011*"training" + 0.010*"unsupervised" + 0.010*"state" + 0.010*"joint"
topic #88 (0.010): 0.104*"verb" + 0.071*"verbs" + 0.015*"object" + 0.014*"subject" + 0.013*"classes" + 0.010*"selectional" + 0.009*"metaphor" + 0.008*"se

topic diff=0.041275, rho=0.144034
PROGRESS: pass 36, at document #10000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #89 (0.010): 0.065*"emotion" + 0.051*"detection" + 0.038*"fragments" + 0.035*"fragment" + 0.024*"emotions" + 0.023*"repair" + 0.015*"emotional" + 0.015*"repairs" + 0.014*"thai" + 0.014*"fp"
topic #90 (0.010): 0.047*"wordnet" + 0.037*"semantic" + 0.014*"lexical" + 0.013*"wikipedia" + 0.012*"resources" + 0.010*"relations" + 0.010*"synsets" + 0.009*"synonyms" + 0.009*"synset" + 0.008*"hypernym"
topic #39 (0.010): 0.067*"chinese" + 0.014*"segmentation" + 0.014*"word" + 0.012*"table" + 0.011*"method" + 0.010*"character" + 0.008*"corpus" + 0.008*"performance" + 0.007*"korean" + 0.007*"proposed"
topic #58 (0.010): 0.041*"character" + 0.029*"characters" + 0.022*"transliteration" + 0.019*"code" + 0.018*"english" + 0.012*"letter" + 0.011*"names" + 0.010*"table" + 0.009*"string" + 0.009*"dictionary"
topic #98 (0.010): 0.059*"score" + 0.055*"candid

PROGRESS: pass 36, at document #22405/22405
merging changes from 405 documents into a model of 22405 documents
topic #26 (0.010): 0.130*"np" + 0.049*"category" + 0.046*"vp" + 0.035*"categories" + 0.020*"pp" + 0.011*"type" + 0.011*"grammar" + 0.011*"phrase" + 0.010*"structure" + 0.010*"verb"
topic #85 (0.010): 0.044*"knowledge" + 0.042*"concept" + 0.038*"concepts" + 0.030*"ontology" + 0.015*"terms" + 0.014*"domain" + 0.012*"medical" + 0.012*"conceptual" + 0.011*"base" + 0.011*"patient"
topic #25 (0.010): 0.025*"string" + 0.024*"state" + 0.021*"finite" + 0.015*"strings" + 0.010*"let" + 0.009*"states" + 0.009*"context" + 0.008*"symbol" + 0.008*"regular" + 0.008*"symbols"
topic #64 (0.010): 0.079*"frame" + 0.046*"frames" + 0.030*"framenet" + 0.028*"semantic" + 0.016*"roles" + 0.014*"role" + 0.010*"theme" + 0.009*"fillmore" + 0.009*"elements" + 0.008*"semantics"
topic #90 (0.010): 0.047*"wordnet" + 0.038*"semantic" + 0.015*"lexical" + 0.013*"resources" + 0.011*"wikipedia" + 0.011*"relations

topic #40 (0.010): 0.086*"english" + 0.081*"translation" + 0.048*"languages" + 0.024*"bilingual" + 0.024*"parallel" + 0.022*"translations" + 0.020*"spanish" + 0.017*"corpora" + 0.015*"source" + 0.013*"machine"
topic #35 (0.010): 0.043*"agreement" + 0.033*"annotators" + 0.023*"annotation" + 0.021*"annotator" + 0.011*"annotations" + 0.009*"inter" + 0.009*"annotated" + 0.009*"hindi" + 0.008*"kappa" + 0.007*"table"
topic #58 (0.010): 0.041*"character" + 0.028*"characters" + 0.020*"transliteration" + 0.019*"english" + 0.017*"code" + 0.014*"letter" + 0.012*"names" + 0.010*"table" + 0.009*"string" + 0.009*"dictionary"
topic diff=0.072475, rho=0.142563
PROGRESS: pass 37, at document #16000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #17 (0.010): 0.192*"event" + 0.101*"events" + 0.026*"trigger" + 0.022*"story" + 0.014*"extraction" + 0.013*"stories" + 0.011*"detection" + 0.011*"news" + 0.010*"type" + 0.010*"task"
topic #77 (0.010): 0.019*"vowel" + 0.017*"phone

topic #51 (0.010): 0.074*"entity" + 0.046*"entities" + 0.038*"named" + 0.023*"names" + 0.019*"ne" + 0.018*"wikipedia" + 0.017*"person" + 0.016*"ner" + 0.012*"recognition" + 0.011*"extraction"
topic #58 (0.010): 0.037*"character" + 0.025*"characters" + 0.022*"transliteration" + 0.020*"code" + 0.019*"english" + 0.012*"letter" + 0.011*"names" + 0.010*"table" + 0.008*"dictionary" + 0.008*"identification"
topic diff=0.065656, rho=0.141136
PROGRESS: pass 38, at document #6000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #21 (0.010): 0.021*"algorithm" + 0.017*"function" + 0.012*"problem" + 0.011*"constraints" + 0.009*"linear" + 0.008*"weight" + 0.007*"algorithms" + 0.007*"weights" + 0.007*"xi" + 0.006*"optimization"
topic #73 (0.010): 0.021*"tion" + 0.019*"ing" + 0.009*"pro" + 0.008*"ex" + 0.008*"com" + 0.007*"tions" + 0.006*"tile" + 0.005*"dis" + 0.005*"ap" + 0.004*"lan"
topic #27 (0.010): 0.075*"temporal" + 0.057*"time" + 0.024*"tense" + 0.021*"event" + 0.

topic diff=0.044330, rho=0.141136
-8.073 per-word bound, 269.3 perplexity estimate based on a held-out corpus of 2000 documents with 4259383 words
PROGRESS: pass 38, at document #20000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #71 (0.010): 0.062*"coreference" + 0.034*"mentions" + 0.033*"mention" + 0.028*"resolution" + 0.013*"entity" + 0.013*"chains" + 0.013*"chain" + 0.011*"ace" + 0.009*"ng" + 0.009*"muc"
topic #80 (0.010): 0.065*"grammar" + 0.025*"feature" + 0.019*"structure" + 0.016*"grammars" + 0.016*"constraints" + 0.014*"unification" + 0.014*"structures" + 0.010*"constraint" + 0.010*"lexical" + 0.009*"head"
topic #41 (0.010): 0.035*"french" + 0.022*"la" + 0.013*"le" + 0.010*"fr" + 0.009*"translation" + 0.008*"des" + 0.007*"france" + 0.007*"linguistic" + 0.006*"order" + 0.006*"du"
topic #79 (0.010): 0.091*"topic" + 0.036*"topics" + 0.034*"model" + 0.018*"distribution" + 0.017*"models" + 0.016*"lda" + 0.013*"latent" + 0.012*"document" + 0.010*"d

topic #78 (0.010): 0.040*"patterns" + 0.036*"extraction" + 0.033*"precision" + 0.026*"pattern" + 0.024*"recall" + 0.020*"extracted" + 0.018*"corpus" + 0.012*"method" + 0.011*"extract" + 0.010*"seed"
topic #40 (0.010): 0.082*"english" + 0.081*"translation" + 0.048*"languages" + 0.025*"bilingual" + 0.024*"parallel" + 0.022*"translations" + 0.019*"spanish" + 0.018*"corpora" + 0.015*"source" + 0.013*"machine"
topic #46 (0.010): 0.100*"graph" + 0.045*"nodes" + 0.037*"node" + 0.037*"edge" + 0.037*"edges" + 0.032*"path" + 0.024*"graphs" + 0.019*"figure" + 0.017*"algorithm" + 0.017*"paths"
topic #18 (0.010): 0.089*"clause" + 0.049*"clauses" + 0.025*"markers" + 0.024*"marker" + 0.023*"syntactic" + 0.018*"coordination" + 0.018*"style" + 0.015*"hebrew" + 0.014*"conjunction" + 0.012*"relative"
topic diff=0.051552, rho=0.139751
PROGRESS: pass 39, at document #12000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #81 (0.010): 0.041*"ion" + 0.024*"ing" + 0.014*"le" + 0

topic #55 (0.010): 0.069*"errors" + 0.068*"error" + 0.032*"correction" + 0.020*"correct" + 0.015*"spelling" + 0.010*"detection" + 0.008*"native" + 0.008*"edit" + 0.008*"incorrect" + 0.008*"english"
topic #37 (0.010): 0.050*"evaluation" + 0.030*"task" + 0.027*"systems" + 0.019*"human" + 0.017*"test" + 0.016*"scores" + 0.015*"metrics" + 0.013*"score" + 0.012*"metric" + 0.011*"quality"
topic #73 (0.010): 0.021*"tion" + 0.019*"ing" + 0.009*"pro" + 0.008*"ex" + 0.008*"com" + 0.007*"tions" + 0.006*"tile" + 0.005*"dis" + 0.005*"ap" + 0.005*"lan"
topic diff=0.071896, rho=0.139751
PROGRESS: pass 40, at document #2000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #12 (0.010): 0.028*"logic" + 0.021*"logical" + 0.014*"semantics" + 0.013*"scope" + 0.012*"variables" + 0.011*"variable" + 0.011*"form" + 0.011*"order" + 0.010*"type" + 0.009*"formula"
topic #7 (0.010): 0.099*"noun" + 0.050*"phrase" + 0.036*"phrases" + 0.034*"nouns" + 0.027*"verb" + 0.021*"head" + 0.019*

topic #22 (0.010): 0.050*"sentiment" + 0.029*"opinion" + 0.027*"negative" + 0.026*"positive" + 0.025*"polarity" + 0.018*"reviews" + 0.016*"analysis" + 0.011*"subjective" + 0.011*"classification" + 0.010*"review"
topic diff=0.070193, rho=0.138406
PROGRESS: pass 40, at document #16000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #46 (0.010): 0.099*"graph" + 0.047*"nodes" + 0.039*"node" + 0.037*"edge" + 0.036*"edges" + 0.033*"path" + 0.024*"graphs" + 0.019*"figure" + 0.017*"paths" + 0.017*"algorithm"
topic #12 (0.010): 0.026*"logic" + 0.022*"logical" + 0.013*"scope" + 0.013*"semantics" + 0.013*"variables" + 0.012*"form" + 0.012*"variable" + 0.011*"formula" + 0.010*"order" + 0.009*"predicate"
topic #64 (0.010): 0.093*"frame" + 0.050*"frames" + 0.029*"semantic" + 0.026*"framenet" + 0.018*"roles" + 0.016*"role" + 0.012*"theme" + 0.010*"agent" + 0.009*"elements" + 0.009*"fillmore"
topic #68 (0.010): 0.046*"kernel" + 0.033*"svm" + 0.020*"kernels" + 0.013*"tre

topic diff=0.063631, rho=0.137099
PROGRESS: pass 41, at document #6000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #95 (0.010): 0.066*"clustering" + 0.059*"cluster" + 0.053*"clusters" + 0.023*"algorithm" + 0.009*"unsupervised" + 0.008*"method" + 0.007*"hierarchical" + 0.007*"means" + 0.007*"algorithms" + 0.006*"evaluation"
topic #91 (0.010): 0.207*"model" + 0.086*"models" + 0.025*"gram" + 0.022*"training" + 0.014*"modeling" + 0.011*"context" + 0.010*"lm" + 0.009*"bigram" + 0.009*"probability" + 0.008*"wi"
topic #77 (0.010): 0.018*"syllable" + 0.017*"vowel" + 0.016*"phonological" + 0.016*"languages" + 0.015*"phoneme" + 0.014*"stress" + 0.012*"phonetic" + 0.011*"consonant" + 0.010*"dialect" + 0.010*"vowels"
topic #31 (0.010): 0.294*"word" + 0.229*"words" + 0.014*"context" + 0.008*"list" + 0.007*"frequency" + 0.007*"unknown" + 0.007*"vocabulary" + 0.006*"speech" + 0.006*"corpus" + 0.005*"text"
topic #64 (0.010): 0.083*"frame" + 0.047*"frames" + 0.029*"f

topic #39 (0.010): 0.071*"chinese" + 0.016*"segmentation" + 0.013*"word" + 0.012*"table" + 0.010*"character" + 0.010*"method" + 0.009*"corpus" + 0.008*"performance" + 0.007*"korean" + 0.007*"processing"
topic #29 (0.010): 0.035*"vector" + 0.028*"vectors" + 0.027*"space" + 0.023*"matrix" + 0.021*"context" + 0.014*"distributional" + 0.014*"semantic" + 0.012*"models" + 0.010*"model" + 0.009*"lsa"
topic #24 (0.010): 0.037*"summarization" + 0.031*"summary" + 0.027*"summaries" + 0.021*"document" + 0.016*"sentences" + 0.015*"sentence" + 0.012*"text" + 0.011*"content" + 0.010*"rouge" + 0.010*"compression"
topic #93 (0.010): 0.099*"similarity" + 0.042*"pairs" + 0.032*"distance" + 0.031*"measure" + 0.025*"measures" + 0.022*"pair" + 0.019*"similar" + 0.011*"method" + 0.008*"methods" + 0.008*"score"
topic diff=0.039446, rho=0.137099
PROGRESS: pass 41, at document #22000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #38 (0.010): 0.122*"fi" + 0.115*"fl" + 0.108*"ff"

topic #98 (0.010): 0.059*"score" + 0.053*"candidate" + 0.042*"best" + 0.033*"ranking" + 0.029*"candidates" + 0.028*"scores" + 0.022*"list" + 0.021*"rank" + 0.018*"method" + 0.012*"confidence"
topic #19 (0.010): 0.088*"alignment" + 0.034*"word" + 0.029*"alignments" + 0.024*"aligned" + 0.017*"pairs" + 0.013*"pair" + 0.013*"target" + 0.013*"source" + 0.012*"model" + 0.011*"sentence"
topic #71 (0.010): 0.056*"coreference" + 0.036*"mentions" + 0.036*"mention" + 0.028*"resolution" + 0.014*"entity" + 0.012*"ace" + 0.012*"chain" + 0.011*"chains" + 0.009*"ng" + 0.008*"model"
topic diff=0.050041, rho=0.135828
PROGRESS: pass 42, at document #12000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #76 (0.010): 0.091*"sense" + 0.043*"senses" + 0.036*"disambiguation" + 0.033*"word" + 0.018*"wsd" + 0.015*"context" + 0.010*"corpus" + 0.010*"target" + 0.010*"words" + 0.009*"ambiguous"
topic #12 (0.010): 0.028*"logic" + 0.021*"logical" + 0.015*"scope" + 0.014*"semantics" + 

topic #72 (0.010): 0.060*"translation" + 0.023*"phrase" + 0.020*"machine" + 0.017*"bleu" + 0.015*"source" + 0.014*"statistical" + 0.011*"smt" + 0.010*"english" + 0.010*"training" + 0.010*"reordering"
topic #58 (0.010): 0.037*"character" + 0.025*"characters" + 0.022*"code" + 0.019*"english" + 0.019*"transliteration" + 0.011*"names" + 0.011*"letter" + 0.010*"table" + 0.009*"switching" + 0.009*"identification"
topic diff=0.069929, rho=0.135828
PROGRESS: pass 43, at document #2000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #65 (0.010): 0.094*"features" + 0.052*"feature" + 0.023*"classification" + 0.021*"classifier" + 0.016*"training" + 0.014*"table" + 0.014*"accuracy" + 0.013*"task" + 0.012*"performance" + 0.011*"learning"
topic #40 (0.010): 0.084*"english" + 0.080*"translation" + 0.051*"languages" + 0.024*"parallel" + 0.023*"spanish" + 0.022*"translations" + 0.022*"bilingual" + 0.018*"corpora" + 0.015*"source" + 0.014*"machine"
topic #89 (0.010): 0.066

topic diff=0.068168, rho=0.134592
PROGRESS: pass 43, at document #16000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #96 (0.010): 0.230*"sentence" + 0.170*"sentences" + 0.020*"paraphrase" + 0.019*"paraphrases" + 0.011*"text" + 0.009*"original" + 0.008*"paraphrasing" + 0.008*"simple" + 0.008*"phrases" + 0.007*"simplification"
topic #99 (0.010): 0.021*"biomedical" + 0.017*"scope" + 0.016*"negation" + 0.015*"protein" + 0.015*"task" + 0.012*"cue" + 0.012*"gene" + 0.010*"abstracts" + 0.010*"clinical" + 0.009*"medical"
topic #51 (0.010): 0.076*"entity" + 0.046*"entities" + 0.036*"named" + 0.025*"names" + 0.020*"ne" + 0.018*"person" + 0.016*"wikipedia" + 0.015*"ner" + 0.012*"recognition" + 0.011*"extraction"
topic #70 (0.010): 0.173*"semantic" + 0.055*"syntactic" + 0.037*"representation" + 0.028*"semantics" + 0.025*"meaning" + 0.017*"natural" + 0.016*"representations" + 0.014*"syntax" + 0.011*"parsing" + 0.011*"form"
topic #79 (0.010): 0.086*"topic" + 0.034*

topic #13 (0.010): 0.033*"speech" + 0.030*"prosodic" + 0.021*"cue" + 0.020*"pitch" + 0.019*"cues" + 0.014*"accent" + 0.014*"phrase" + 0.013*"speaker" + 0.013*"prosody" + 0.010*"duration"
topic #69 (0.010): 0.104*"dependency" + 0.047*"parsing" + 0.021*"parser" + 0.015*"head" + 0.015*"dependencies" + 0.011*"nivre" + 0.010*"parsers" + 0.009*"projective" + 0.008*"mcdonald" + 0.008*"pos"
topic #83 (0.010): 0.269*"rules" + 0.212*"rule" + 0.015*"applied" + 0.014*"transformation" + 0.010*"hand" + 0.010*"grammar" + 0.009*"application" + 0.009*"transformations" + 0.009*"apply" + 0.008*"context"
topic #33 (0.010): 0.101*"user" + 0.030*"users" + 0.012*"figure" + 0.010*"interface" + 0.009*"interaction" + 0.008*"time" + 0.008*"interactive" + 0.007*"systems" + 0.005*"input" + 0.004*"feedback"
topic #30 (0.010): 0.086*"annotation" + 0.028*"annotations" + 0.028*"annotated" + 0.014*"xml" + 0.014*"scheme" + 0.013*"corpus" + 0.011*"id" + 0.011*"linguistic" + 0.010*"resources" + 0.009*"tools"
topic diff=0.

topic #19 (0.010): 0.087*"alignment" + 0.033*"word" + 0.028*"alignments" + 0.024*"aligned" + 0.017*"pairs" + 0.013*"pair" + 0.012*"source" + 0.012*"target" + 0.012*"model" + 0.011*"sentence"
topic #10 (0.010): 0.103*"arabic" + 0.014*"morphological" + 0.011*"habash" + 0.009*"basque" + 0.009*"standard" + 0.008*"dialects" + 0.008*"resources" + 0.008*"tokenization" + 0.008*"languages" + 0.007*"dialectal"
topic #59 (0.010): 0.044*"students" + 0.036*"student" + 0.020*"reading" + 0.017*"responses" + 0.016*"learning" + 0.015*"response" + 0.012*"course" + 0.011*"readability" + 0.011*"level" + 0.011*"essay"
topic diff=0.038202, rho=0.133390
PROGRESS: pass 44, at document #22000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #92 (0.010): 0.067*"dialogue" + 0.022*"utterance" + 0.021*"utterances" + 0.019*"dialog" + 0.014*"spoken" + 0.012*"task" + 0.012*"speech" + 0.011*"dialogues" + 0.010*"turn" + 0.009*"act"
topic #7 (0.010): 0.100*"noun" + 0.054*"phrase" + 0.039*"

topic #10 (0.010): 0.101*"arabic" + 0.013*"morphological" + 0.012*"habash" + 0.010*"dialects" + 0.009*"standard" + 0.009*"da" + 0.009*"dialectal" + 0.008*"resources" + 0.007*"languages" + 0.007*"basque"
topic diff=0.048647, rho=0.132219
PROGRESS: pass 45, at document #12000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #56 (0.010): 0.030*"ii" + 0.023*"ti" + 0.022*"si" + 0.018*"es" + 0.017*"el" + 0.016*"di" + 0.016*"en" + 0.016*"er" + 0.014*"ct" + 0.014*"sp"
topic #29 (0.010): 0.034*"vector" + 0.028*"vectors" + 0.026*"space" + 0.022*"matrix" + 0.020*"context" + 0.015*"distributional" + 0.014*"semantic" + 0.012*"models" + 0.010*"model" + 0.008*"lsa"
topic #45 (0.010): 0.072*"generation" + 0.015*"generated" + 0.013*"referring" + 0.011*"expressions" + 0.011*"generating" + 0.011*"natural" + 0.010*"generator" + 0.010*"generate" + 0.010*"nlg" + 0.009*"algorithm"
topic #18 (0.010): 0.100*"clause" + 0.057*"clauses" + 0.026*"syntactic" + 0.024*"marker" + 0.023*"

topic diff=0.068113, rho=0.132219
PROGRESS: pass 46, at document #2000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #27 (0.010): 0.076*"temporal" + 0.056*"time" + 0.024*"tense" + 0.021*"event" + 0.018*"expressions" + 0.016*"events" + 0.014*"past" + 0.011*"aspect" + 0.010*"date" + 0.009*"interval"
topic #84 (0.010): 0.060*"japanese" + 0.040*"method" + 0.019*"case" + 0.014*"proposed" + 0.013*"figure" + 0.011*"table" + 0.010*"japan" + 0.008*"jp" + 0.008*"ga" + 0.008*"shows"
topic #76 (0.010): 0.093*"sense" + 0.043*"senses" + 0.034*"disambiguation" + 0.033*"word" + 0.018*"wsd" + 0.015*"context" + 0.011*"corpus" + 0.011*"target" + 0.009*"words" + 0.009*"task"
topic #7 (0.010): 0.100*"noun" + 0.050*"phrase" + 0.037*"phrases" + 0.034*"nouns" + 0.026*"verb" + 0.021*"head" + 0.019*"compound" + 0.016*"adjectives" + 0.016*"adjective" + 0.015*"compounds"
topic #73 (0.010): 0.021*"tion" + 0.019*"ing" + 0.009*"pro" + 0.008*"ex" + 0.008*"com" + 0.008*"tions" + 0.006

topic #74 (0.010): 0.029*"learning" + 0.025*"model" + 0.021*"acquisition" + 0.017*"memory" + 0.017*"children" + 0.016*"cognitive" + 0.015*"processing" + 0.015*"child" + 0.013*"learner" + 0.011*"linguistic"
topic #0 (0.010): 0.027*"czech" + 0.022*"languages" + 0.017*"lemma" + 0.014*"russian" + 0.013*"sr" + 0.011*"swedish" + 0.009*"prague" + 0.008*"lemmas" + 0.007*"case" + 0.007*"hungarian"
topic #78 (0.010): 0.040*"patterns" + 0.035*"extraction" + 0.034*"precision" + 0.026*"pattern" + 0.026*"recall" + 0.020*"extracted" + 0.018*"corpus" + 0.012*"method" + 0.011*"extract" + 0.010*"table"
topic diff=0.055090, rho=0.131078
PROGRESS: pass 46, at document #18000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #11 (0.010): 0.025*"resolution" + 0.024*"pronoun" + 0.021*"discourse" + 0.021*"anaphora" + 0.020*"antecedent" + 0.020*"pronouns" + 0.013*"reference" + 0.011*"definite" + 0.011*"anaphoric" + 0.009*"antecedents"
topic #68 (0.010): 0.045*"kernel" + 0.034*"svm

topic #65 (0.010): 0.098*"features" + 0.053*"feature" + 0.024*"classification" + 0.021*"classifier" + 0.015*"training" + 0.014*"table" + 0.014*"accuracy" + 0.013*"task" + 0.012*"performance" + 0.011*"learning"
topic #95 (0.010): 0.066*"clustering" + 0.059*"cluster" + 0.053*"clusters" + 0.023*"algorithm" + 0.009*"unsupervised" + 0.008*"method" + 0.007*"hierarchical" + 0.007*"means" + 0.007*"algorithms" + 0.006*"evaluation"
topic diff=0.061089, rho=0.129966
PROGRESS: pass 47, at document #8000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #19 (0.010): 0.089*"alignment" + 0.034*"word" + 0.029*"alignments" + 0.024*"aligned" + 0.017*"pairs" + 0.013*"pair" + 0.013*"source" + 0.012*"target" + 0.012*"model" + 0.011*"sentence"
topic #48 (0.010): 0.024*"plan" + 0.020*"agent" + 0.019*"speaker" + 0.015*"goal" + 0.013*"act" + 0.013*"model" + 0.011*"belief" + 0.011*"action" + 0.010*"discourse" + 0.010*"goals"
topic #80 (0.010): 0.059*"grammar" + 0.026*"feature" + 0.

topic diff=0.037057, rho=0.129966
PROGRESS: pass 47, at document #22000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #67 (0.010): 0.041*"text" + 0.021*"structure" + 0.010*"level" + 0.010*"knowledge" + 0.008*"content" + 0.008*"generation" + 0.006*"process" + 0.006*"propositions" + 0.006*"figure" + 0.006*"message"
topic #21 (0.010): 0.021*"algorithm" + 0.018*"function" + 0.012*"problem" + 0.011*"constraints" + 0.009*"linear" + 0.008*"weight" + 0.007*"algorithms" + 0.007*"weights" + 0.007*"xi" + 0.007*"optimization"
topic #24 (0.010): 0.037*"summarization" + 0.031*"summary" + 0.028*"summaries" + 0.021*"document" + 0.015*"sentences" + 0.015*"sentence" + 0.012*"text" + 0.011*"rouge" + 0.010*"content" + 0.009*"compression"
topic #80 (0.010): 0.064*"grammar" + 0.025*"feature" + 0.019*"structure" + 0.016*"grammars" + 0.016*"constraints" + 0.014*"unification" + 0.014*"structures" + 0.010*"constraint" + 0.010*"lexical" + 0.009*"head"
topic #20 (0.010): 0.028*"t

topic #47 (0.010): 0.040*"model" + 0.020*"models" + 0.017*"sequence" + 0.014*"crf" + 0.011*"conditional" + 0.011*"joint" + 0.010*"training" + 0.010*"unsupervised" + 0.010*"hmm" + 0.010*"features"
topic #1 (0.010): 0.006*"year" + 0.006*"news" + 0.006*"time" + 0.006*"people" + 0.006*"research" + 0.005*"university" + 0.005*"government" + 0.004*"science" + 0.004*"national" + 0.004*"report"
topic #19 (0.010): 0.085*"alignment" + 0.034*"word" + 0.029*"alignments" + 0.023*"aligned" + 0.017*"pairs" + 0.013*"pair" + 0.012*"target" + 0.012*"source" + 0.012*"model" + 0.011*"sentence"
topic #78 (0.010): 0.039*"patterns" + 0.036*"extraction" + 0.034*"precision" + 0.025*"recall" + 0.025*"pattern" + 0.020*"extracted" + 0.018*"corpus" + 0.012*"method" + 0.011*"extract" + 0.010*"table"
topic diff=0.038122, rho=0.128882
PROGRESS: pass 48, at document #14000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #2 (0.010): 0.069*"segmentation" + 0.053*"segment" + 0.039*"segments

topic #82 (0.010): 0.077*"pos" + 0.051*"tag" + 0.048*"tags" + 0.040*"tagging" + 0.027*"tagger" + 0.021*"speech" + 0.020*"corpus" + 0.016*"tagged" + 0.015*"msa" + 0.013*"chunk"
topic #71 (0.010): 0.061*"coreference" + 0.036*"mentions" + 0.034*"mention" + 0.028*"resolution" + 0.013*"entity" + 0.013*"chain" + 0.013*"chains" + 0.011*"ace" + 0.009*"muc" + 0.009*"ng"
topic #73 (0.010): 0.021*"tion" + 0.019*"ing" + 0.009*"pro" + 0.008*"ex" + 0.008*"com" + 0.008*"tions" + 0.006*"tile" + 0.005*"dis" + 0.005*"ap" + 0.005*"lan"
topic diff=0.036856, rho=0.127825
PROGRESS: pass 49, at document #4000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #14 (0.010): 0.058*"training" + 0.051*"learning" + 0.020*"labeled" + 0.019*"supervised" + 0.016*"examples" + 0.014*"label" + 0.013*"performance" + 0.012*"unlabeled" + 0.012*"labels" + 0.011*"instances"
topic #8 (0.010): 0.056*"network" + 0.034*"neural" + 0.027*"networks" + 0.022*"layer" + 0.020*"representations" + 0.019*"emb

topic diff=0.053600, rho=0.127825
PROGRESS: pass 49, at document #18000/22405
merging changes from 2000 documents into a model of 22405 documents
topic #99 (0.010): 0.022*"biomedical" + 0.019*"protein" + 0.015*"gene" + 0.015*"scope" + 0.013*"negation" + 0.013*"task" + 0.011*"abstracts" + 0.011*"clinical" + 0.010*"cue" + 0.009*"medical"
topic #19 (0.010): 0.086*"alignment" + 0.034*"word" + 0.029*"alignments" + 0.024*"aligned" + 0.017*"pairs" + 0.013*"pair" + 0.012*"source" + 0.012*"target" + 0.012*"model" + 0.011*"sentence"
topic #55 (0.010): 0.076*"error" + 0.073*"errors" + 0.030*"correction" + 0.021*"correct" + 0.013*"spelling" + 0.009*"edit" + 0.009*"detection" + 0.009*"incorrect" + 0.008*"english" + 0.007*"corrections"
topic #85 (0.010): 0.051*"knowledge" + 0.047*"concept" + 0.041*"concepts" + 0.024*"ontology" + 0.015*"terms" + 0.014*"domain" + 0.014*"conceptual" + 0.013*"base" + 0.011*"medical" + 0.009*"database"
topic #51 (0.010): 0.076*"entity" + 0.046*"entities" + 0.038*"named" 

CPU times: user 9h 33min 12s, sys: 5min 21s, total: 9h 38min 33s
Wall time: 5h 10min 34s
