Skip to content

Commit

Permalink
Improve logging, tidy NMF
Browse files Browse the repository at this point in the history
  • Loading branch information
derekgreene committed Aug 26, 2014
1 parent a4db52f commit f2697a5
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 37 deletions.
11 changes: 9 additions & 2 deletions display-topics.py
@@ -1,4 +1,9 @@
#!/usr/bin/env python
"""
Simple tool to display term rankings generated by NMF/LDA/SKM, stored in one or more PKL
files.
"""
import logging as log
from optparse import OptionParser
import unsupervised.rankings, unsupervised.util

Expand All @@ -7,16 +12,18 @@
def main():
parser = OptionParser(usage="usage: %prog [options] ranking_file1 ranking_file2 ...")
parser.add_option("-t", "--top", action="store", type="int", dest="top", help="number of top terms to show", default=10)
parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
(options, args) = parser.parse_args()
if( len(args) < 1 ):
parser.error( "Must specify at least one ranking set file" )
log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)

# Load each cached ranking set
for in_path in args:
print "Loading terms from %s ..." % in_path
log.info( "Loading terms from %s ..." % in_path )
(term_rankings,labels) = unsupervised.util.load_term_rankings( in_path )
m = unsupervised.rankings.term_rankings_size( term_rankings )
print "Set has %d rankings covering up to %d terms" % ( len(term_rankings), m )
log.info( "Set has %d rankings covering up to %d terms" % ( len(term_rankings), m ) )
print unsupervised.rankings.format_term_rankings( term_rankings, labels, min(options.top,m) )

# --------------------------------------------------------------
Expand Down
23 changes: 13 additions & 10 deletions generate-nmf.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
import os, sys, random
import logging as log
from optparse import OptionParser
import numpy as np
import text.util, unsupervised.nmf, unsupervised.rankings, unsupervised.util
Expand All @@ -16,9 +17,11 @@ def main():
parser.add_option("-s", "--sample", action="store", type="float", dest="sample_ratio", help="sampling ratio of documents to include in each run (range is 0 to 1)", default=0.8)
parser.add_option("-o","--outdir", action="store", type="string", dest="dir_out", help="base output directory (default is current directory)", default=None)
parser.add_option("-w","--writefactors", action="store_true", dest="write_factors", help="write complete factorization results")
parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
(options, args) = parser.parse_args()
if len(args) < 1:
parser.error( "Must specify at least one corpus file" )
log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)
# use nimfa instead of sklearn?
use_nimfa = True

Expand All @@ -40,22 +43,22 @@ def main():
n_documents = X.shape[0]
n_sample = int( options.sample_ratio * n_documents )
indices = np.arange(n_documents)
print "* Sampling ratio = %.2f - %d/%d documents per run" % ( options.sample_ratio, n_sample, n_documents )

# Generate all NMF topic models for the specified numbers of topics
print "* Running experiments in range k=[%d,%d] max_iters=%d" % ( options.kmin, options.kmax, options.maxiter)
log.info( "Testing models in range k=[%d,%d]" % ( options.kmin, options.kmax ) )
log.info( "Sampling ratio = %.2f - %d/%d documents per run" % ( options.sample_ratio, n_sample, n_documents ) )
for k in range(options.kmin, options.kmax+1):
# Set random state
np.random.seed( options.seed )
random.seed( options.seed )
print "* Applying NMF k=%d runs=%d (%s) ..." % ( k, options.runs, impl.__class__.__name__ )
log.info( "Applying NMF (k=%d, runs=%d, seed=%s - %s) ..." % ( k, options.runs, options.seed, impl.__class__.__name__ ) )
dir_out_k = os.path.join( dir_out_base, "nmf_k%02d" % k )
if not os.path.exists(dir_out_k):
os.makedirs(dir_out_k)
print "Results will be written to %s" % ( dir_out_k )
log.debug( "Results will be written to %s" % dir_out_k )
# Run NMF
for r in range(options.runs):
print "Run %d/%d (seed=%s)" % (r+1, options.runs, options.seed )
log.info( "NMF run %d/%d (k=%d, max_iters=%d)" % (r+1, options.runs, k, options.maxiter ) )
file_suffix = "%s_%03d" % ( options.seed, r+1 )
# sub-sample data
np.random.shuffle(indices)
Expand All @@ -72,23 +75,23 @@ def main():
ranked_term_indices = impl.rank_terms( topic_index )
term_ranking = [terms[i] for i in ranked_term_indices]
term_rankings.append(term_ranking)
print "Generated ranking set with %d topics covering up to %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) )
log.debug( "Generated ranking set with %d topics covering up to %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) ) )
# Write term rankings
ranks_out_path = os.path.join( dir_out_k, "ranks_%s.pkl" % file_suffix )
print "Writing term ranking set to %s" % ranks_out_path
log.debug( "Writing term ranking set to %s" % ranks_out_path )
unsupervised.util.save_term_rankings( ranks_out_path, term_rankings )
# Write document partition
partition = impl.generate_partition()
partition_out_path = os.path.join( dir_out_k, "partition_%s.pkl" % file_suffix )
print "Writing document partition to %s" % partition_out_path
log.debug( "Writing document partition to %s" % partition_out_path )
unsupervised.util.save_partition( partition_out_path, partition, sample_doc_ids )
# Write the complete factorization?
if options.write_factors:
factor_out_path = os.path.join( dir_out_k, "factors_%s.pkl" % file_suffix )
# NB: need to make a copy of the factors
print "Writing factorization to %s" % factor_out_path
log.debug( "Writing factorization to %s" % factor_out_path )
unsupervised.util.save_nmf_factors( factor_out_path, np.array( impl.W ), np.array( impl.H ), sample_doc_ids )
print "* Done"


# --------------------------------------------------------------

Expand Down
22 changes: 14 additions & 8 deletions parse-text.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
import os, os.path, sys, codecs, re, unicodedata
import logging as log
from optparse import OptionParser
import text.util

Expand All @@ -9,7 +10,7 @@ def find_documents( root_path ):
"""
Find all files in the specified directory and its subdirectories, and store them as strings in a list.
"""
print "Searching %s for documents ..." % root_path
log.info( "Searching %s for documents ..." % root_path )
filepaths = []
for dir_path, subFolders, files in os.walk(root_path):
for filename in files:
Expand Down Expand Up @@ -48,9 +49,11 @@ def main():
parser.add_option("--norm", action="store_true", dest="apply_norm", help="apply unit length normalization to the document-term matrix")
parser.add_option("--minlen", action="store", type="int", dest="min_doc_length", help="minimum document length (in characters)", default=50)
parser.add_option("-s", action="store", type="string", dest="stoplist_file", help="custom stopword file path", default=None)
parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
(options, args) = parser.parse_args()
if( len(args) < 1 ):
parser.error( "Must specify at least one directory" )
log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)

# Find all relevant files in directories specified by user
filepaths = []
Expand All @@ -63,10 +66,10 @@ def main():
if filename.startswith(".") or filename.startswith("_"):
continue
filepaths.append( in_path )
print "Found %d documents to parse" % len(filepaths)
log.info( "Found %d documents to parse" % len(filepaths) )

# Read the documents
print "Reading documents ..."
log.info( "Reading documents ..." )
docs = []
short_documents = 0
doc_ids = []
Expand All @@ -79,6 +82,7 @@ def main():
if not doc_id.startswith(label):
doc_id = "%s_%s" % ( label, doc_id )
# read body text
log.debug( "Reading text from %s ..." % filepath )
body = read_text( filepath )
if len(body) < options.min_doc_length:
short_documents += 1
Expand All @@ -90,26 +94,28 @@ def main():
label_count[label] = 0
classes[label].add(doc_id)
label_count[label] += 1
print "Kept %d documents. Skipped %d documents with length < %d" % ( len(docs), short_documents, options.min_doc_length )
log.info( "Kept %d documents. Skipped %d documents with length < %d" % ( len(docs), short_documents, options.min_doc_length ) )
if len(classes) < 2:
print "No ground truth available"
log.warning( "No ground truth available" )
classes = None
else:
print "Ground truth: %d classes - %s" % ( len(classes), label_count )
log.info( "Ground truth: %d classes - %s" % ( len(classes), label_count ) )

# Convert the documents in TF-IDF vectors and filter stopwords
if options.stoplist_file is None:
stopwords = text.util.load_stopwords("text/stopwords.txt")
else:
print "Using custom stopwords from", options.stoplist_file
log.info( "Using custom stopwords from", options.stoplist_file )
stopwords = text.util.load_stopwords(options.stoplist_file )
print "Pre-processing data (%d stopwords, tfidf=%s, normalize=%s, min_df=%d) ..." % (len(stopwords), options.apply_tfidf, options.apply_norm, options.min_df)
log.info( "Pre-processing data (%d stopwords, tfidf=%s, normalize=%s, min_df=%d) ..." % (len(stopwords), options.apply_tfidf, options.apply_norm, options.min_df) )
(X,terms) = text.util.preprocess( docs, stopwords, min_df = options.min_df, apply_tfidf = options.apply_tfidf, apply_norm = options.apply_norm )
log.info( "Built matrix: rows: %d, terms: %d" % X.shape )

# Store the corpus
prefix = options.prefix
if prefix is None:
prefix = "corpus"
log.info( "Saving corpus '%s'" % prefix )
text.util.save_corpus( prefix, X, terms, doc_ids, classes )

# --------------------------------------------------------------
Expand Down
28 changes: 17 additions & 11 deletions reference-nmf.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
import os, sys, random
import logging as log
from optparse import OptionParser
import numpy as np
import text.util, unsupervised.nmf, unsupervised.rankings, unsupervised.util
Expand All @@ -15,9 +16,12 @@ def main():
parser.add_option("--maxiters", action="store", type="int", dest="maxiter", help="maximum number of iterations", default=200)
parser.add_option("-o","--outdir", action="store", type="string", dest="dir_out", help="base output directory (default is current directory)", default=None)
parser.add_option("-w","--writefactors", action="store_true", dest="write_factors", help="write complete factorization results")
parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
(options, args) = parser.parse_args()
if( len(args) < 1 ):
parser.error( "Must specify at least one corpus file" )
log_level = max(50 - (options.debug * 10), 10)
log.basicConfig(level=log_level, format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)
# use nimfa instead of sklearn?
use_nimfa = False

Expand All @@ -32,7 +36,9 @@ def main():

# Load the cached corpus
corpus_path = args[0]
log.info( "Loading corpus from %s ..." % corpus_path )
(X,terms,doc_ids,classes) = text.util.load_corpus( corpus_path )
log.debug( "Read %s document-term matrix, dictionary of %d terms, list of %d document IDs" % ( str(X.shape), len(terms), len(doc_ids) ) )

# Choose implementation
if use_nimfa:
Expand All @@ -41,42 +47,42 @@ def main():
impl = unsupervised.nmf.SklNMF(max_iters = options.maxiter, init_strategy = "nndsvd" )

# Generate reference NMF topic models for the specified numbers of topics
print "* Running reference experiments in range k=[%d,%d] max_iters=%d" % ( options.kmin, options.kmax, options.maxiter )
log.info( "Running reference experiments in range k=[%d,%d] max_iters=%d" % ( options.kmin, options.kmax, options.maxiter ) )
for k in range(options.kmin, options.kmax+1):
print "* Applying NMF k=%d (%s) ..." % ( k, impl.__class__.__name__ )
log.info( "Applying NMF k=%d (%s) ..." % ( k, impl.__class__.__name__ ) )
dir_out_k = os.path.join( dir_out_base, "nmf_k%02d" % k )
if not os.path.exists(dir_out_k):
os.makedirs(dir_out_k)
print "Results will be written to %s" % ( dir_out_k )
impl.apply( X, k )
print "Generated W %s and H %s" % ( str(impl.W.shape), str(impl.H.shape) )
log.debug( "Generated W %s and H %s" % ( str(impl.W.shape), str(impl.H.shape) ) )
# Get term rankings for each topic
term_rankings = []
for topic_index in range(k):
ranked_term_indices = impl.rank_terms( topic_index )
term_ranking = [terms[i] for i in ranked_term_indices]
term_rankings.append(term_ranking)
print "Writing %d rankings covering up to %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) )
# Print out the top terms
if options.top > 0:
log.info( "Generated %d rankings covering up to %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) ) )
# Print out the top terms, if we want verbose output
if log_level <= 10 and options.top > 0:
print unsupervised.rankings.format_term_rankings( term_rankings, top = options.top )

log.info( "Writing results to %s" % ( dir_out_k ) )
# Write term rankings
ranks_out_path = os.path.join( dir_out_k, "ranks_reference.pkl" )
print "Writing term ranking set to %s" % ranks_out_path
log.debug( "Writing term ranking set to %s" % ranks_out_path )
unsupervised.util.save_term_rankings( ranks_out_path, term_rankings )
# Write document partition
partition = impl.generate_partition()
partition_out_path = os.path.join( dir_out_k, "partition_reference.pkl" )
print "Writing document partition to %s" % partition_out_path
log.debug( "Writing document partition to %s" % partition_out_path )
unsupervised.util.save_partition( partition_out_path, partition, doc_ids )
# Write the complete factorization?
if options.write_factors:
factor_out_path = os.path.join( dir_out_k, "factors_reference.pkl" )
# NB: need to make a copy of the factors
print "Writing complete factorization to %s" % factor_out_path
log.debug( "Writing complete factorization to %s" % factor_out_path )
unsupervised.util.save_nmf_factors( factor_out_path, np.array( impl.W ), np.array( impl.H ), doc_ids )

print "* Done"

# --------------------------------------------------------------

Expand Down
6 changes: 0 additions & 6 deletions text/util.py
Expand Up @@ -19,7 +19,6 @@ def custom_tokenizer( s ):
norm_function = None
tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range)
X = tfidf.fit_transform(docs)
print "Built matrix: rows: %d, terms: %d" % X.shape
terms = []
# store the vocabulary map
v = tfidf.vocabulary_
Expand Down Expand Up @@ -47,18 +46,13 @@ def save_corpus( out_prefix, X, terms, doc_ids, classes ):
Save a pre-processed scikit-learn corpus and associated metadata using Joblib.
"""
matrix_outpath = "%s.pkl" % out_prefix
print "Saving corpus to %s ..." %( matrix_outpath )
joblib.dump((X,terms,doc_ids,classes), matrix_outpath )

def load_corpus( in_path ):
"""
Load a pre-processed scikit-learn corpus and associated metadata using Joblib.
"""
print "Loading corpus from %s ..." % in_path
(X,terms,doc_ids,classes) = joblib.load( in_path )
print "Read %s document-term matrix, dictionary of %d terms, list of %d document IDs" % ( str(X.shape), len(terms), len(doc_ids) )
if not classes is None:
print "Ground truth (%d): %s" % ( len(classes), classes.keys() )
return (X, terms, doc_ids, classes)


60 changes: 60 additions & 0 deletions topic-stability.py
@@ -0,0 +1,60 @@
#!/usr/bin/env python
import os, sys
import logging as log
from optparse import OptionParser
import numpy as np
import unsupervised.util
import unsupervised.rankings

# --------------------------------------------------------------

def main():
    """
    Measure how well a set of test term rankings agrees with a reference set.

    The first positional argument is the reference term ranking file; every
    following argument is a test term ranking file. Each test set is compared
    with the reference set using RankingSetAgreement over the AverageJaccard
    metric, and the mean, minimum and maximum of the scores are logged.

    Exits through parser.error() when fewer than two ranking files are given.
    """
    parser = OptionParser(usage="usage: %prog [options] reference_rank_file test_rank_file1 test_rank_file2 ...")
    parser.add_option("-t", "--top", action="store", type="int", dest="top", help="number of top terms to use", default=20)
    parser.add_option('-d', '--debug', type="int", help="Level of log output; 0 is less, 5 is all", default=3)

    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error("Must specify at least two ranking sets")
    # Map the 0-5 verbosity option onto logging levels: each step lowers the
    # threshold by 10, starting at 50 (CRITICAL) and never going below 10 (DEBUG).
    log.basicConfig(
        level=max(50 - (options.debug * 10), 10),
        format='%(asctime)-18s %(levelname)-10s %(message)s',
        datefmt='%d/%m/%Y %H:%M',
    )

    # Load cached ranking sets
    log.info("Reading %d term ranking sets (top=%d) ...", len(args), options.top)
    all_term_rankings = []
    for position, rank_path in enumerate(args):
        # first set is the reference set
        if position == 0:
            log.debug("Loading reference term ranking set from %s ...", rank_path)
        else:
            log.debug("Loading test term ranking set from %s ...", rank_path)
        (term_rankings, labels) = unsupervised.util.load_term_rankings(rank_path)
        log.debug("Set has %d rankings covering %d terms", len(term_rankings), unsupervised.rankings.term_rankings_size(term_rankings))
        # do we need to truncate the number of terms in the ranking?
        # NOTE(review): a value of 1 or less leaves the rankings untruncated - confirm this is intended
        if options.top > 1:
            term_rankings = unsupervised.rankings.truncate_term_rankings(term_rankings, options.top)
            log.debug("Truncated to %d -> set now has %d rankings covering %d terms", options.top, len(term_rankings), unsupervised.rankings.term_rankings_size(term_rankings))
        all_term_rankings.append(term_rankings)

    # First argument was the reference term ranking
    reference_term_ranking = all_term_rankings[0]
    test_term_rankings = all_term_rankings[1:]
    log.info("Loaded %d non-reference term rankings", len(test_term_rankings))

    # Perform the evaluation
    metric = unsupervised.rankings.AverageJaccard()
    matcher = unsupervised.rankings.RankingSetAgreement(metric)
    log.info("Performing reference comparisons with %s ...", str(metric))
    # One score per test set, each measured against the reference set
    all_scores = np.array([
        matcher.similarity(reference_term_ranking, test_term_ranking)
        for test_term_ranking in test_term_rankings
    ])

    # Get overall score across all candidates
    log.info("Stability=%.4f [%.4f,%.4f]", all_scores.mean(), all_scores.min(), all_scores.max())

# --------------------------------------------------------------

# Run the tool only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions validate-topics.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
import os, os.path, sys
import logging as log
from optparse import OptionParser
import text.util, unsupervised.util, unsupervised.validation

Expand All @@ -8,9 +9,11 @@
def main():
parser = OptionParser(usage="usage: %prog [options] corpus_file input_directory1 input_directory2 ...")
parser.add_option("-p", "--precision", action="store", type="int", dest="precision", help="precision for results", default=2)
parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
(options, args) = parser.parse_args()
if( len(args) < 2 ):
parser.error( "Must specify at least a corpus and one input direct containing topic modeling results" )
log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)

# Read the corpus
corpus_path = args[0]
Expand Down

0 comments on commit f2697a5

Please sign in to comment.