Improve logging, tidy NMF

derekgreene · Aug 26, 2014 · f2697a5 · f2697a5
1 parent a4db52f
commit f2697a5
Show file tree

Hide file tree

Showing 7 changed files with 116 additions and 37 deletions.
diff --git a/display-topics.py b/display-topics.py
@@ -1,4 +1,9 @@
 #!/usr/bin/env python
+"""
+Simple tool to display term rankings generated by NMF/LDA/SKM, stored in one or more PKL
+files.
+"""
+import logging as log
 from optparse import OptionParser
 import unsupervised.rankings, unsupervised.util
 
@@ -7,16 +12,18 @@
 def main():
+	"""
+	Command-line entry point: for each ranking set file given as an argument, log a
+	summary of its size and print its formatted term rankings.
+	"""
 	parser = OptionParser(usage="usage: %prog [options] ranking_file1 ranking_file2 ...")
 	parser.add_option("-t", "--top", action="store", type="int", dest="top", help="number of top terms to show", default=10)
+	# -d N maps to logging level max(50 - 10*N, 10): 0 -> CRITICAL, 3 (default) -> INFO, 4 and above -> DEBUG
 	parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
 	(options, args) = parser.parse_args()
 	if( len(args) < 1 ):
 		parser.error( "Must specify at least one ranking set file" )
 	log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)
 
 	# Load each cached ranking set
 	for in_path in args:
-		print "Loading terms from %s ..." % in_path
+		log.info( "Loading terms from %s ..." % in_path )
 		(term_rankings,labels) = unsupervised.util.load_term_rankings( in_path )
 		m = unsupervised.rankings.term_rankings_size( term_rankings )
-		print "Set has %d rankings covering up to %d terms" % ( len(term_rankings), m ) 
+		log.info( "Set has %d rankings covering up to %d terms" % ( len(term_rankings), m ) )
+		# the rankings themselves are written with print rather than the logger, so they
+		# appear regardless of the -d level; the number of terms shown is capped at m
 		print unsupervised.rankings.format_term_rankings( term_rankings, labels, min(options.top,m) )
 
 # --------------------------------------------------------------

diff --git a/generate-nmf.py b/generate-nmf.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import os, sys, random
+import logging as log
 from optparse import OptionParser
 import numpy as np
 import text.util, unsupervised.nmf, unsupervised.rankings, unsupervised.util
@@ -16,9 +17,11 @@ def main():
 	parser.add_option("-s", "--sample", action="store", type="float", dest="sample_ratio", help="sampling ratio of documents to include in each run (range is 0 to 1)", default=0.8)
 	parser.add_option("-o","--outdir", action="store", type="string", dest="dir_out", help="base output directory (default is current directory)", default=None)
 	parser.add_option("-w","--writefactors", action="store_true", dest="write_factors", help="write complete factorization results")
+	parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
 	(options, args) = parser.parse_args()
 	if len(args) < 1:
 		parser.error( "Must specify at least one corpus file" )	
+	log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)
 	# use nimfa instead of sklearn?
 	use_nimfa = True
 
@@ -40,22 +43,22 @@ def main():
 	n_documents = X.shape[0]
 	n_sample = int( options.sample_ratio * n_documents )
 	indices = np.arange(n_documents)
-	print "* Sampling ratio = %.2f - %d/%d documents per run" % ( options.sample_ratio, n_sample, n_documents )
 
 	# Generate all NMF topic models for the specified numbers of topics
-	print "* Running experiments in range k=[%d,%d] max_iters=%d" % ( options.kmin, options.kmax, options.maxiter)
+	log.info( "Testing models in range k=[%d,%d]" % ( options.kmin, options.kmax ) )
+	log.info( "Sampling ratio = %.2f - %d/%d documents per run" % ( options.sample_ratio, n_sample, n_documents ) )
 	for k in range(options.kmin, options.kmax+1):
 		# Set random state
 		np.random.seed( options.seed )
 		random.seed( options.seed )			
-		print "* Applying NMF k=%d runs=%d (%s) ..." % ( k, options.runs, impl.__class__.__name__ )
+		log.info( "Applying NMF (k=%d, runs=%d, seed=%s - %s) ..." % ( k, options.runs, options.seed, impl.__class__.__name__ ) )
 		dir_out_k = os.path.join( dir_out_base, "nmf_k%02d" % k )
 		if not os.path.exists(dir_out_k):
 			os.makedirs(dir_out_k)		
-		print "Results will be written to %s" % ( dir_out_k )
+		log.debug( "Results will be written to %s" % dir_out_k )
 		# Run NMF
 		for r in range(options.runs):
-			print "Run %d/%d (seed=%s)" % (r+1, options.runs, options.seed )
+			log.info( "NMF run %d/%d (k=%d, max_iters=%d)" % (r+1, options.runs, k, options.maxiter ) )
 			file_suffix = "%s_%03d" % ( options.seed, r+1 )
 			# sub-sample data
 			np.random.shuffle(indices)
@@ -72,23 +75,23 @@ def main():
 				ranked_term_indices = impl.rank_terms( topic_index )
 				term_ranking = [terms[i] for i in ranked_term_indices]
 				term_rankings.append(term_ranking)
-			print "Generated ranking set with %d topics covering up to %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) ) 
+			log.debug( "Generated ranking set with %d topics covering up to %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) ) )
 			# Write term rankings
 			ranks_out_path = os.path.join( dir_out_k, "ranks_%s.pkl" % file_suffix )
-			print "Writing term ranking set to %s" % ranks_out_path
+			log.debug( "Writing term ranking set to %s" % ranks_out_path )
 			unsupervised.util.save_term_rankings( ranks_out_path, term_rankings )
 			# Write document partition
 			partition = impl.generate_partition()
 			partition_out_path = os.path.join( dir_out_k, "partition_%s.pkl" % file_suffix )
-			print "Writing document partition to %s" % partition_out_path
+			log.debug( "Writing document partition to %s" % partition_out_path )
 			unsupervised.util.save_partition( partition_out_path, partition, sample_doc_ids )			
 			# Write the complete factorization?
 			if options.write_factors:
 				factor_out_path = os.path.join( dir_out_k, "factors_%s.pkl" % file_suffix )
 				# NB: need to make a copy of the factors
-				print "Writing factorization to %s" % factor_out_path
+				log.debug( "Writing factorization to %s" % factor_out_path )
 				unsupervised.util.save_nmf_factors( factor_out_path, np.array( impl.W ), np.array( impl.H ), sample_doc_ids )
-	print "* Done"	  
+
 
 # --------------------------------------------------------------
 

diff --git a/parse-text.py b/parse-text.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import os, os.path, sys, codecs, re, unicodedata
+import logging as log
 from optparse import OptionParser
 import text.util
 
@@ -9,7 +10,7 @@ def find_documents( root_path ):
 	"""
 	Find all files in the specified directory and its subdirectories, and store them as strings in a list.
 	"""
-	print "Searching %s for documents ..." % root_path
+	log.info( "Searching %s for documents ..." % root_path )
 	filepaths = []
 	for dir_path, subFolders, files in os.walk(root_path):
 		for filename in files:
@@ -48,9 +49,11 @@ def main():
 	parser.add_option("--norm", action="store_true", dest="apply_norm", help="apply unit length normalization to the document-term matrix")
 	parser.add_option("--minlen", action="store", type="int", dest="min_doc_length", help="minimum document length (in characters)", default=50)
 	parser.add_option("-s", action="store", type="string", dest="stoplist_file", help="custom stopword file path", default=None)
+	parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
 	(options, args) = parser.parse_args()
 	if( len(args) < 1 ):
 		parser.error( "Must specify at least one directory" )	
+	log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)
 
 	# Find all relevant files in directories specified by user
 	filepaths = []
@@ -63,10 +66,10 @@ def main():
 			if filename.startswith(".") or filename.startswith("_"):
 				continue
 			filepaths.append( in_path )
-	print "Found %d documents to parse" % len(filepaths)
+	log.info( "Found %d documents to parse" % len(filepaths) )
 
 	# Read the documents
-	print "Reading documents ..."
+	log.info( "Reading documents ..." )
 	docs = []
 	short_documents = 0
 	doc_ids = []
@@ -79,6 +82,7 @@ def main():
 		if not doc_id.startswith(label):
 			doc_id = "%s_%s" % ( label, doc_id )
 		# read body text
+		log.debug( "Reading text from %s ..." % filepath )
 		body = read_text( filepath )
 		if len(body) < options.min_doc_length:
 			short_documents += 1
@@ -90,26 +94,28 @@ def main():
 			label_count[label] = 0
 		classes[label].add(doc_id)
 		label_count[label] += 1
-	print "Kept %d documents. Skipped %d documents with length < %d" % ( len(docs), short_documents, options.min_doc_length )
+	log.info( "Kept %d documents. Skipped %d documents with length < %d" % ( len(docs), short_documents, options.min_doc_length ) )
 	if len(classes) < 2:
-		print "No ground truth available"
+		log.warning( "No ground truth available" )
 		classes = None
 	else:
-		print "Ground truth: %d classes - %s" % ( len(classes), label_count )
+		log.info( "Ground truth: %d classes - %s" % ( len(classes), label_count ) )
 
 	# Convert the documents in TF-IDF vectors and filter stopwords
 	if options.stoplist_file is None:
 		stopwords = text.util.load_stopwords("text/stopwords.txt")
 	else:
-		print "Using custom stopwords from", options.stoplist_file 
+		log.info( "Using custom stopwords from %s" % options.stoplist_file )
 		stopwords = text.util.load_stopwords(options.stoplist_file )
-	print "Pre-processing data (%d stopwords, tfidf=%s, normalize=%s, min_df=%d) ..." % (len(stopwords), options.apply_tfidf, options.apply_norm, options.min_df)
+	log.info( "Pre-processing data (%d stopwords, tfidf=%s, normalize=%s, min_df=%d) ..." % (len(stopwords), options.apply_tfidf, options.apply_norm, options.min_df) )
 	(X,terms) = text.util.preprocess( docs, stopwords, min_df = options.min_df, apply_tfidf = options.apply_tfidf, apply_norm = options.apply_norm )
+	log.info( "Built matrix: rows: %d, terms: %d" % X.shape )
 
 	# Store the corpus
 	prefix = options.prefix
 	if prefix is None:
 		prefix = "corpus"
+	log.info( "Saving corpus '%s'" % prefix )
 	text.util.save_corpus( prefix, X, terms, doc_ids, classes )
 
 # --------------------------------------------------------------

diff --git a/reference-nmf.py b/reference-nmf.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import os, sys, random
+import logging as log
 from optparse import OptionParser
 import numpy as np
 import text.util, unsupervised.nmf, unsupervised.rankings, unsupervised.util
@@ -15,9 +16,12 @@ def main():
 	parser.add_option("--maxiters", action="store", type="int", dest="maxiter", help="maximum number of iterations", default=200)
 	parser.add_option("-o","--outdir", action="store", type="string", dest="dir_out", help="base output directory (default is current directory)", default=None)
 	parser.add_option("-w","--writefactors", action="store_true", dest="write_factors", help="write complete factorization results")
+	parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
 	(options, args) = parser.parse_args()
 	if( len(args) < 1 ):
 		parser.error( "Must specify at least one corpus file" )
+	log_level = max(50 - (options.debug * 10), 10)
+	log.basicConfig(level=log_level, format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)
 	# use nimfa instead of sklearn?
 	use_nimfa = False
 
@@ -32,7 +36,9 @@ def main():
 
 	# Load the cached corpus
 	corpus_path = args[0]
+	log.info( "Loading corpus from %s ..." % corpus_path )
 	(X,terms,doc_ids,classes) = text.util.load_corpus( corpus_path )
+	log.debug( "Read %s document-term matrix, dictionary of %d terms, list of %d document IDs" % ( str(X.shape), len(terms), len(doc_ids) ) )
 
 	# Choose implementation
 	if use_nimfa:
@@ -41,42 +47,42 @@ def main():
 		impl = unsupervised.nmf.SklNMF(max_iters = options.maxiter, init_strategy = "nndsvd" )
 
 	# Generate reference NMF topic models for the specified numbers of topics
-	print "* Running reference experiments in range k=[%d,%d] max_iters=%d" % ( options.kmin, options.kmax, options.maxiter )
+	log.info( "Running reference experiments in range k=[%d,%d] max_iters=%d" % ( options.kmin, options.kmax, options.maxiter ) )
 	for k in range(options.kmin, options.kmax+1):
-		print "* Applying NMF k=%d (%s) ..." % ( k, impl.__class__.__name__ )
+		log.info( "Applying NMF k=%d (%s) ..." % ( k, impl.__class__.__name__ ) )
 		dir_out_k = os.path.join( dir_out_base, "nmf_k%02d" % k )
 		if not os.path.exists(dir_out_k):
 			os.makedirs(dir_out_k)		
-		print "Results will be written to %s" % ( dir_out_k )
 		impl.apply( X, k )
-		print "Generated W %s and H %s" % ( str(impl.W.shape), str(impl.H.shape) )
+		log.debug( "Generated W %s and H %s" % ( str(impl.W.shape), str(impl.H.shape) ) )
 		# Get term rankings for each topic
 		term_rankings = []
 		for topic_index in range(k):		
 			ranked_term_indices = impl.rank_terms( topic_index )
 			term_ranking = [terms[i] for i in ranked_term_indices]
 			term_rankings.append(term_ranking)
-		print "Writing %d rankings covering up to %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) ) 
-		# Print out the top terms
-		if options.top > 0:
+		log.info( "Generated %d rankings covering up to %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) ) )
+		# Print out the top terms, if we want verbose output
+		if log_level <= 10 and options.top > 0:
 			print unsupervised.rankings.format_term_rankings( term_rankings, top = options.top )
+
+		log.info( "Writing results to %s" % ( dir_out_k ) )
 		# Write term rankings
 		ranks_out_path = os.path.join( dir_out_k, "ranks_reference.pkl" )
-		print "Writing term ranking set to %s" % ranks_out_path
+		log.debug( "Writing term ranking set to %s" % ranks_out_path )
 		unsupervised.util.save_term_rankings( ranks_out_path, term_rankings )
 		# Write document partition
 		partition = impl.generate_partition()
 		partition_out_path = os.path.join( dir_out_k, "partition_reference.pkl" )
-		print "Writing document partition to %s" % partition_out_path
+		log.debug( "Writing document partition to %s" % partition_out_path )
 		unsupervised.util.save_partition( partition_out_path, partition, doc_ids )
 		# Write the complete factorization?
 		if options.write_factors:
 			factor_out_path = os.path.join( dir_out_k, "factors_reference.pkl" )
 			# NB: need to make a copy of the factors
-			print "Writing complete factorization to %s" % factor_out_path
+			log.debug( "Writing complete factorization to %s" % factor_out_path )
 			unsupervised.util.save_nmf_factors( factor_out_path, np.array( impl.W ), np.array( impl.H ), doc_ids )
 
-	print "* Done"	  
 
 # --------------------------------------------------------------
 

diff --git a/text/util.py b/text/util.py
@@ -19,7 +19,6 @@ def custom_tokenizer( s ):
 		norm_function = None
 	tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
 	X = tfidf.fit_transform(docs)
-	print "Built matrix: rows: %d, terms: %d" % X.shape
 	terms = []
 	# store the vocabulary map
 	v = tfidf.vocabulary_
@@ -47,18 +46,13 @@ def save_corpus( out_prefix, X, terms, doc_ids, classes ):
 	Save a pre-processed scikit-learn corpus and associated metadata using Joblib.
 	"""
 	matrix_outpath = "%s.pkl" % out_prefix 
-	print "Saving corpus to %s ..."  %( matrix_outpath )
 	joblib.dump((X,terms,doc_ids,classes), matrix_outpath ) 
 
 def load_corpus( in_path ):
 	"""
 	Load a pre-processed scikit-learn corpus and associated metadata using Joblib.
 	"""
-	print "Loading corpus from %s ..." % in_path
 	(X,terms,doc_ids,classes) = joblib.load( in_path )
-	print "Read %s document-term matrix, dictionary of %d terms, list of %d document IDs" % ( str(X.shape), len(terms), len(doc_ids) )
-	if not classes is None:
-		print "Ground truth (%d): %s" % ( len(classes), classes.keys() )
 	return (X, terms, doc_ids, classes)
 
 
diff --git a/topic-stability.py b/topic-stability.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+import os, sys
+import logging as log
+from optparse import OptionParser
+import numpy as np
+import unsupervised.util
+import unsupervised.rankings
+
+# --------------------------------------------------------------
+
+def main():
+	"""
+	Command-line entry point: score each test term ranking set against a reference
+	ranking set, then log the mean, minimum and maximum of those scores.
+
+	The first positional argument is the reference ranking file; every further
+	argument is a test ranking file.
+	"""
+	parser = OptionParser(usage="usage: %prog [options] reference_rank_file test_rank_file1 test_rank_file2 ...")
+	parser.add_option("-t", "--top", action="store", type="int", dest="top", help="number of top terms to use", default=20)
+	# -d N maps to logging level max(50 - 10*N, 10): 0 -> CRITICAL, 3 (default) -> INFO, 4 and above -> DEBUG
+	parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
+
+	(options, args) = parser.parse_args()
+	if( len(args) < 2 ):
+		parser.error( "Must specify at least two ranking sets" )
+	log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)
+
+	# Load cached ranking sets
+	log.info( "Reading %d term ranking sets (top=%d) ..." % ( len(args), options.top ) )
+	all_term_rankings = []
+	for rank_path in args:
+		# first set is the reference set
+		if len(all_term_rankings) == 0:
+			log.debug( "Loading reference term ranking set from %s ..." % rank_path )
+		else:
+			log.debug( "Loading test term ranking set from %s ..." % rank_path )
+		# the labels stored with the rankings are not used by this script
+		(term_rankings,labels) = unsupervised.util.load_term_rankings( rank_path )
+		log.debug( "Set has %d rankings covering %d terms" % ( len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) ) )
+		# do we need to truncate the number of terms in the ranking?
+		# NOTE: a --top value of 1 or less leaves the rankings untruncated
+		if options.top > 1:
+			term_rankings = unsupervised.rankings.truncate_term_rankings( term_rankings, options.top )
+			log.debug( "Truncated to %d -> set now has %d rankings covering %d terms" % ( options.top, len(term_rankings), unsupervised.rankings.term_rankings_size( term_rankings ) ) )
+		all_term_rankings.append( term_rankings )
+
+	# First argument was the reference term ranking
+	reference_term_ranking = all_term_rankings[0]
+	all_term_rankings = all_term_rankings[1:]
+	# r is at least 1 here, because fewer than two arguments is rejected above
+	r = len(all_term_rankings)
+	log.info( "Loaded %d non-reference term rankings" % r )
+
+	# Perform the evaluation
+	metric = unsupervised.rankings.AverageJaccard()
+	matcher = unsupervised.rankings.RankingSetAgreement( metric )	
+	log.info( "Performing reference comparisons with %s ..." % str(metric) )
+	all_scores = []
+	# one score per test ranking set, each measured against the reference set
+	for i in range(r):
+		score = matcher.similarity( reference_term_ranking, all_term_rankings[i] )
+		all_scores.append( score )
+
+	# Get overall score across all candidates
+	all_scores = np.array( all_scores )
+	# the result is reported only through the logger at INFO level, so it is not
+	# shown when -d is below 3
+	log.info( "Stability=%.4f [%.4f,%.4f]" % ( all_scores.mean(), all_scores.min(), all_scores.max() ) )
+
+# --------------------------------------------------------------
+
+if __name__ == "__main__":
+	main()
diff --git a/validate-topics.py b/validate-topics.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import os, os.path, sys
+import logging as log
 from optparse import OptionParser
 import text.util, unsupervised.util, unsupervised.validation
 
@@ -8,9 +9,11 @@
 def main():
 	parser = OptionParser(usage="usage: %prog [options] corpus_file input_directory1 input_directory2 ...")
 	parser.add_option("-p", "--precision", action="store", type="int", dest="precision", help="precision for results", default=2)
+	parser.add_option('-d','--debug',type="int",help="Level of log output; 0 is less, 5 is all", default=3)
 	(options, args) = parser.parse_args()
 	if( len(args) < 2 ):
 		parser.error( "Must specify at least a corpus and one input direct containing topic modeling results" )	
+	log.basicConfig(level=max(50 - (options.debug * 10), 10), format='%(asctime)-18s %(levelname)-10s %(message)s', datefmt='%d/%m/%Y %H:%M',)
 
 	# Read the corpus
 	corpus_path = args[0]