# 2 - Topic Modelling

This notebook applies NMF topic modelling to the preprocessed Paris Review data from Notebook 1 for a range of different topic numbers $k$.

In [None]:
from pathlib import Path
import joblib, random
import numpy as np
from sklearn import decomposition

Settings:

In [None]:
# where the preprocessed input lives
dir_data = Path("../data")
dir_in = dir_data / "proc"
data_prefix = "paris"
# the input file is named after the dataset prefix
data_in_path = dir_in / "{}.pkl".format(data_prefix)
# root directory under which the models are written
dir_out = Path("../models")

# topic model configuration
random_seed = 1000
# inclusive range of topic numbers to try, and the increment between them
kmin = 2
kmax = 50
kstep = 1
# NMF initialisation scheme and iteration cap
init_strategy = "nndsvd"
max_iters = 200

## Data Loading

Read the preprocessed dataset produced by Notebook 1:

In [None]:
# The file holds a 3-tuple; X is presumably the document-term matrix built in
# Notebook 1 (TODO confirm). NOTE: joblib.load unpickles its input, so only
# load files you produced yourself.
X, terms, doc_ids = joblib.load(data_in_path)
num_docs, num_terms = len(doc_ids), len(terms)
print("Loaded preprocessed data: %d documents, %d terms" % (num_docs, num_terms))

## Topic Modelling

Function to initialize all required random seeds.

In [None]:
def init_random_seeds(random_seed):
    """
    Seed NumPy's global generator and Python's `random` module with one value.

    A negative `random_seed` is replaced by an integer drawn with
    random.randint(1, 100000). Returns the seed that was actually applied.
    """
    seed = random.randint(1, 100000) if random_seed < 0 else random_seed
    # NumPy first, then the standard library generator
    np.random.seed(seed)
    random.seed(seed)
    return seed

In [None]:
def apply_nmf(X, k):
    """
    Fit an NMF model with k components to X.

    Uses the notebook-level `init_strategy` and `max_iters` settings.
    Returns a tuple (W, H, partition): W is the output of fit_transform,
    H is the fitted model's components_, and partition is a plain list
    holding, for each row of W, the index of its largest entry.
    """
    nmf = decomposition.NMF(n_components=k, init=init_strategy, max_iter=max_iters)
    doc_weights = nmf.fit_transform(X)
    topic_weights = nmf.components_
    # each row of W is assigned to the column where it has its largest weight
    assignments = np.argmax(doc_weights, axis=1).flatten().tolist()
    return (doc_weights, topic_weights, assignments)

Function to return the indices of the top-ranked terms for the specified topic, given the factor $H$ produced by an NMF run:

In [None]:
def rank_terms(H, topic_index, top=-1):
    """
    Return term indices for one topic, ordered from highest to lowest weight.

    H is indexed as H[topic_index, :]. If `top` lies between 1 and the number
    of terms (inclusive), only the first `top` indices are returned; any other
    value of `top` returns the complete ranking.
    """
    # argsort is ascending, so reverse it to put the heaviest terms first
    descending = np.argsort(H[topic_index, :])[::-1]
    if 1 <= top <= len(descending):
        return descending[:top]
    return descending

Functions for saving outputs:

In [None]:
def save_term_rankings(out_path, term_rankings, labels=None):
    """
    Write a list of term rankings, together with their labels, using Joblib.

    When `labels` is None, sequential labels "C01", "C02", ... are generated,
    one per ranking. The pair (term_rankings, labels) is dumped to `out_path`.
    """
    if labels is None:
        # default labels are numbered from 1 and zero-padded to two digits
        labels = ["C%02d" % number for number in range(1, len(term_rankings) + 1)]
    joblib.dump((term_rankings, labels), out_path)
    
def save_nmf_factors(out_path, W, H, doc_ids, terms):
    """
    Write an NMF factorization result using Joblib.

    The tuple (W, H, doc_ids, terms) is dumped to `out_path` in that order.
    """
    factorization = (W, H, doc_ids, terms)
    joblib.dump(factorization, out_path)

def save_partition(out_path, partition, doc_ids):
    """
    Write a disjoint partition of the documents using Joblib.

    The partition is represented as a list of 0-indexed assignments with one
    entry per document. The pair (partition, doc_ids) is dumped to `out_path`.
    """
    assignment_record = (partition, doc_ids)
    joblib.dump(assignment_record, out_path)

Apply NMF for each value of $k$ in the configured range, saving the term rankings, document partition and factors for every model:

In [None]:
run_settings = (kmin, kmax, init_strategy, max_iters)
print("Generating NMF models in range k=[%d,%d], init_strategy=%s max_iters=%d" % run_settings)
# output file names depend only on the dataset prefix, so build them once
fname_ranks = "%s_ranks.pkl" % data_prefix
fname_partition = "%s_partition.pkl" % data_prefix
fname_factors = "%s_factors.pkl" % data_prefix
for k in range(kmin, kmax+1, kstep):
    print("Applying K=%d ..." % k)
    # every value of k gets its own output directory
    dir_out_k = dir_out / ("nmf_k%02d" % k)
    dir_out_k.mkdir(parents=True, exist_ok=True)
    # re-seed before each run so every model starts from the same random state
    init_random_seeds(random_seed)
    W, H, partition = apply_nmf(X, k)
    # translate each topic's ranked term indices into the terms themselves
    term_rankings = [
        [terms[i] for i in rank_terms(H, topic_index)]
        for topic_index in range(k)
    ]
    # term rankings
    ranks_out_path = dir_out_k / fname_ranks
    save_term_rankings(ranks_out_path, term_rankings)
    # document partition
    partition_out_path = dir_out_k / fname_partition
    save_partition(partition_out_path, partition, doc_ids)
    # complete factorization; the factors are copied before being written
    factor_out_path = dir_out_k / fname_factors
    save_nmf_factors(factor_out_path, np.array(W), np.array(H), doc_ids, terms)
    print("Results saved to %s" % dir_out_k)