# 3 - Topic Model Evaluation

In [None]:
from pathlib import Path
import joblib, itertools
import gensim
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

Settings:

In [None]:
# directory that is searched for the topic model files
dir_models = Path("..", "models")
# word embedding (binary word2vec file) used when scoring the models
embedding_path = Path("..", "embeddings", "wikipedia2016-w2v-cbow-d100.bin")

# how many of the highest-ranked terms of each topic are evaluated
top = 10

## Preparation

Find all topic model descriptor files:

In [None]:
# Recursively gather every pickled term-ranking file below the models
# directory, in sorted path order.
file_paths = sorted(dir_models.glob('**/*ranks*.pkl'))
print("Found %d text model files to load" % len(file_paths))

Load the word embedding model used during evaluation. This should be stored in the binary word2vec format used by Gensim:

In [None]:
# Read the binary word2vec vectors and report how many distinct terms they cover.
print("Loading word embedding from %s ..." % embedding_path)
embedding = gensim.models.KeyedVectors.load_word2vec_format(
    embedding_path,
    binary=True,
)
vocab = set(embedding.index_to_key)
print("Embedding has vocabulary of size %d" % len(vocab))

## Topic Model Evaluation Metrics

Implementation of an embedding-based topic distinctiveness score, where normalization is loosely based on the min-max cluster similarity measure proposed by Ding et al. (2001). Note that for this measure, better models will have lower scores.

In [None]:
class MinMaxScore:
    """Embedding-based score of how similar a model's topics are to each other.

    The embedding object only needs to support ``term in embedding`` and
    ``embedding.similarity(term1, term2)``.
    """

    def __init__(self, embedding):
        self.embedding = embedding

    def evaluate_model(self, descriptors):
        """Return the mean normalized similarity over every unique pair of
        topic descriptors."""
        pairwise = [
            self.evaluate_similarity(first, second)
            for first, second in itertools.combinations(descriptors, 2)
        ]
        return np.array(pairwise).mean()

    def evaluate_similarity(self, descriptor1, descriptor2):
        """Return the similarity between two descriptors, divided by the
        product of each descriptor's similarity to itself (0.0 when that
        product is zero)."""
        between = self.evaluate_raw_similarity(descriptor1, descriptor2)
        within_first = self.evaluate_raw_similarity(descriptor1, descriptor1)
        within_second = self.evaluate_raw_similarity(descriptor2, descriptor2)
        scale = within_first * within_second
        if scale == 0:
            return 0.0
        return between / scale

    def evaluate_raw_similarity(self, descriptor1, descriptor2):
        """Return the mean embedding similarity over all term pairs drawn
        from the two descriptors.

        Pairs with a term missing from the embedding are skipped, and 0.0 is
        returned when no pair remains.
        """
        model = self.embedding
        clipped = [
            # negative similarities are clamped to zero
            max(model.similarity(term1, term2), 0)
            for term1 in descriptor1
            for term2 in descriptor2
            if term1 in model and term2 in model
        ]
        if not clipped:
            return 0.0
        return np.array(clipped).mean()

## Topic Model Evaluation Process

Process each topic model results file:

In [None]:
# evaluation metric backed by the word embedding loaded above
metric = MinMaxScore(embedding)

In [None]:
# Score every topic model file; results are keyed by the number of topics k.
scores = {}
print("Processing %d topic models ..." % len(file_paths))
for in_path in file_paths:
    # k is parsed from the parent directory name (the part following "_k")
    k = int(in_path.parent.name.split("_k")[1])
    print("k=%02d: %s" % (k, in_path))
    term_rankings, _ = joblib.load(in_path)
    # keep at most the first `top` terms of each topic
    truncated_rankings = [ranking[:top] for ranking in term_rankings]
    # one score per model
    scores[k] = metric.evaluate_model(truncated_rankings)

Analyse the scores for the different models. Note that a lower score is better.

In [None]:
# One-column table of scores, indexed by the number of topics k.
df_scores = pd.DataFrame(pd.Series(scores, name="min-max"))
# show the ten rows with the lowest scores
df_scores.sort_values("min-max").head(n=10)

In [None]:
# generate plot of score vs number of topics
# Sort by k before plotting: the rows are in the order the model files were
# processed (sorted by path), which is not necessarily numeric order, and the
# line is drawn through the points in row order.
ax = df_scores.sort_index().plot(
    fontsize=13, color="darkorange", figsize=(9, 5.5), legend=False
)
kmin, kmax = min(df_scores.index), max(df_scores.index)
ax.set_xlabel("Number of Topics ($k$)", fontsize=13)
ax.set_ylabel("Min-Max Score", fontsize=13)
# identical limits (only one k available) would make matplotlib emit a warning
if kmin < kmax:
    ax.set_xlim(kmin, kmax);

Export the results:

In [None]:
# write the score table to "results.csv" (relative path, so it lands in the current working directory)
df_scores.to_csv("results.csv")