In [1]:
# code for loading the format for the notebook
import os

# path : store the current path to convert back to it later
path = os.getcwd()
os.chdir( os.path.join('..', 'notebook_format') )
from formats import load_style
load_style()

In [2]:
os.chdir(path)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8, 6 # change default figure size
plt.rcParams['font.size'] = 12 # and font size

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

import warnings
from scipy.optimize import linear_sum_assignment
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,gensim,scikit-learn,scipy

Ethen 2016-11-11 13:40:42 

CPython 3.5.2
IPython 4.2.0

numpy 1.11.2
pandas 0.18.1
matplotlib 1.5.1
gensim 0.13.1
scikit-learn 0.18
scipy 0.18.1


# Stability Analysis for Topic Models

This documentation aims to reproduce the methodology from the paper [How Many Topics? Stability Analysis for Topic Models (2014) Derek Greene, Derek O'Callaghan, Pádraig Cunningham](https://arxiv.org/abs/1404.4606). In the paper, they proposed a topic stability method that tries to address the issue of choosing an appropriate topic number for a given corpus.

## Quick Review

In [3]:
# recent scikit-learn are raising deprecation warnings that
# default value for 'learning_method' will be changed 
# from 'online' to 'batch'; we won't worry about it
warnings.filterwarnings("ignore", category = DeprecationWarning) 

texts = ['human interface computer',
         'survey user computer system response time',
         'eps user interface system',
         'system human system eps',
         'user response time',
         'trees',
         'graph trees',
         'graph minors trees',
         'graph minors survey']


# convert to document-term matrix and train the lda model
vec = CountVectorizer(ngram_range = (1, 2), stop_words = 'english', 
                      min_df = 2, max_df = 0.85, max_features = 10000)
X_dtm = vec.fit_transform(texts)

# train LDA
lda = LatentDirichletAllocation(n_topics = 2, max_iter = 25, 
                                n_jobs = -1, evaluate_every = 5)
doc_topic_distr = lda.fit_transform(X_dtm)

In [4]:
def print_top_words(lda_model, vec, n_top_words):
    """top words associated with each topic for the sklearn LDA model"""
    features = vec.get_feature_names()
    for topic_idx, topic in enumerate(lda_model.components_):
        print( 'Topic #{}:'.format(topic_idx) )
        print( ', '.join([ features[i] for i in np.argsort(topic)[-n_top_words:] ]) )
        print()
    
    print()

N_TOPWORDS = 15
print_top_words(lda_model = lda, vec = vec, n_top_words = N_TOPWORDS)

Topic #0:
graph, graph minors, minors, trees, survey, human, eps, time, computer, interface, response time, response, user

Topic #1:
response, time, computer, response time, interface, human, eps, user, survey, minors, graph minors, trees, graph




## Term Ranking Similarity

Before getting into topic stability, we will need to have two basic ideas in mind:

- The stability of clustering model refers to its ability to consistently generate similar results when applying the same algorithm to the same data source.
- One common output that a topic model reports is the top terms (in the paper, they refer to it as a **ranked list**, hence we'll use top terms and ranked list interchangeably) associated with each topics.

Given these two piece of information, the idea behind topic stability is that we compare the similarity of the topic terms between different runs of the topic model to determine whether it is stable or not.

To formalize it a bit, the output of a topic modeling algorithm is in the form of a ranking set containing $k$ ranked lists, denoted $S = \{R_1,...,R_k\}$. The $i_{th}$ topic produced by the algorithm is represented by the list $R_i$ ($k$ is simply the topic number that we specified, in the sklearn API, this is `n_topics`), containing the top $t$ terms which are most representative of that topic. Usually $t$ is within the range of 10 to 20.

Now that we've obtained the ranked list, a naive approach to assess the similarity between a pair of ranked lists $R_i, R_j$ would be to employ a simple similarity such as the **Jaccard similarity**. 

However, such measures do not take into account positional information, that is terms occurring at the top of a ranked list generated by a topic model algorithm such as LDA will naturally be more relevant to a topic than those occurring at the tail of the list. Also, in practice, rather than considering all $m$ terms in a corpus, it may be preferable to use only the top $t << m$ terms to represent the ranked list.

Therefore, we instead use of a weighted version of the Jaccard index, suitable for calculating the similarity between pairs of indefinite rankings. Specifically, we define the **Average Jaccard (AJ)** measure as follows. We calculate the average of the Jaccard scores between every pair of subsets of d top-ranked terms in two lists, for depth $d \in [1, t]$. That is:

\begin{equation} 
AJ(R_i, R_j) = \frac{1}{t} \sum_{d = 1}^t \gamma_d (R_i, R_j)
\end{equation}

Where:

\begin{equation}
\gamma_d (R_i, R_j) = \frac{ \rvert R_{i, d} \cap R_{j, d} \rvert }{ \rvert R_{i, d} \cup R_{j, d} \rvert }
\end{equation}

such that $R_{i,d}$ is the head of list $R_i$ up to depth $d$. This is a symmetric measure producing values in the range [0,1], where the terms in the ranked list are weighted according to a decreasing linear scale. To demonstrate this, a simple illustrative example is shown below.

<img src="img/jaccard.png" height="70%" width="70%">

Note that, although the Jaccard score at depth $d = 5$ is comparatively high (0.429), the Average Jaccard score is much lower (0.154), as the similarity between terms occurs towards the tails of the lists – these terms carry less weight than those at the head of the lists, such as "album" and "sport".

The following code chunk reproduces the results from the table.

In [6]:
R1 = [
    ['album'],
    ['album', 'music'],
    ['album', 'music', 'best'],
    ['album', 'music', 'best', 'award'],
    ['album', 'music', 'best', 'award', 'win']
]

R2 = [
    ['sport'],
    ['sport', 'best'],
    ['sport', 'best', 'win'],
    ['sport', 'best', 'win', 'medal'],
    ['sport', 'best', 'win', 'medal', 'award']
]

In [7]:
def compute_jaccard(ranking1, ranking2):
    """
    compute jaccard similarity that does not take into account 
    rank positions and indefinite list
    """
    set1 = set(ranking1)
    set2 = set(ranking2)
    
    # if the numerator turned out to be 0, 
    # return 0 as the jaccard similarity
    numerator = len( set1.intersection(set2) )
    if not numerator:
        return 0

    denominator = len( set1.union(set2) )
    jaccard_sim = numerator / denominator
    return jaccard_sim

In [8]:
def compute_avg_jaccard(ranking1, ranking2):
    """
    weighted version of jaccard similarity, 
    which takes into account rank positions
    """
    total = 0
    k = len(ranking1)
    for i in range(1, k + 1):
        total += compute_jaccard( ranking1[:i], ranking2[:i] )
        
    avg_jaccard_sim = total / k
    return avg_jaccard_sim

In [9]:
# test it for the final depth
ranking1 = R1[4]
ranking2 = R2[4]
jaccard = compute_jaccard(ranking1, ranking2)
avg_jaccard = compute_avg_jaccard(ranking1, ranking2)
print('jaccard:', jaccard)
print('average jaccard:', avg_jaccard)

jaccard: 0.42857142857142855
average jaccard: 0.15428571428571428


## Topic Model Agreement

Now that we have a similarity metric for the top-terms, we move on to the problem of measuring the agreement between two different $k$-way topic models, represented as two ranking sets $S_x = \{ R_{x1}, ..., R_{xk} \}$ and $S_y = \{ R_{y1}, ..., R_{yk} \}$, both containing $k$ ranked lists. We construct a $k × k$ similarity matrix $M$, such that the entry $M_{ij}$ indicates the agreement between $R_{xi}$ and $R_{yj}$ (i.e. the i-th topic in the first model and the j-th topic in the second model), as calculated using the Average Jaccard score. We then find the best match between the rows and columns of $M$ (pairs that have the highest similarity between the two ranked set), or so called the optimal permutation, denoted as $\pi$. From this we can product an agreement score:

\begin{equation}
agree(S_x, S_y) = \frac{1}{k} \sum_{i = 1}^k AJ \big( R_{xi}, \pi(R_{xi}) \big)
\end{equation}

Where $\pi(R_{xi})$ denotes the ranked list in $S_y$ matched to $R_{xi}$ by the permutation $\pi$. Hungarian method is to find the optimal permutation, details of this method is included in the appendix.

A simple example illustrating the agreement process is shown below Fig. 1.

<img src="img/agreement.png" height="70%" width="70%">

In [9]:
# compare between two ranking the set
# where the length of the set is simply
# the number of topics in each topic model
S1 = [
    ['sport', 'win', 'award'],
    ['bank', 'finance', 'money'],
    ['music', 'album', 'band']
]

S2 = [
    ['finance', 'bank', 'economy'],
    ['music', 'band', 'award'],
    ['win', 'sport', 'money']
]

In [10]:
# reproduce the similarity matrix
n_topic = len(S1)
sim_mat = np.zeros(( n_topic, n_topic ))
for row in range(n_topic):  
    for col in range(n_topic):
        sim_mat[row, col] = compute_avg_jaccard(S1[row], S2[col])

sim_mat

array([[ 0.        ,  0.06666667,  0.5       ],
       [ 0.5       ,  0.        ,  0.06666667],
       [ 0.        ,  0.61111111,  0.        ]])

In [11]:
# solve for the optimal permutation using hungarian algorithm,
# for the scipy implementation, each element is presented as cost
# hence we take the negative sign of the similarity matrix
row_ind, col_ind = linear_sum_assignment(-sim_mat)
agreement = np.mean( sim_mat[row_ind, col_ind] )
agreement

0.53703703703703709

In [12]:
# simply wrapping all of it in one function
def compute_agreement(S1, S2):
    """
    measuring the agreement between two different 
    k-way topic models, represented as two rank sets;
    the rank set is simply the top words for each topic
    """
    
    # compute the similarity matrix
    n_topic = len(S1)
    sim_mat = np.zeros(( n_topic, n_topic ))
    for row in range(n_topic):
        for col in range(n_topic):
            sim_mat[row, col] = compute_avg_jaccard(S1[row], S2[col])    
    
    # solve for the optimal permutation using hungarian algorithm,
    # for the scipy implementation, each element is presented as cost
    # hence we use the negative sign of the similarity matrix as input
    row_ind, col_ind = linear_sum_assignment(-sim_mat)
    agreement = np.mean( sim_mat[row_ind, col_ind] )
    return agreement

In [13]:
agreement = compute_agreement(S1, S2)
agreement

0.53703703703703709

## Selecting the Number of Topics

Building on top of the agreement score, the topic number selction process is defined as follows:

- Randomly generate $\gamma$ samples of the full corpus, each containing $\beta \times n$ documents. $n$ denotes the total number of documents, and $\beta$ is $0 < \beta < 1$ denotes the sampling ratio controlling the number of documents in each sample.
- For each value of $k \in [kmin, kmax]$ (a defined range of topic numbers to search for):
    - Apply the topic modeling algorithm to the complete data set of $n$ documents to generate $k$ topics, and represent the output as the reference ranking set $S_0$.
    - For each sample $\gamma$:
        - Apply the topic modeling algorithm to it to generate $k$ topics, and represent the output as the ranking set $S_i$. 
        - Calculate the agreement score agree ($S_0$, $S_i$).
    - Compute the mean agreement score for $k$ over all $\gamma$ samples. This measure is the overall topic stability at $k$ topics.
    
After going through the whole process, we can examine the plot of the stability scores ranging from $[kmin, kmax]$ and the optimal value for $k$ may be identified based on peaks in the plot.

In [14]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_all = fetch_20newsgroups(subset = 'all')

In [None]:
X = np.array(newsgroups_all.data)
n_top_words = 20
n_topics_range = [2, 10, 20, 30]
n_sample_frac = 0.8
n_sample_time = 2

In [None]:
from topic_stability import TopicStability

ts = TopicStability(
    vec = vec, 
    lda_model = lda, 
    n_top_words = n_top_words, 
    n_topics_range = n_topics_range, 
    n_sample_frac = n_sample_frac, 
    n_sample_time = n_sample_time
)
ts.fit(X)
print('best number of topic:', ts.best_n_topic)



In [None]:
plt.plot(n_topics_range, avg_agreements)
plt.show()

fig = plt.figure( figsize = (10, 6) )

# for acessing the color cycle
# http://stackoverflow.com/questions/34247297/matplotlib-1-5-usage-of-axes-prop-cycle
colors = list(plt.rcParams['axes.prop_cycle'])

for k in range( len(n_topics_range) ):    
    plt.subplot(2, 2, k + 1)
    prob = max_probs[k]
    mean_prob = np.round( np.mean(prob), 2 )
    plt.hist( prob, histtype = 'stepfilled', alpha = 0.85, 
              color = colors[k]['color'], bins = 30 )
    plt.title( '{} topics, mean prob of {}'.format( n_topics_range[k], mean_prob ) )
    plt.axvline(x = 0.5, color = "black", linestyle = '--')
    
fig.tight_layout()
fig.show()

## Reference

- [How Many Topics? Stability Analysis for Topic Models (2014) Derek Greene, Derek O'Callaghan, Pádraig Cunningham](https://arxiv.org/abs/1404.4606)