In [None]:
from __future__ import division

In [None]:
import re
import os

import numpy as np
import pandas as pd

from collections import defaultdict, namedtuple
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import NMF

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from munkres import Munkres

In [None]:
# Record for one parsed document: its file name plus a {stemmed word: count}
# mapping, as produced by read_article / read_documents.
StemmedDocument = namedtuple("StemmedDocument", ["name", "word_counts"])

In [None]:
# Directory that holds the input articles (relative path, resolved against the
# current working directory).
FOLDER = "../documents/"

In [None]:
def get_filenames(folder):
    """Return the sorted names of the regular files directly inside ``folder``.

    Directory entries that are not files (e.g. sub-directories) are skipped.
    The result contains bare entry names, not full paths.
    """
    names = []
    for entry in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, entry)):
            names.append(entry)
    names.sort()
    return names

  
def read_article(folder, name):
    """Read one article and count its stemmed, non-stop-word tokens.

    Each whitespace-separated token is lower-cased, stripped of leading and
    trailing characters outside ``a-z``, dropped if it is empty or an English
    stop word, and finally stemmed with the Porter stemmer.

    Parameters
    ----------
    folder : str
        Directory that contains the article.
    name : str
        File name of the article inside ``folder``.

    Returns
    -------
    dict
        Mapping ``stemmed word -> number of occurrences``.
    """
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned linearly for every token.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    content = defaultdict(int)
    # Build the path from the ``folder`` argument so the function honours its
    # own parameter instead of silently reading from the global FOLDER.
    with open(os.path.join(folder, name), "r") as article:
        for line in article:
            for word in line.split():
                word = re.sub("^[^a-z]*|[^a-z]*$", "", word.lower())
                if word and word not in stop_words:
                    content[stemmer.stem(word)] += 1
    return dict(content)


def read_documents(folder):
    """Parse every article in ``folder`` into a ``StemmedDocument``.

    Files are visited in the order returned by ``get_filenames``; any file
    whose name starts with ``"summary"`` is skipped.

    Returns a list of ``StemmedDocument(name, word_counts)`` records.
    """
    return [
        StemmedDocument(fname, read_article(folder, fname))
        for fname in get_filenames(folder)
        if not fname.startswith("summary")
    ]

In [None]:
# Parse all non-summary articles found in FOLDER.
documents = read_documents(FOLDER)

In [None]:
# Spot-check the first parsed document (file name and stemmed word counts).
documents[0]

In [None]:
def create_document_term_matrix(documents):
    """Build a TF-IDF weighted document-term matrix from word-count records.

    Parameters
    ----------
    documents : sequence of StemmedDocument
        Each item provides ``name`` and ``word_counts`` (``{term: count}``).

    Returns
    -------
    term_matrix : sparse matrix
        One row per document, one column per term; sublinear TF-IDF weights
        with L2-normalised rows.
    document_titles : list of str
        Document names, in row order.
    terms : list of str
        Vocabulary terms, in column order.
    """
    vectorizer = DictVectorizer(dtype=int, sparse=True)
    count_matrix = vectorizer.fit_transform([doc.word_counts for doc in documents])
    # ``get_feature_names`` no longer exists in recent scikit-learn releases;
    # use its replacement when available and fall back for older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = vectorizer.get_feature_names()
    transformer = TfidfTransformer(norm="l2", sublinear_tf=True)
    term_matrix = transformer.fit_transform(count_matrix)
    document_titles = [doc.name for doc in documents]
    return term_matrix, document_titles, terms

In [None]:
# TF-IDF matrix (rows = documents, columns = terms) plus the matching row and column labels.
term_matrix, titles, terms = create_document_term_matrix(documents)

In [None]:
# Document names, in the row order of term_matrix.
titles

In [None]:
# Peek at the first ten vocabulary terms.
terms[:10]

Calculate topic stability to decide how many topics there are.

For each candidate number of topics $k$, reference topics are calculated on the full set of documents. Each topic is represented by a list containing the indices of its top 20 words. Then $80\%$ of the documents are selected at random and the topic representations are recalculated. Now we have two sets containing the same number of ordered lists, each list having the same number of elements. Next we calculate a similarity matrix between the reference topics and the sampled topics (a distance matrix is obtained as $1 - \text{similarity}$). Finally, each reference topic is assigned a sampled topic in a bijective manner using the Hungarian method.

As an example, imagine that the top $4$ terms in $3$ topics are
* tennis, racket, grass, wimbledon
* pollution, coal, plant, electricity
* electricity, storm, severe, wimbledon

and the sampled documents resulted in the following top words:

* tennis, semi, racket, crown
* pollution, coal, nuclear, plant
* storm, weather, electricity, severe

The resulting similarity matrix, based on the average Jaccard similarity, is
$$\begin{matrix} 0.5417 & 0 & 0 \\ 0 & 0.775 & 0.0357 \\ 0 & 0 & 0.3583 \end{matrix}$$

This is not a symmetric matrix! The last task is to pair each reference topic with a sample topic such that their total similarity is maximized (or, equivalently, the total distance is minimized). Here the best pairing is the diagonal, which gives an agreement score of $\frac{0.5417 + 0.775 + 0.3583}{3} = 0.558$.

In [None]:
# Fraction of the documents drawn (without replacement) for each resampled topic model.
SAMPLING_RATE = 0.8
# Number of resampling rounds averaged into one stability score.
NUMBER_OF_TRIALS = 40

# Pairs a candidate topic count with the stability score computed for it.
NumberOfTopics = namedtuple("NumberOfTopics", ["size", "stability_score"])


def get_top_terms_from_topics(term_matrix, cluster_number, nr_top_terms):
    """Fit an NMF model and describe each topic by its highest-weighted terms.

    Parameters
    ----------
    term_matrix : matrix
        Document-term matrix the model is fitted on.
    cluster_number : int
        Number of NMF components (topics).
    nr_top_terms : int
        How many term indices to keep per topic.

    Returns
    -------
    tuple of tuples
        One inner tuple per topic holding column indices of ``term_matrix``,
        ordered from the largest component weight downwards.
    """
    # NOTE(review): the ``alpha`` keyword appears to be deprecated/removed in
    # newer scikit-learn releases - confirm against the pinned version.
    nmf = NMF(n_components=cluster_number, init="nndsvd", solver="cd", alpha=0.3, random_state=0)
    nmf.fit(term_matrix)
    return tuple(
        tuple(topic_weights.argsort()[::-1][:nr_top_terms])
        for topic_weights in nmf.components_
    )


def calc_average_jaccard_measure(reference_topic, sample_topic, nr_top_terms):
    """Average Jaccard similarity of two ranked term lists over growing prefixes.

    For every depth ``d`` in ``1..nr_top_terms`` the Jaccard index of the
    first ``d`` entries of both lists is computed; the mean of these values
    is returned. Agreement near the top of the rankings therefore counts more
    than agreement further down.

    Relies on true division (the notebook enables it via ``__future__``).
    """
    def jaccard_at(depth):
        reference_head = set(reference_topic[:depth])
        sample_head = set(sample_topic[:depth])
        return len(reference_head & sample_head) / len(reference_head | sample_head)

    return sum(jaccard_at(depth) for depth in range(1, nr_top_terms + 1)) / nr_top_terms


def calc_similarity_matrix(reference_topics, sample_topics, cluster_number, top_terms):
    """Pairwise average-Jaccard similarities between two sets of topics.

    Returns a ``cluster_number x cluster_number`` array whose entry
    ``[i, j]`` compares reference topic ``i`` with sample topic ``j`` on
    their first ``top_terms`` terms.
    """
    scores = np.zeros((cluster_number, cluster_number))
    for row, reference_topic in enumerate(reference_topics):
        for col, sample_topic in enumerate(sample_topics):
            score = calc_average_jaccard_measure(reference_topic, sample_topic, top_terms)
            scores[row, col] = score
    return scores


def calc_agreement_score(reference_topics, sample_topics, cluster_number, nr_top_terms):
    """Mean similarity of the best one-to-one matching between two topic sets.

    The similarity matrix is converted to costs (``1 - similarity``) and handed
    to the Munkres solver; the similarities of the pairs it returns are summed
    and divided by ``cluster_number``.
    """
    similarity_matrix = calc_similarity_matrix(reference_topics, sample_topics, cluster_number, nr_top_terms)
    # The solver is given costs, so high similarity must become low cost.
    cost_matrix = 1 - similarity_matrix
    assignment = Munkres().compute(cost_matrix)
    matched_similarity = sum([similarity_matrix[row, col] for row, col in assignment])
    return matched_similarity / cluster_number


def calc_stability_score(term_matrix, cluster_number, nr_top_terms):
    """Estimate how stable a ``cluster_number``-topic model is under resampling.

    Reference topics are computed on the full ``term_matrix``. Then, for
    ``NUMBER_OF_TRIALS`` rounds, a fraction ``SAMPLING_RATE`` of the rows is
    drawn without replacement, topics are recomputed on that subset and their
    agreement with the reference topics is measured.

    Parameters
    ----------
    term_matrix : matrix
        Document-term matrix (rows are sampled).
    cluster_number : int
        Number of topics to fit.
    nr_top_terms : int
        Number of top terms used to represent each topic.

    Returns
    -------
    float
        Mean agreement score over all trials.
    """
    reference_topics = get_top_terms_from_topics(term_matrix, cluster_number, nr_top_terms)
    number_of_documents = term_matrix.shape[0]
    sample_size = int(SAMPLING_RATE * number_of_documents)
    # A private generator with a fixed seed keeps the row samples reproducible
    # (same stream as seeding the global generator with 1) without resetting
    # NumPy's process-wide random state as a hidden side effect.
    rng = np.random.RandomState(1)
    total_agreement = 0
    for _ in range(NUMBER_OF_TRIALS):
        sample_rows = rng.choice(number_of_documents, sample_size, replace=False)
        sample_topics = get_top_terms_from_topics(term_matrix[sample_rows, :], cluster_number, nr_top_terms)
        total_agreement += calc_agreement_score(reference_topics, sample_topics, cluster_number, nr_top_terms)
    return total_agreement / NUMBER_OF_TRIALS


def estimate_number_of_clusters_by_topic_stability(term_matrix, cluster_number_candidates):
    """Pick the number of topics whose NMF solution is most stable.

    Candidates larger than the number of terms, or larger than the number of
    documents in one resample (``int(SAMPLING_RATE * number_of_documents)``),
    are discarded. Each remaining candidate is scored with
    ``calc_stability_score`` using at most 20 top terms per topic.

    Parameters
    ----------
    term_matrix : matrix
        Document-term matrix of shape ``(documents, terms)``.
    cluster_number_candidates : iterable of int
        Topic counts to evaluate.

    Returns
    -------
    optimal_cluster_number : int
        Candidate with the highest stability score.
    stability_scores : list of NumberOfTopics
        Score of every evaluated candidate, in evaluation order.

    Raises
    ------
    ValueError
        If no candidate survives the size filter.
    """
    number_of_documents, number_of_terms = term_matrix.shape
    maximum_number_of_clusters = min(number_of_terms, int(SAMPLING_RATE * number_of_documents))
    feasible_candidates = [cluster_number for cluster_number in cluster_number_candidates
                           if cluster_number <= maximum_number_of_clusters]
    if not feasible_candidates:
        # Without this guard the max() below fails with an opaque
        # "max() arg is an empty sequence" error.
        raise ValueError(
            "No cluster number candidate is <= {} (the largest number of topics "
            "the data supports).".format(maximum_number_of_clusters)
        )
    nr_top_terms = min(20, number_of_terms)
    stability_scores = []
    for cluster_number in feasible_candidates:
        stability = calc_stability_score(term_matrix, cluster_number, nr_top_terms)
        stability_scores.append(NumberOfTopics(cluster_number, stability))
    best = max(stability_scores, key=lambda item: item.stability_score)
    return best.size, stability_scores

In [None]:
# Evaluate topic counts 3 through 9 and keep the most stable one.
cluster_number_candidates = range(3, 10)
number_of_topics, stabilities = estimate_number_of_clusters_by_topic_stability(term_matrix, cluster_number_candidates)

In [None]:
# Selected number of topics.
number_of_topics

In [None]:
# Stability score of every evaluated candidate.
stabilities

In [None]:
# Final model: same NMF settings as in get_top_terms_from_topics, with the selected topic count.
model = NMF(n_components=number_of_topics, init="nndsvd", solver="cd", alpha=0.3, random_state=0)

In [None]:
# W: one row per document, one column per topic (document-topic weights).
W = model.fit_transform(term_matrix)
# H: one row per topic, one column per term (topic-term weights).
H = model.components_

In [None]:
def print_top_words(matrix, feature_names, n_top_words):
    """Print, for every row of ``matrix``, the labels of its largest entries.

    Each row is scaled to unit Euclidean norm; the ``n_top_words`` largest
    entries are then listed in descending order, skipping entries that are not
    greater than ``1e-12``. Labels are looked up in ``feature_names`` and joined
    with ``" | "``.
    """
    for topic_id, weights in enumerate(matrix):
        unit_weights = weights / np.linalg.norm(weights)
        # Indices of the largest entries first.
        ranked = unit_weights.argsort()[::-1][:n_top_words]
        print("Topic #%d:" % topic_id)
        labels = [str(feature_names[ix]) for ix in ranked if unit_weights[ix] > 1e-12]
        print(" | ".join(labels))
        print("\n")

In [None]:
# Top 20 terms of every topic.
print_top_words(H, terms, 20)

In [None]:
# Top 5 documents of every topic (W transposed so that rows are topics).
print_top_words(W.T, titles, 5)

In [None]:
def get_topic_coverage(matrix, users):
    """Express each row of ``matrix`` as percentages of that row's total.

    Returns a DataFrame indexed by ``users`` with one ``"Topic #i"`` column per
    matrix column; values are rounded to two decimals.
    """
    row_totals = matrix.sum(axis=1)
    # Transposing lets the per-row totals broadcast across the columns.
    shares = (matrix.T / row_totals).T
    colnames = ["Topic #{}".format(str(ix)) for ix in range(shares.shape[1])]
    percentages = np.round(100 * shares, decimals=2)
    return pd.DataFrame(data=percentages, columns=colnames, index=users)

  
def get_user_importance_in_topics(matrix, users):
    """Rank every row (user/document) within each topic by its weight.

    Columns are first normalised to sum to one. Within each column the row
    with the largest weight gets rank 1, the next one rank 2, and so on.
    Entries whose normalised weight is not greater than ``1e-12`` get an empty
    string instead of a rank.

    Parameters
    ----------
    matrix : numpy.ndarray
        Weights of shape ``(users, topics)``.
    users : sequence
        Row labels, one per matrix row.

    Returns
    -------
    pandas.DataFrame
        Object-typed frame indexed by ``users`` with ``"Topic #i"`` columns,
        holding integer ranks or ``""``.
    """
    matrix = matrix / matrix.sum(axis=0)
    number_of_users, number_of_topics = matrix.shape
    # Row r holds, per topic, the index of the user with the (r + 1)-th largest weight.
    descending_order = np.flipud(np.argsort(matrix, axis=0))

    # Invert the ordering column by column: ranks[u, t] is the 1-based position
    # of user u in topic t. This avoids a linear list search per cell.
    ranks = np.empty((number_of_users, number_of_topics), dtype=int)
    positions = np.arange(1, number_of_users + 1)
    for jx in range(number_of_topics):
        ranks[descending_order[:, jx], jx] = positions

    # Plain ``object`` dtype (the ``np.object`` alias is gone from current
    # NumPy); ranks stay integers instead of being coerced to strings.
    importance = ranks.astype(object)
    importance[~(matrix > 1e-12)] = ""

    colnames = ["Topic #{}".format(str(ix)) for ix in range(number_of_topics)]
    return pd.DataFrame(data=importance, columns=colnames, index=users)

def get_user_weight_percentages_in_topics(matrix, users):
    """Express each column of ``matrix`` as percentages of that column's total.

    Returns a DataFrame indexed by ``users`` with one ``"Topic #i"`` column per
    matrix column; values are rounded to four decimals.
    """
    column_totals = matrix.sum(axis=0)
    shares = matrix / column_totals
    _, number_of_topics = shares.shape
    colnames = ["Topic #{}".format(str(ix)) for ix in range(number_of_topics)]
    percentages = np.round(100 * shares, decimals=4)
    return pd.DataFrame(data=percentages, columns=colnames, index=users)

In [None]:
# Topic coverage for each document, in percent: how a document's weight is split across the topics.
df_coverage = get_topic_coverage(W, titles)
df_coverage.head(n=15)

In [None]:
# Document importance in topics.
# For each document, its rank within each topic (1 = the document with the largest weight in that topic).
# If the weight of a document in a topic is (numerically) zero, no rank is assigned for that topic.
df_importance = get_user_importance_in_topics(W, titles)
df_importance.head(n=15)

In [None]:
# Document weight percentages in topics: each document's share of a topic's total weight.
df_weight_percents = get_user_weight_percentages_in_topics(W, titles)
df_weight_percents.head(n=15)