In [1]:
import numpy as np
from scipy.sparse import csr_matrix

In [2]:
def count_tf_idf_repr(topics, cw_words, tf_idf_t):
    """Collect per-word occurrence statistics for every word used in ``topics``.

    Parameters
    ----------
    topics : iterable of iterables of str
        Each inner iterable holds the words of one topic.
    cw_words : array-like of str
        Vocabulary; position ``k`` names row ``k`` of ``tf_idf_t``.
    tf_idf_t : scipy sparse matrix
        Representation indexed by vocabulary position along its rows.

    Returns
    -------
    cw_frequency : dict[str, float]
        Number of stored entries in the word's row.
    cw_docs : dict[str, set]
        Column indices at which the word's row is non-zero.
    n_docs : float
        Total number of stored entries over all rows of ``tf_idf_t``.
        NOTE(review): despite its name this is not ``tf_idf_t.shape[1]``; it is
        kept as-is because the scores computed downstream depend on it.

    Raises
    ------
    ValueError
        If a topic word does not match exactly one vocabulary entry.
    """
    vocabulary = np.asarray(cw_words)
    cw_frequency = {}
    cw_docs = {}

    for iter_topic in topics:
        for word in iter_topic:
            if word in cw_frequency:
                # Same word in several topics: the statistics are identical.
                continue

            word_index = np.flatnonzero(vocabulary == word)
            if word_index.size != 1:
                raise ValueError(
                    f"word {word!r} matches {word_index.size} vocabulary entries; "
                    "expected exactly one"
                )

            # Slice the row once and reuse it for both statistics.
            word_row = tf_idf_t[word_index]
            # getnnz() on the 1-row slice returns a plain int, avoiding the
            # deprecated float(<1-element ndarray>) conversion.
            cw_frequency[word] = float(word_row.getnnz())
            cw_docs[word] = set(word_row.nonzero()[1])

    # Equals the sum of per-row getnnz(1), without slicing every row.
    n_docs = float(tf_idf_t.getnnz())

    return cw_frequency, cw_docs, n_docs

In [3]:
def pmi(topics, word_frequency, term_docs, n_docs, n_top_words):
    """Compute a PMI-based and a normalised-PMI-based score for each topic.

    For every unordered pair of words ``(ti, tj)`` in a topic, two terms are
    accumulated::

        pmi_t  += log(((c_ij + 1) / n_docs) / ((c_i * c_j) / n_docs ** 2))
        npmi_t += -log((c_ij + 0.01) / n_docs)

    where ``c_i`` / ``c_j`` come from ``word_frequency`` and ``c_ij`` is the
    size of the intersection of the two words' ``term_docs`` sets.

    Parameters
    ----------
    topics : sequence of sequences of str
        Words of each topic.
    word_frequency : dict[str, float]
        Occurrence count per word.
    term_docs : dict[str, set]
        Set of document indices per word.
    n_docs : number
        Normalising total used in both formulas.
    n_top_words : number
        Nominal topic size; the PMI sum is scaled by
        ``1 / (n_top_words * (n_top_words - 1))``.

    Returns
    -------
    (list, list)
        Per-topic PMI scores and per-topic NPMI scores (``pmi_t / npmi_t``).
        A topic with fewer than two words has no pairs: its PMI is ``0.0`` and
        its NPMI is ``nan`` (previously this raised ``ZeroDivisionError``).
    """
    total = float(n_docs)
    n_top_words = float(n_top_words)

    # Distinct names so the result lists do not shadow this function.
    pmi_scores = []
    npmi_scores = []

    for top_w in topics:
        pmi_t = 0.0
        npmi_t = 0.0

        # Pair order (j outer, i inner) is kept so the floating-point sums
        # come out the same as before.
        for j in range(1, len(top_w)):
            tj = top_w[j]
            c_j = word_frequency[tj]
            docs_j = term_docs[tj]

            for i in range(j):
                ti = top_w[i]
                c_i = word_frequency[ti]
                c_i_and_j = len(term_docs[ti].intersection(docs_j))

                pmi_t += np.log(((c_i_and_j + 1.0) / total) /
                                ((c_i * c_j) / total ** 2))

                npmi_t += -1.0 * np.log((c_i_and_j + 0.01) / total)

        weight = 1.0 / (n_top_words * (n_top_words - 1.0))
        pmi_scores.append(weight * pmi_t)

        if len(top_w) < 2:
            # No word pairs: the ratio would be 0.0 / 0.0.
            npmi_scores.append(float("nan"))
        else:
            npmi_scores.append(pmi_t / npmi_t)

    return pmi_scores, npmi_scores

## Settings

In [4]:
# Run selection: dataset name and the directory of the experiment to evaluate.
dataset = "uber"
source = "../fasttext_wiki_bert_max"

# Topic size passed to pmi() as n_top_words.
top_words = 20

# Both inputs share the same per-dataset results directory.
# NOTE(review): "struture" is spelled as-is; presumably it matches the file on disk.
results_dir = f"{source}/results/{dataset}"
hierarchical_file = f"{results_dir}/hierarchical_struture.txt"
npz_input_file = f"{results_dir}/cluwords_representation_{dataset}.npz"

In [5]:
# One independent, initially empty topic list per hierarchy depth (0, 1, 2).
topics = {depth: [] for depth in range(3)}
topics

{0: [], 1: [], 2: []}

In [6]:
# Start from empty lists so re-running this cell does not append duplicates.
for depth_topics in topics.values():
    depth_topics.clear()

# Leading tabs encode depth: none -> 0, one -> 1, two or more -> 2.
with open(hierarchical_file) as hierarchy_input:
    for line in hierarchy_input:
        # split() drops the indentation and tolerates repeated spaces.
        words = line.split()
        if not words:
            # A blank line would otherwise become the bogus topic [""].
            continue

        if line.startswith("\t\t"):
            topics[2].append(words)
        elif line.startswith("\t"):
            topics[1].append(words)
        else:
            topics[0].append(words)

In [7]:
# np.load on an .npz returns an NpzFile that holds the archive open; the
# context manager closes it once both arrays have been read.
with np.load(npz_input_file) as loaded:
    cluwords_repr = loaded['tfidf']
    cluwords_vocab = loaded['feature_names']

## Depth 0

In [8]:
# Depth 0: word statistics for the depth-0 topics.
# The representation is transposed so its rows can be indexed by vocabulary position.
repr_transposed = csr_matrix(cluwords_repr).transpose()
cluwords_freq, cluwords_docs, n_docs = count_tf_idf_repr(
    topics=topics[0],
    cw_words=cluwords_vocab,
    tf_idf_t=repr_transposed,
)

In [9]:
# Score the depth-0 topics using the statistics currently held in
# cluwords_freq / cluwords_docs / n_docs.
depth0_scores = pmi(topics[0], cluwords_freq, cluwords_docs, n_docs, top_words)
pmi_0, npmi_0 = depth0_scores

## Depth 1

In [10]:
# Depth 1: word statistics for the depth-1 topics (overwrites the previous
# cluwords_freq / cluwords_docs / n_docs).
repr_transposed = csr_matrix(cluwords_repr).transpose()
cluwords_freq, cluwords_docs, n_docs = count_tf_idf_repr(
    topics=topics[1],
    cw_words=cluwords_vocab,
    tf_idf_t=repr_transposed,
)

In [11]:
# Score the depth-1 topics using the statistics currently held in
# cluwords_freq / cluwords_docs / n_docs.
depth1_scores = pmi(topics[1], cluwords_freq, cluwords_docs, n_docs, top_words)
pmi_1, npmi_1 = depth1_scores

## Depth 2

In [12]:
# Depth 2: word statistics for the depth-2 topics (overwrites the previous
# cluwords_freq / cluwords_docs / n_docs).
repr_transposed = csr_matrix(cluwords_repr).transpose()
cluwords_freq, cluwords_docs, n_docs = count_tf_idf_repr(
    topics=topics[2],
    cw_words=cluwords_vocab,
    tf_idf_t=repr_transposed,
)

In [13]:
# Score the depth-2 topics using the statistics currently held in
# cluwords_freq / cluwords_docs / n_docs.
depth2_scores = pmi(topics[2], cluwords_freq, cluwords_docs, n_docs, top_words)
pmi_2, npmi_2 = depth2_scores

## Prints

In [14]:
# Mean and sample standard deviation (ddof=1) of NPMI per depth, plus the
# pooled scores of all three depths.
npmi_by_label = {
    "Depth_0": npmi_0,
    "Depth_1": npmi_1,
    "Depth_2": npmi_2,
    "Overall": npmi_0 + npmi_1 + npmi_2,  # list concatenation, not a sum
}
for label, scores in npmi_by_label.items():
    print(f"{label} {np.mean(scores)} {np.std(scores, ddof=1)}")

Depth_0 0.9596271989177383 0.023698397054884413
Depth_1 0.9397599022420315 0.034727460690898666
Depth_2 0.9022070756863788 0.04083932554365045
Overall 0.9069380827160495 0.042099472650988255
