In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

## NPMI

In [2]:
def count_tf_idf_repr(topics, cw_words, tf_idf_t):
    cw_frequency = {}
    cw_docs = {}
    for iter_topic in topics:
        for word in iter_topic:
            word_index = np.where(cw_words == word)[0]
            cw_frequency[word] = float(tf_idf_t[word_index].data.shape[0])
            cw_docs[word] = set(tf_idf_t[word_index].nonzero()[1])

    n_docs = float(tf_idf_t.data.shape[0])

    return cw_frequency, cw_docs, n_docs

In [3]:
def pmi(topics, word_frequency, term_docs, n_docs, n_top_words):
    pmi = []
    npmi = []

    n_top_words = float(n_top_words)

    for t in range(len(topics)):
        top_w = topics[t]
        # top_w = topico.split(' ')

        pmi_t = 0.0
        npmi_t = 0.0

        for j in range(1, len(top_w)):
            for i in range(0, j):
                ti = top_w[i]
                tj = top_w[j]

                c_i = word_frequency[ti]
                c_j = word_frequency[tj]
                c_i_and_j = len(term_docs[ti].intersection(term_docs[tj]))

                pmi_t += np.log(((c_i_and_j + 1.0) / float(n_docs)) /
                                ((c_i * c_j) / float(n_docs) ** 2))

                npmi_t += -1.0 * np.log((c_i_and_j + 0.01) / float(n_docs))

        peso = 1.0 / (n_top_words * (n_top_words - 1.0))

        pmi.append(peso * pmi_t)
        npmi.append(pmi_t / npmi_t)

    return pmi, npmi

## Coherence

In [None]:
def coherence(topics, word_frequency, term_docs):
    coherence = []

    for t in range(len(topics)):
        top_w = topics[t]

        coherence_t = 0.0
        for i in range(1, len(top_w)):
            for j in range(0, i):
                cont_wi = word_frequency[top_w[j]]
                cont_wi_wj = float(
                    len(term_docs[top_w[j]].intersection(term_docs[top_w[i]])))
                coherence_t += np.log((cont_wi_wj + 1.0) / cont_wi)

        coherence.append(coherence_t)

    return coherence

## W2V-L1

In [None]:
from gensim.models import KeyedVectors

def w2v_metric(topics, word_embedding_path, distance_type="cos_dist", top_words=10, embedding_type=False):
        word_vectors = KeyedVectors.load_word2vec_format(f"{word_embedding_path}", binary=embedding_type)
#         model = word_vectors.wv
        values = []

        for words in topics:
#             words = topic.split(' ')
            value = calc_dist_2(words, word_vectors, distance_type, top_words)
            values.append(value)

        return values


In [None]:
import scipy.spatial.distance as sci_dist

def calc_dist_2(words, w2v_model, distance_type, t):
    l1_dist = 0
    l2_dist = 0
    cos_dist = 0
    coord_dist = 0
    t = float(t)

    for word_id1 in range(len(words)):
        for word_id2 in range(word_id1 + 1, len(words)):
            # Calcular L1 w2v metric
            l1_dist += (sci_dist.euclidean(
                w2v_model[words[word_id1]], w2v_model[words[word_id2]]))

            # Calcular L2 w2v metric
            l2_dist += (sci_dist.sqeuclidean(
                w2v_model[words[word_id1]], w2v_model[words[word_id2]]))

            # Calcular cos w2v metric
            cos_dist += (sci_dist.cosine(
                w2v_model[words[word_id1]], w2v_model[words[word_id2]]))

            # Calcular coordinate w2v metric
            coord_dist += (sci_dist.sqeuclidean(
                w2v_model[words[word_id1]], w2v_model[words[word_id2]]))

    if distance_type == 'l1_dist':
        return l1_dist / (t * (t - 1.0))
    elif distance_type == 'l2_dist':
        return l2_dist / (t * (t - 1.0))
    elif distance_type == 'cos_dist':
        return cos_dist / (t * (t - 1.0))
    elif distance_type == 'coord_dist':
        return coord_dist / (t * (t - 1.0))

    return .0


## Load Representation

In [4]:
def read_input(input_path):
    arq = open(input_path, 'r', encoding="utf-8")
    doc = arq.readlines()
    arq.close()
    documents = list(map(str.rstrip, doc))
    n_documents = len(documents)
    return documents

In [5]:
def tfidf(data):
    vectorizer = TfidfVectorizer(encoding='utf-8', 
                                 analyzer='word', 
                                 max_df=1.0, 
                                 min_df=1,
                                 norm='l2', 
                                 use_idf=True, 
                                 smooth_idf=False, 
                                 sublinear_tf=True)
    X = vectorizer.fit_transform(data)
    return X, vectorizer.get_feature_names()

In [6]:
dataset = "wpp"
source_dataset = "../textual_folds"
dataset_input_file = f"{source_dataset}/{dataset}Pre.txt"

In [7]:
data = read_input(input_path=dataset_input_file)
X, feature_names = tfidf(data=data)
X.shape, len(feature_names)

((2956, 1777), 1777)

## Settings

In [8]:
baseline="bertopic"
top_words = 5

### BertTopic

In [9]:
if baseline == 'bertopic':
    source = "../BertTopicResults"
    hierarchical_file = f"{source}/topic_words_{top_words}_{dataset}Pre"

### HPAM

In [10]:
if baseline == "hpam":
    source = "../HPAMResults"
    hierarchical_file = f"{source}/{dataset}.txt"

### HLDA

In [None]:
# TODO
if baseline == "hlda":
    source = ""
    hierarchical_file = f"{source}/{dataset}.txt"

In [11]:
topics = {
    0: list(),
    1: list(),
    2: list()
}
topics

{0: [], 1: [], 2: []}

In [12]:
if baseline == "bertopic":
    with open(hierarchical_file) as hierachical_input:
        for topic in hierachical_input:
            if topic.startswith("\t\t"):
                topics[2].append(topic.replace("\t\t", "").strip().split(" "))
            elif topic.startswith("\t"):
                topics[1].append(topic.replace("\t", "").strip().split(" "))
            else:
                topics[0].append(topic.strip().split(" "))
        
        hierachical_input.close()


In [13]:
import re

if baseline == "hpam":
    with open(hierarchical_file) as hierachical_input:
        for topic in hierachical_input:
            if topic.startswith("Super-topic"):
                topics[1].append(topic.replace("\t", " ").strip().split(" ")[4:])
            elif topic.startswith("Root:"):
                topics[0].append(topic.replace("]", " ").strip().split(" ")[2:])
            elif re.match("[0-9]+:", topic):
                topics[2].append(topic.replace("\t", " ").strip().split(" ")[2:])
        
        hierachical_input.close()


In [14]:
# topics[0]

## Depth 0

In [15]:
features_freq, features_docs, n_docs = count_tf_idf_repr(topics[0],
                                                         np.asarray(feature_names),
                                                         csr_matrix(X).transpose())

In [16]:
pmi_0, npmi_0 = pmi(topics=topics[0],
                    word_frequency=features_freq,
                    term_docs=features_docs,
                    n_docs=n_docs,
                    n_top_words=top_words)

In [None]:
coherence_0 = coherence(
                      topics=topics[0],
                      word_frequency=cluwords_freq,
                      term_docs=cluwords_docs
                     )

In [None]:
w2v_score_0 = w2v_metric(
                        topics=topics[0], 
                        word_embedding_path="/home/felipeviegas/Codes_phd/cluhtm/wiki-news-300d-1M.vec"
                      )

## Depth 1

In [17]:
features_freq, features_docs, n_docs = count_tf_idf_repr(topics[1],
                                                         np.asarray(feature_names),
                                                         csr_matrix(X).transpose())

In [18]:
pmi_1, npmi_1 = pmi(topics=topics[1],
                    word_frequency=features_freq,
                    term_docs=features_docs,
                    n_docs=n_docs,
                    n_top_words=top_words)

In [None]:
coherence_1 = coherence(
                      topics=topics[1],
                      word_frequency=cluwords_freq,
                      term_docs=cluwords_docs
                     )

In [None]:
w2v_score_1 = w2v_metric(
                        topics=topics[1], 
                        word_embedding_path="/home/felipeviegas/Codes_phd/cluhtm/wiki-news-300d-1M.vec"
                      )

## Depth 2

In [19]:
cluwords_freq, cluwords_docs, n_docs = count_tf_idf_repr(topics[2],
                                                         np.asarray(feature_names),
                                                         csr_matrix(X).transpose())

In [20]:
pmi_2, npmi_2 = pmi(topics=topics[2],
                    word_frequency=features_freq,
                    term_docs=features_docs,
                    n_docs=n_docs,
                    n_top_words=top_words)

In [None]:
coherence_2 = coherence(
                      topics=topics[2],
                      word_frequency=cluwords_freq,
                      term_docs=cluwords_docs
                     )

In [None]:
w2v_score_2 = w2v_metric(
                        topics=topics[2], 
                        word_embedding_path="/home/felipeviegas/Codes_phd/cluhtm/wiki-news-300d-1M.vec"
                      )

## Prints

In [21]:
print("NPMPI")
print(f"Depth_0 {np.mean(npmi_0)} {np.std(npmi_0, ddof=1)}")
print(f"Depth_1 {np.mean(npmi_1)} {np.std(npmi_1, ddof=1)}")
print(f"Depth_2 {np.mean(npmi_2)} {np.std(npmi_2, ddof=1)}")
print(f"Overall {np.mean(npmi_0 + npmi_1 + npmi_2)} {np.std(npmi_0 + npmi_1 + npmi_2, ddof=1)}")

Depth_0 0.4080004959011888 0.09538909777142704
Depth_1 nan nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(
  ret = ret.dtype.type(ret / rcount)


In [None]:
print("Coherence")
print(f"Depth_0 {np.mean(coherence_0)} {np.std(coherence_0, ddof=1)}")
print(f"Depth_1 {np.mean(coherence_1)} {np.std(coherence_1, ddof=1)}")
print(f"Depth_2 {np.mean(coherence_2)} {np.std(coherence_2, ddof=1)}")
print(f"Overall {np.mean(coherence_0 + coherence_1 + coherence_2)} {np.std(coherence_0 + coherence_1 + coherence_2, ddof=1)}")

In [None]:
print("W2V-L1")
print(f"Depth_0 {np.mean(w2v_score_0)} {np.std(w2v_score_0, ddof=1)}")
print(f"Depth_1 {np.mean(w2v_score_1)} {np.std(w2v_score_1, ddof=1)}")
print(f"Depth_2 {np.mean(w2v_score_2)} {np.std(w2v_score_2, ddof=1)}")
print(f"Overall {np.mean(w2v_score_0 + w2v_score_1 + w2v_score_2)} {np.std(w2v_score_0 + w2v_score_1 + w2v_score_2, ddof=1)}")