This notebook computes the NPMI score. We added this notebook in the root path because it computes the CluWords representation while the other notebook located in the notebook's path only read the log files to create the CluWords representation.

In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import re

## Load TFIDF Represantation

In [2]:
def read_input(input_path):
    arq = open(input_path, 'r', encoding="utf-8")
    doc = arq.readlines()
    arq.close()
    documents = list(map(str.rstrip, doc))
    n_documents = len(documents)
    return documents

In [3]:
def tfidf(data):
    vectorizer = TfidfVectorizer(encoding='utf-8', 
                                 analyzer='word', 
                                 max_df=1.0, 
                                 min_df=1,
                                 norm='l2', 
                                 use_idf=True, 
                                 smooth_idf=False, 
                                 sublinear_tf=True)
    X = vectorizer.fit_transform(data)
    return X, vectorizer.get_feature_names()

In [4]:
def read_tfidf(dataset_input_file):
    data = read_input(input_path=dataset_input_file)
    X, feature_names = tfidf(data=data)
    
    return X, np.asarray(feature_names)


## NPMI Methods

In [5]:
def count_tf_idf_repr(topics, cw_words, tf_idf_t):
    cw_frequency = {}
    cw_docs = {}
    for iter_topic in topics:
        for word in iter_topic:
            word_index = np.where(cw_words == word)[0]
            cw_frequency[word] = float(tf_idf_t[word_index].data.shape[0])
            cw_docs[word] = set(tf_idf_t[word_index].nonzero()[1])

    n_docs = float(tf_idf_t.data.shape[0])

    return cw_frequency, cw_docs, n_docs

In [6]:
def pmi(topics, word_frequency, term_docs, n_docs, n_top_words):
    pmi = []
    npmi = []
    n_exceptions = 0

    n_top_words = float(n_top_words)

    for t in range(len(topics)):
        top_w = topics[t]
        # top_w = topico.split(' ')

        pmi_t = 0.0
        npmi_t = 0.0

        for j in range(1, len(top_w)):
            for i in range(0, j):
                ti = top_w[i]
                tj = top_w[j]

                c_i = word_frequency[ti]
                c_j = word_frequency[tj]
                c_i_and_j = len(term_docs[ti].intersection(term_docs[tj]))
                
                try:
                    pmi_t += np.log(((c_i_and_j + 1.0) / float(n_docs)) /
                                    ((c_i * c_j) / float(n_docs) ** 2))
                except ZeroDivisionError:
                    n_exceptions += 1
                    pmi_t += .0

                npmi_t += -1.0 * np.log((c_i_and_j + 0.01) / float(n_docs))

        peso = 1.0 / (n_top_words * (n_top_words - 1.0))

        pmi.append(peso * pmi_t)
        npmi.append(pmi_t / npmi_t)

    return pmi, npmi, n_exceptions

In [7]:
def read_hierarchical(option):
    topics = {
        0: list(),
        1: list(),
        2: list()
    }
    if option == 'hpam':
        with open(hierarchical_file) as hierachical_input:
            for topic in hierachical_input:
                if topic.startswith("Super-topic"):
                    topics[1].append(topic.replace("\t", " ").strip().split(" ")[4:])
                elif topic.startswith("Root:"):
                    topics[0].append(topic.replace("]", " ").strip().split(" ")[2:])
                elif re.match("[0-9]+:", topic):
                    topics[2].append(topic.replace("\t", " ").strip().split(" ")[2:])

            hierachical_input.close()

    else:
        with open(hierarchical_file) as hierachical_input:
            for topic in hierachical_input:
                if topic.startswith("\t\t"):
                    topics[2].append(topic.replace("\t\t", "").strip().split(" "))
                elif topic.startswith("\t"):
                    topics[1].append(topic.replace("\t", "").strip().split(" "))
                else:
                    topics[0].append(topic.strip().split(" "))

            hierachical_input.close()
        
    
    return topics


## Settings

In [8]:
datasets = ['wpp', 'trip', 'acm']
# datasets = ['wpp','ang','drop','ever','face','info','pinter','trip','tweets','uber','acm','20News']
# method = 'hpam'
method = 'cw'

# 'tfidf' or 'cw'
base_npmi_score = 'cw'
hierarchical = True

In [9]:
def read_npz(npz_input_file):
    loaded  = np.load(npz_input_file)
    cluwords_repr = loaded['tfidf']
    cluwords_vocab = loaded['feature_names']
    
    return cluwords_repr, cluwords_vocab


In [10]:
from typing import Tuple

from cluwords import Cluwords, CluwordsTFIDF

def gen_cluwords(word_count: int, embedding_file_path: str, dataset: str, datasets_path: str) -> Tuple[np.array, np.array]:
    Cluwords(algorithm="knn_cosine",
                embedding_file_path=embedding_file_path,
                n_words=word_count,
                k_neighbors=500,
                threshold=0.4,
                n_jobs=4,
                dataset=dataset
    )

    cluwords = CluwordsTFIDF(
        dataset=dataset,
        dataset_file_path=datasets_path,
        n_words=word_count,
        path_to_save_cluwords=".",
        class_file_path="."
    )

    return cluwords.fit_transform(), cluwords.vocab_cluwords


In [11]:
result_scores = []
for dataset in datasets:
    total_errors = 0
    npmi_0_score = list()
    npmi_1_score = list()
    npmi_2_score = list()
    npmi_all_score = list()
#     top_sets = [5, 10, 20]
    top_sets = [10]
    
    if base_npmi_score == 'cw':
        # cw_source = '../fasttext_wiki_bert_max'
        cw_source = "fasttext_wiki"
        source_dataset = "textual_folds"
        npz_input_file = f"{cw_source}/results/{dataset}_seed-42/cluwords_representation_{dataset}.npz"
        # npz_input_file = f"{cw_source}/results/{dataset}/cluwords_representation_{dataset}.npz"
        # cluwords_repr, vocab = read_npz(npz_input_file)
        embeddings_path = f"{cw_source}/datasets/gn_w2v_models/{dataset}.txt"
        word_count = int(open(embeddings_path, "r").readline().strip().split(" ")[0])
        cluwords_repr, vocab = gen_cluwords(word_count=word_count, 
                                            embedding_file_path=embeddings_path, 
                                            dataset=dataset, 
                                            datasets_path=f"{source_dataset}/{dataset}Pre.txt")
        
    else: # 'tfidf'
        source_dataset = "textual_folds"
        dataset_input_file = f"{source_dataset}/{dataset}Pre.txt"
        cluwords_repr, vocab = read_tfidf(dataset_input_file)

    for top_words in top_sets:
        npmi_all = list()
        if method == 'cw':
            source = "fasttext_wiki"
            # source = "../fasttext_wiki_bert_max"
            # hierarchical_file = f"{source}/results/{dataset}/hierarchical_struture.txt"
            hierarchical_file = f"{source}/results/{dataset}_seed-42/hierarchical_struture.txt"
        elif method == 'bertopic':
            source = "../BertTopicResults"
            hierarchical_file = f"{source}/topic_words_{top_words}_{dataset}Pre"
        elif method == 'hpam':
            source = "../HPAMResults"
            hierarchical_file = f"{source}/{dataset}.txt"
                

        topics = read_hierarchical(method)

        words_freq, words_docs, n_docs = count_tf_idf_repr(topics[0],
                                                                 vocab,
                                                                 csr_matrix(cluwords_repr).transpose())

        pmi_0, npmi_0, errors = pmi(topics=topics[0],
                                    word_frequency=words_freq,
                                    term_docs=words_docs,
                                    n_docs=n_docs,
                                    n_top_words=top_words)
        
        if hierarchical:
            words_freq, words_docs, n_docs = count_tf_idf_repr(topics[1],
                                                                 vocab,
                                                                 csr_matrix(cluwords_repr).transpose())
            pmi_1, npmi_1, errors = pmi(topics=topics[1],
                                    word_frequency=words_freq,
                                    term_docs=words_docs,
                                    n_docs=n_docs,
                                    n_top_words=top_words)
            
            words_freq, words_docs, n_docs = count_tf_idf_repr(topics[2],
                                                                 vocab,
                                                                 csr_matrix(cluwords_repr).transpose())
            pmi_2, npmi_2, errors = pmi(topics=topics[2],
                                    word_frequency=words_freq,
                                    term_docs=words_docs,
                                    n_docs=n_docs,
                                    n_top_words=top_words)
            
        
        total_errors += errors

        npmi_0_score.append(np.mean(npmi_0)) 
        npmi_0_score.append(np.std(npmi_0, ddof=1))
        npmi_0_score.append(len(npmi_0))
        npmi_all += npmi_0
        
        if hierarchical:
            npmi_1_score.append(np.mean(npmi_1)) 
            npmi_1_score.append(np.std(npmi_1, ddof=1))
            npmi_1_score.append(len(npmi_1))
            npmi_all += npmi_1
            
            npmi_2_score.append(np.mean(npmi_2)) 
            npmi_2_score.append(np.std(npmi_2, ddof=1))
            npmi_2_score.append(len(npmi_2))
            npmi_all += npmi_2
            
            npmi_all_score.append(np.mean(npmi_all)) 
            npmi_all_score.append(np.std(npmi_all, ddof=1))
            npmi_all_score.append(len(npmi_all_score))
    
#     print(f"{dataset} {' '.join([str(score) for score in npmi_0_score])} -- {total_errors}")
    if hierarchical:
        result_scores.append(f"{dataset} {' '.join([str(score) for score in npmi_0_score])} {' '.join([str(score) for score in npmi_1_score])} {' '.join([str(score) for score in npmi_2_score])} {' '.join([str(score) for score in npmi_all_score])}")
    else:
        result_scores.append(f"{dataset} {' '.join([str(score) for score in npmi_0_score])}")


kNN...
N Threads: 4
NearestNeighbors K=500
Time 0.0023139869999795337
NN Distaces
Time 0.4293993559995215
Saving cluwords
Matrix(1523, 1523)
Number of cluwords 1523
Matrix(1523, 1523)

Computing TF...
tf shape (2956, 1523)
Cluwords TF done in 0.080s.

Computing IDF...
Read data
Time 0.033854158999929496
Dot tf and hyp_aux
Time 0.07118862900006206
Divide hyp_aux by itself
Time 0.06565197200052353
Dot tf and bin hyp_aux
Time 0.10804990900032863
Divide _dot and _dot_bin
Time 0.04342712200013921
Sum
Time 0.0016727610000089044
log
Time 6.276700059970608e-05
kNN...
N Threads: 4
NearestNeighbors K=500
Time 0.0037983800002621138
NN Distaces
Time 0.7238689619998695
Saving cluwords
Matrix(2622, 2622)
Number of cluwords 2622
Matrix(2622, 2622)

Computing TF...
tf shape (2816, 2622)
Cluwords TF done in 0.230s.

Computing IDF...
Read data
Time 0.06438148100005492
Dot tf and hyp_aux
Time 0.13713144599933003
Divide hyp_aux by itself
Time 0.08956939600011538
Dot tf and bin hyp_aux
Time 0.1764066650002

In [12]:
for result in result_scores:
    print(result)


wpp 0.9729594510966713 0.007028090616159849 5 0.9624725859577038 0.011261918607699081 59 0.9559611529171758 0.0202587039156831 238 0.9575146828519066 0.018945586738232042 2
trip 0.976717514170312 0.006624623386081015 7 0.9689831890435137 0.0162268053664875 65 0.9665342321216167 0.016753526697464563 280 0.9671889627303216 0.016571509466998504 2
acm 0.9673157227883215 0.015431302744087246 7 0.9597710458080372 0.01475043862920575 38 0.9555658916263627 0.014147945156229583 268 0.9563391973037982 0.014364098469128523 2
