In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import re

## Load TF-IDF Representation

In [2]:
def read_input(input_path):
    """Read a UTF-8 text file and return its lines as a list of documents.

    Each line of the file is treated as one document. Trailing whitespace
    (including the newline) is stripped from every line; leading whitespace
    and empty lines are kept.

    Parameters
    ----------
    input_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        One entry per line of the file, in file order.
    """
    # The context manager closes the handle even if reading fails.
    with open(input_path, 'r', encoding="utf-8") as arq:
        return [line.rstrip() for line in arq]

In [3]:
def tfidf(data):
    """Fit a TF-IDF model on ``data`` and return the matrix and vocabulary.

    The vectorizer tokenizes on words, keeps every term (``min_df=1``,
    ``max_df=1.0``), uses sublinear term frequency, non-smoothed IDF and
    L2-normalizes each row.

    Parameters
    ----------
    data : iterable of str
        Raw documents, one string per document.

    Returns
    -------
    X : sparse matrix
        Result of ``fit_transform`` (documents in rows, terms in columns).
    feature_names : list of str
        Term for each column of ``X``.
    """
    vectorizer = TfidfVectorizer(encoding='utf-8',
                                 analyzer='word',
                                 max_df=1.0,
                                 min_df=1,
                                 norm='l2',
                                 use_idf=True,
                                 smooth_idf=False,
                                 sublinear_tf=True)
    X = vectorizer.fit_transform(data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        # .tolist() keeps the historical return type (a plain list of str).
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    return X, feature_names

In [4]:
def read_tfidf(dataset_input_file):
    """Load a one-document-per-line text file and vectorize it with TF-IDF.

    Returns a ``(matrix, vocabulary)`` pair where the vocabulary is a numpy
    array so that it can be compared element-wise against a single word.
    """
    documents = read_input(input_path=dataset_input_file)
    matrix, names = tfidf(data=documents)
    vocabulary = np.asarray(names)
    return matrix, vocabulary


## NPMI Methods

In [5]:
def count_tf_idf_repr(topics, cw_words, tf_idf_t):
    """Collect per-word occurrence statistics for every word used in ``topics``.

    Parameters
    ----------
    topics : iterable of iterable of str
        Topics, each given as a sequence of words.
    cw_words : numpy.ndarray of str
        Vocabulary; position ``k`` names row ``k`` of ``tf_idf_t``.
    tf_idf_t : scipy sparse matrix
        Sparse representation indexed by term on the first axis (the driver
        passes the transposed document-term matrix).

    Returns
    -------
    cw_frequency : dict[str, float]
        Number of stored entries in the word's row(s). A word that is not in
        ``cw_words`` gets ``0.0``.
    cw_docs : dict[str, set]
        Column indices where the word's row(s) are non-zero.
    n_docs : float
        Number of stored entries of the whole matrix.
    """
    cw_frequency = {}
    cw_docs = {}
    for iter_topic in topics:
        for word in iter_topic:
            # The statistics depend only on the word, so a word shared by
            # several topics is computed once.
            if word in cw_frequency:
                continue
            word_index = np.where(cw_words == word)[0]
            # Slice the sparse matrix once and reuse it for both statistics.
            word_rows = tf_idf_t[word_index]
            cw_frequency[word] = float(word_rows.data.shape[0])
            cw_docs[word] = set(word_rows.nonzero()[1])

    # NOTE(review): this is the total count of stored entries (nnz), not the
    # number of columns/documents (tf_idf_t.shape[1]). Kept as-is because the
    # scores produced downstream depend on it -- confirm this is intended.
    n_docs = float(tf_idf_t.data.shape[0])

    return cw_frequency, cw_docs, n_docs

In [6]:
def pmi(topics, word_frequency, term_docs, n_docs, n_top_words):
    """Compute a PMI and a normalized PMI score for every topic.

    For each unordered pair of words ``(ti, tj)`` in a topic:

    * PMI term:  ``log(((|D_i & D_j| + 1) / n_docs) / ((c_i * c_j) / n_docs**2))``
    * normalizer term: ``-log((|D_i & D_j| + 0.01) / n_docs)``

    The topic PMI is the sum of PMI terms scaled by
    ``1 / (n_top_words * (n_top_words - 1))``; the topic NPMI is the sum of
    PMI terms divided by the sum of normalizer terms.

    Parameters
    ----------
    topics : sequence of sequence of str
        Topics, each a sequence of words.
    word_frequency : dict[str, float]
        Occurrence count ``c_w`` for each word.
    term_docs : dict[str, set]
        Set of document identifiers for each word.
    n_docs : float
        Corpus-size normalizer used in both log terms.
    n_top_words : int or float
        Nominal number of words per topic, used only for the PMI weight.

    Returns
    -------
    pmi : list of float
        Weighted PMI per topic.
    npmi : list of float
        NPMI per topic; ``0.0`` for a topic that has no word pair.
    n_exceptions : int
        Number of word pairs skipped because of a zero division, plus the
        number of topics whose NPMI denominator was zero.
    """
    # Distinct names so the result lists do not shadow this function.
    pmi_scores = []
    npmi_scores = []
    n_exceptions = 0

    n_top_words = float(n_top_words)
    n_docs = float(n_docs)

    # The weight does not depend on the topic; guard the n_top_words in {0, 1}
    # case, which would otherwise divide by zero.
    weight_denominator = n_top_words * (n_top_words - 1.0)
    peso = 1.0 / weight_denominator if weight_denominator else 0.0

    for top_w in topics:
        pmi_t = 0.0
        npmi_t = 0.0

        for j in range(1, len(top_w)):
            for i in range(0, j):
                ti = top_w[i]
                tj = top_w[j]

                c_i = word_frequency[ti]
                c_j = word_frequency[tj]
                c_i_and_j = len(term_docs[ti].intersection(term_docs[tj]))

                try:
                    pmi_t += np.log(((c_i_and_j + 1.0) / n_docs) /
                                    ((c_i * c_j) / n_docs ** 2))
                except ZeroDivisionError:
                    # A word with zero frequency (e.g. out of vocabulary)
                    # contributes nothing to the PMI sum.
                    n_exceptions += 1

                npmi_t += -1.0 * np.log((c_i_and_j + 0.01) / n_docs)

        pmi_scores.append(peso * pmi_t)

        if npmi_t == 0:
            # No word pairs (empty or single-word topic): the ratio is
            # undefined, so report a neutral score instead of crashing.
            n_exceptions += 1
            npmi_scores.append(0.0)
        else:
            npmi_scores.append(pmi_t / npmi_t)

    return pmi_scores, npmi_scores, n_exceptions

In [7]:
def read_hierarchical(option, path=None):
    """Parse a three-level topic hierarchy file into lists of topic words.

    Parameters
    ----------
    option : str
        ``'hpam'`` selects the HPAM output format, where lines starting with
        ``Root:``, ``Super-topic`` or ``<digits>:`` carry the topics of levels
        0, 1 and 2 respectively. Any other value selects the tab-indented
        format: no leading tab is level 0, one tab is level 1, two tabs is
        level 2.
    path : str or path-like, optional
        File to read. When omitted, the notebook-level global
        ``hierarchical_file`` is used, as before.

    Returns
    -------
    dict[int, list[list[str]]]
        Maps level (0, 1, 2) to the list of topics found at that level, each
        topic being a list of words.
    """
    topics = {
        0: list(),
        1: list(),
        2: list()
    }
    if path is None:
        # Backward-compatible fallback on the global set by the driver cell.
        path = hierarchical_file

    # Explicit encoding, consistent with how the corpus itself is read.
    with open(path, encoding="utf-8") as hierachical_input:
        for topic in hierachical_input:
            if option == 'hpam':
                if topic.startswith("Super-topic"):
                    topics[1].append(topic.replace("\t", " ").strip().split(" ")[4:])
                elif topic.startswith("Root:"):
                    topics[0].append(topic.replace("]", " ").strip().split(" ")[2:])
                elif re.match("[0-9]+:", topic):
                    topics[2].append(topic.replace("\t", " ").strip().split(" ")[2:])
            else:
                # A blank line would otherwise become a bogus one-word topic.
                if not topic.strip():
                    continue
                if topic.startswith("\t\t"):
                    topics[2].append(topic.replace("\t\t", "").strip().split(" "))
                elif topic.startswith("\t"):
                    topics[1].append(topic.replace("\t", "").strip().split(" "))
                else:
                    topics[0].append(topic.strip().split(" "))

    return topics


## Settings

In [8]:
# Corpora to evaluate; each name is substituted into the input/result paths.
datasets = [
    'wpp', 'ang', 'drop', 'ever', 'face', 'info',
    'pinter', 'trip', 'tweets', 'uber', 'acm', '20News',
]

# Topic model whose output is scored: 'cw', 'bertopic' or 'hpam'.
method = 'hpam'

# Representation used to count co-occurrences: 'tfidf' or 'cw'.
base_npmi_score = 'tfidf'

# When True, levels 1 and 2 of the hierarchy are scored in addition to level 0.
hierarchical = True

In [9]:
def read_npz(npz_input_file):
    """Load a saved representation and its vocabulary from an ``.npz`` file.

    Parameters
    ----------
    npz_input_file : str or path-like
        Archive containing the arrays ``'tfidf'`` and ``'feature_names'``.

    Returns
    -------
    cluwords_repr : numpy.ndarray
        Array stored under the key ``'tfidf'``.
    cluwords_vocab : numpy.ndarray
        Array stored under the key ``'feature_names'``.
    """
    # np.load returns a lazy NpzFile that holds the file open; the context
    # manager closes it once both arrays have been materialized.
    with np.load(npz_input_file) as loaded:
        cluwords_repr = loaded['tfidf']
        cluwords_vocab = loaded['feature_names']

    return cluwords_repr, cluwords_vocab


In [17]:
def _score_level(level_topics, vocab, term_doc_matrix, n_top_words):
    """Return ``(pmi, npmi, n_errors)`` for the topics of one hierarchy level."""
    words_freq, words_docs, n_docs = count_tf_idf_repr(level_topics,
                                                       vocab,
                                                       term_doc_matrix)
    return pmi(topics=level_topics,
               word_frequency=words_freq,
               term_docs=words_docs,
               n_docs=n_docs,
               n_top_words=n_top_words)


# For every dataset, print one row: "<dataset> mean std" for level 0 and,
# when `hierarchical` is set, also for level 1, level 2 and all levels pooled.
for dataset in datasets:
    total_errors = 0
    npmi_0_score = list()
    npmi_1_score = list()
    npmi_2_score = list()
    npmi_all_score = list()
    # Topic sizes to evaluate (e.g. [5, 10, 20]).
    top_sets = [20]

    if base_npmi_score == 'cw':
        cw_source = '../fasttext_wiki_bert_max'
        npz_input_file = f"{cw_source}/results/{dataset}/cluwords_representation_{dataset}.npz"
        cluwords_repr, vocab = read_npz(npz_input_file)
    else: # 'tfidf'
        source_dataset = "../textual_folds"
        dataset_input_file = f"{source_dataset}/{dataset}Pre.txt"
        cluwords_repr, vocab = read_tfidf(dataset_input_file)

    # The transposed matrix depends only on the dataset, so build it once
    # instead of once per hierarchy level.
    term_doc_matrix = csr_matrix(cluwords_repr).transpose()

    for top_words in top_sets:
        npmi_all = list()
        if method == 'cw':
            source = "../fasttext_wiki_bert_max"
            hierarchical_file = f"{source}/results/{dataset}/hierarchical_struture.txt"
        elif method == 'bertopic':
            source = "../BertTopicResults"
            hierarchical_file = f"{source}/topic_words_{top_words}_{dataset}Pre"
        elif method == 'hpam':
            source = "../HPAMResults"
            hierarchical_file = f"{source}/{dataset}.txt"
        else:
            # Without this, a typo would silently reuse a stale path (or fail
            # later with a NameError on a fresh kernel).
            raise ValueError(f"Unknown method: {method!r}")

        # read_hierarchical reads the global `hierarchical_file` set above.
        topics = read_hierarchical(method)

        pmi_0, npmi_0, errors = _score_level(topics[0], vocab, term_doc_matrix, top_words)
        # Accumulate per level; adding only once at the end would keep just
        # the last level's error count.
        total_errors += errors

        npmi_0_score.append(np.mean(npmi_0))
        npmi_0_score.append(np.std(npmi_0, ddof=1))
        npmi_all += npmi_0

        if hierarchical:
            pmi_1, npmi_1, errors = _score_level(topics[1], vocab, term_doc_matrix, top_words)
            total_errors += errors
            npmi_1_score.append(np.mean(npmi_1))
            npmi_1_score.append(np.std(npmi_1, ddof=1))
            npmi_all += npmi_1

            pmi_2, npmi_2, errors = _score_level(topics[2], vocab, term_doc_matrix, top_words)
            total_errors += errors
            npmi_2_score.append(np.mean(npmi_2))
            npmi_2_score.append(np.std(npmi_2, ddof=1))
            npmi_all += npmi_2

            npmi_all_score.append(np.mean(npmi_all))
            npmi_all_score.append(np.std(npmi_all, ddof=1))

    # `total_errors` is kept for inspection; it is not part of the printed row.
    if hierarchical:
        print(f"{dataset} {' '.join([str(score) for score in npmi_0_score])} {' '.join([str(score) for score in npmi_1_score])} {' '.join([str(score) for score in npmi_2_score])} {' '.join([str(score) for score in npmi_all_score])}")
    else:
        print(f"{dataset} {' '.join([str(score) for score in npmi_0_score])}")


wpp 0.3039133354432548 0.025572339326475266 0.5259673131595624 0.0656032260834207 0.3838258845871712 0.043449538335606504 0.39591148581762164 0.06183071876434732
ang 0.35387565537057786 0.013446198654510135 0.5538370418113486 0.05116199635023758 0.4053416471694636 0.05048801524652644 0.41825595306694074 0.06615552691574668
drop 0.39702958912934716 0.006622662198203445 0.47450989946577243 0.04954394881850429 0.3773078380728068 0.0322079526117803 0.3862424539735834 0.04392429301723971
ever 0.37692863547313904 0.005476569545040999 0.39603548367315966 0.032748926039161734 0.3630324715299916 0.0264015047621506 0.3661309065333685 0.02853302656007675
face 0.3547297135295019 0.008090634614931678 0.33093371581143355 0.040768944163583724 0.28869201717522075 0.03762620721132944 0.29309250981230545 0.04006120177501992
info 0.6520694296811937 0.004294581820931727 0.4958486403284576 0.030765664376022933 0.5300033349560299 0.03783022501778958 0.5280260299871059 0.0401169984840387
pinter 0.35068861090