Notebook to study the CluWords representation. It contains plots that help to understand the co-occurrence among the features created by the CluWords representation, in contrast to the TF-IDF representation. 
This notebook is located in the repository root because it runs the CluWords representation.

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import networkx as nx

%matplotlib inline
%reload_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore") 

## Load TFIDF Representation

In [2]:
def read_input(input_path):
    """Read a UTF-8 text file and return its lines without trailing whitespace.

    Parameters
    ----------
    input_path : str or path-like
        Path of the text file to read. Each line is treated as one document.

    Returns
    -------
    list of str
        One entry per line, in file order, with trailing whitespace
        (including the newline) removed via ``str.rstrip``.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(input_path, 'r', encoding="utf-8") as arq:
        return [line.rstrip() for line in arq]

In [3]:
def tfidf(data):
    """Fit a TF-IDF model on ``data`` and return the document-term matrix.

    The vectorizer is configured with word-level analysis, no document
    frequency cut-offs (``max_df=1.0``, ``min_df=1``), L2 row normalisation,
    un-smoothed IDF and sublinear term frequency.

    Parameters
    ----------
    data : iterable of str
        The raw documents, one string per document.

    Returns
    -------
    X
        The matrix returned by ``TfidfVectorizer.fit_transform``.
    feature_names : list of str
        Vocabulary terms, ordered like the columns of ``X``.
    """
    vectorizer = TfidfVectorizer(encoding='utf-8', 
                                 analyzer='word', 
                                 max_df=1.0, 
                                 min_df=1,
                                 norm='l2', 
                                 use_idf=True, 
                                 smooth_idf=False, 
                                 sublinear_tf=True)
    X = vectorizer.fit_transform(data)

    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer the replacement and fall back for older installations; the
    # result is converted to a plain list so callers see the same type as before.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    return X, feature_names

In [4]:
def read_tfidf(dataset_input_file):
    """Build the TF-IDF representation of the corpus stored in a text file.

    Parameters
    ----------
    dataset_input_file : str or path-like
        Path forwarded to ``read_input``.

    Returns
    -------
    tuple
        ``(X, feature_names)`` where ``X`` is the matrix produced by ``tfidf``
        and ``feature_names`` is the vocabulary converted to a NumPy array.
    """
    matrix, terms = tfidf(data=read_input(input_path=dataset_input_file))
    return matrix, np.asarray(terms)


## Settings

In [5]:
# Corpora to analyse; the cells below only use the first entry (datasets[0]).
datasets = ['wpp']
# Full list of corpus identifiers that can be put in `datasets`:
#   'wpp', 'ang', 'drop', 'ever', 'face', 'info', 'pinter', 'trip',
#   'tweets', 'uber', 'acm', '20News'

# Method identifier; 'hpam' is the alternative value.
# NOTE(review): not read by any cell visible in this notebook - confirm it is still needed.
method = 'cw'

# 'tfidf' or 'cw'
# NOTE(review): `base_npmi_score` and `hierarchical` are not read by any cell
# visible in this notebook - confirm they are still needed.
base_npmi_score = 'cw'
hierarchical = True

In [6]:
def read_npz(npz_input_file):
    """Load a stored representation and its vocabulary from an ``.npz`` archive.

    Parameters
    ----------
    npz_input_file : str, path-like or file object
        Archive that contains the arrays ``'tfidf'`` and ``'feature_names'``.

    Returns
    -------
    tuple
        ``(cluwords_repr, cluwords_vocab)``: the arrays stored under the
        ``'tfidf'`` and ``'feature_names'`` keys, respectively.

    Raises
    ------
    KeyError
        If either key is missing from the archive.
    """
    # ``np.load`` on an .npz returns a lazy NpzFile that keeps the underlying
    # file open; read both arrays inside the context manager so it is closed.
    with np.load(npz_input_file) as loaded:
        cluwords_repr = loaded['tfidf']
        cluwords_vocab = loaded['feature_names']

    return cluwords_repr, cluwords_vocab


In [7]:
from typing import Tuple

from cluwords import Cluwords, CluwordsTFIDF

def gen_cluwords(word_count: int, embedding_file_path: str, dataset: str, datasets_path: str,
                 k_neighbors: int = 500, threshold: float = 0.4, n_jobs: int = 4) -> Tuple[np.ndarray, np.ndarray]:
    """Generate the CluWords representation of a dataset.

    Parameters
    ----------
    word_count : int
        Number of words, passed as ``n_words`` to both ``Cluwords`` and
        ``CluwordsTFIDF``.
    embedding_file_path : str
        Path of the word-embedding file handed to ``Cluwords``.
    dataset : str
        Dataset identifier handed to both classes.
    datasets_path : str
        Path of the dataset text file handed to ``CluwordsTFIDF``.
    k_neighbors : int, optional
        ``k_neighbors`` argument of ``Cluwords`` (default 500).
    threshold : float, optional
        ``threshold`` argument of ``Cluwords`` (default 0.4).
    n_jobs : int, optional
        ``n_jobs`` argument of ``Cluwords`` (default 4).

    Returns
    -------
    tuple
        ``(cluwords.fit_transform(), cluwords.vocab_cluwords)`` of the
        ``CluwordsTFIDF`` instance.
    """
    # The Cluwords instance is discarded: it is constructed only for whatever
    # it does on construction. NOTE(review): presumably it computes and saves
    # the cluwords that CluwordsTFIDF reads back - confirm in the cluwords module.
    Cluwords(algorithm="knn_cosine",
                embedding_file_path=embedding_file_path,
                n_words=word_count,
                k_neighbors=k_neighbors,
                threshold=threshold,
                n_jobs=n_jobs,
                dataset=dataset
    )

    cluwords = CluwordsTFIDF(
        dataset=dataset,
        dataset_file_path=datasets_path,
        n_words=word_count,
        path_to_save_cluwords=".",
        class_file_path="."
    )

    # fit_transform() is evaluated before vocab_cluwords is read, as in the
    # original tuple expression.
    return cluwords.fit_transform(), cluwords.vocab_cluwords


## NetworkX

In [8]:
from matplotlib import pylab
import networkx as nx

def save_graph(graph,file_name):
    """Draw ``graph`` with a spring layout and save the drawing to ``file_name``.

    Nodes are coloured by their ``'color'`` attribute; node labels are not drawn.

    Parameters
    ----------
    graph : networkx graph
        Every node must carry a ``'color'`` attribute.
    file_name : str or path-like
        Destination passed to ``Figure.savefig``.

    Raises
    ------
    KeyError
        If a node has no ``'color'`` attribute.
    """
    # Keep an explicit handle on the one figure we create. The earlier version
    # created an anonymous figure and then switched to ``plt.figure(1)``, which
    # is a different figure whenever another one is already open, and closed
    # "the current figure" twice.
    fig = plt.figure(figsize=(20, 20), dpi=80)
    try:
        ax = fig.add_subplot(111)
        ax.axis('off')
        pos = nx.spring_layout(graph, k=0.5, iterations=20)
        colors = [attrs['color'] for _, attrs in graph.nodes(data=True)]
        nx.draw_networkx_nodes(graph, pos, node_color=colors, node_size=200, ax=ax)
        nx.draw_networkx_edges(graph, pos, ax=ax)

        fig.savefig(file_name, bbox_inches="tight")
    finally:
        # Always release the figure, even when layout, drawing or saving fails.
        plt.close(fig)

In [22]:
import seaborn as sns
import matplotlib.pyplot as plt

def barplot(filename, df, y):
    """Save a bar plot of column ``y`` against ``doc_id`` to ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Destination passed to ``Figure.savefig``.
    df : pandas.DataFrame
        Data to plot; must contain a ``"doc_id"`` column and the column ``y``.
    y : str
        Name of the column plotted on the y axis.
    """
    # Use an explicit figure/axes pair so the function neither depends on the
    # ``pylab`` import of another cell nor closes an unrelated "current" figure.
    fig, ax = plt.subplots(figsize=(20, 20), dpi=80)
    try:
        sns.barplot(data=df, x="doc_id", y=y, ax=ax)
        fig.savefig(filename, bbox_inches="tight")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
## Co-occurrence

**tfidf**

In [9]:
# Directory holding the pre-processed corpora and the corpus analysed here.
source_dataset = "textual_folds"
dataset = datasets[0]

# TF-IDF matrix and its vocabulary for the selected corpus.
dataset_input_file = f"{source_dataset}/{dataset}Pre.txt"
tfidf_repr, vocab_tf_idf = read_tfidf(dataset_input_file=dataset_input_file)

In [10]:
# Number of top-ranked terms used as graph seeds in the cells below.
top_words = 5

# Column indices ordered by decreasing total TF-IDF weight over all documents.
max_indexes = np.asarray(tfidf_repr.sum(axis=0)).ravel().argsort()[::-1]

# Vocabulary in that ranking order (displayed as the cell output).
np.take(np.asarray(vocab_tf_idf), max_indexes)

array(['application', 'update', 'messenger', ..., 'withdraw', 'setup',
       'manually'], dtype='<U15')

In [11]:
# Term-by-term co-occurrence matrix (X^T X) and its binarised version: with
# non-negative weights, entry (i, j) is non-zero when terms i and j both have
# a non-zero weight in at least one common document.
co_occ_tfidf = tfidf_repr.transpose().dot(tfidf_repr)
bin_co_occ_tfidf = (co_occ_tfidf>0)*1

G = nx.Graph()

# Loop invariants hoisted out of the nested loops: the seed term indices and a
# set of the seed terms for constant-time membership tests.
top_indexes_tfidf = max_indexes[:top_words]
top_term_set_tfidf = {vocab_tf_idf[i] for i in top_indexes_tfidf}

for x in top_indexes_tfidf:
    # Seed (top-ranked) terms are orange.
    G.add_node(vocab_tf_idf[x], color='orange')
    # Densify the row once and visit only its non-zero columns (ascending),
    # instead of indexing the sparse matrix element by element.
    row = np.asarray(bin_co_occ_tfidf[x].toarray()).ravel()
    for y in np.flatnonzero(row):
        if x == y:
            continue  # skip the self co-occurrence on the diagonal
        if vocab_tf_idf[y] not in top_term_set_tfidf:
            # Co-occurring terms that are not seeds are blue.
            G.add_node(vocab_tf_idf[y], color='blue')

        G.add_edge(vocab_tf_idf[x], vocab_tf_idf[y], weight=4)

save_graph(G, "graph_tfidf.pdf")


**CluWords**

In [12]:
cw_source = "fasttext_wiki-bert_concat-using-distinct-seeds"
source_dataset = "textual_folds"

# Location of a previously saved representation. It is not loaded here; use
# `read_npz(npz_input_file)` to load it instead of regenerating the CluWords.
# (Alternative layout without the seed suffix:
#  f"{cw_source}/results/{dataset}/cluwords_representation_{dataset}.npz")
npz_input_file = f"{cw_source}/results/{dataset}_seed-42/cluwords_representation_{dataset}.npz"

embeddings_path = f"{cw_source}/datasets/gn_w2v_models/{dataset}.txt"

# The first space-separated token of the first line is used as the word count.
# Read it inside a context manager so the file handle is closed.
with open(embeddings_path, "r") as embeddings_file:
    word_count = int(embeddings_file.readline().strip().split(" ")[0])

cluwords_repr, vocab_cw = gen_cluwords(word_count=word_count, 
                                    embedding_file_path=embeddings_path, 
                                    dataset=dataset, 
                                    datasets_path=f"{source_dataset}/{dataset}Pre.txt")

kNN...
N Threads: 4
NearestNeighbors K=500
Time 0.005459096000038244
NN Distaces
Time 0.3758323150000251
Saving cluwords
Matrix(1523, 1523)
Number of cluwords 1523
Matrix(1523, 1523)

Computing TF...
tf shape (2956, 1523)
Cluwords TF done in 0.082s.

Computing IDF...
Read data
Time 0.03401001299999962
Dot tf and hyp_aux
Time 0.045774392000055286
Divide hyp_aux by itself
Time 0.026231893999977274
Dot tf and bin hyp_aux
Time 0.04536375100002488
Divide _dot and _dot_bin
Time 0.04238399799999115
Sum
Time 0.001355369000009432
log
Time 5.123400001139089e-05


In [13]:
# Top-ranked TF-IDF terms, computed once instead of on every loop iteration.
top_terms_tfidf = vocab_tf_idf[max_indexes[:top_words]]

# Positions (ascending, i.e. in CluWords vocabulary order) of those terms in
# the CluWords vocabulary. `flatnonzero` always yields an integer index array,
# so the result stays usable for indexing even when nothing matches.
max_indexes_cw = np.flatnonzero(np.isin(vocab_cw, top_terms_tfidf))

# Side-by-side check that both vocabularies hold the same top terms.
vocab_cw[max_indexes_cw], top_terms_tfidf

(array(['application', 'chat', 'download', 'messenger', 'update'],
       dtype='<U14'),
 array(['application', 'update', 'messenger', 'chat', 'download'],
       dtype='<U15'))

In [14]:
# Feature-by-feature co-occurrence matrix (X^T X) of the CluWords
# representation and its binarised version.
co_occ_cluwords = np.dot(cluwords_repr.T, cluwords_repr)
bin_co_occu_cluwords = (co_occ_cluwords>0)*1

G_cw = nx.Graph()

# Loop invariant hoisted out of the nested loops: a set of the seed terms for
# constant-time membership tests.
top_term_set_cw = {vocab_cw[i] for i in max_indexes_cw}

for x in max_indexes_cw: 
    # Seed terms (the top TF-IDF terms located in the CluWords vocabulary) are orange.
    G_cw.add_node(vocab_cw[x], color='orange')
    # Visit only the non-zero columns of this row, in ascending order.
    for y in np.flatnonzero(np.asarray(bin_co_occu_cluwords[x]).ravel()):
        if x == y:
            continue  # skip the self co-occurrence on the diagonal
        if vocab_cw[y] not in top_term_set_cw:
            # Co-occurring terms that are not seeds are blue.
            G_cw.add_node(vocab_cw[y], color='blue')

        G_cw.add_edge(vocab_cw[x], vocab_cw[y], weight=4)

save_graph(G_cw, "graph_cw.pdf")            

In [21]:
import pandas as pd 

# Presence/absence (1/0) version of each representation.
bin_tfidf_repr = (tfidf_repr > 0) * 1
bin_cluwords_repr = (cluwords_repr > 0) * 1

# Number of non-zero features per document (row) in each representation.
sum_tokens_tfidf = np.asarray(bin_tfidf_repr.sum(axis=1)).flatten()
sum_tokens_cw = bin_cluwords_repr.sum(axis=1)

# One row per document, holding both counts side by side.
pd_cw = pd.DataFrame(
    {
        "doc_id": np.arange(sum_tokens_cw.shape[0]),
        "cw_count": sum_tokens_cw,
        "tfidf_count": sum_tokens_tfidf,
    }
)


In [23]:
# Per-document count of non-zero TF-IDF features.
barplot(filename="bar_plot_tfidf.pdf", df=pd_cw, y="tfidf_count")

In [24]:
# Per-document count of non-zero CluWords features.
barplot(filename="bar_plot_cw.pdf", df=pd_cw, y="cw_count")

Checking whether L2-normalising the embeddings impacts the cosine similarity

In [76]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Path relative to the repository root (where this notebook lives) rather than
# an absolute path that only exists on one machine.
embeddings_file_path = "fasttext_wiki-bert_concat-using-distinct-seeds/datasets/gn_w2v_models/wpp.txt"

embeddings = []
# The context manager closes the file once all vectors have been read.
with open(embeddings_file_path, "r") as file:
    file.readline()  # skip the first (header) line
    for line in file:
        # The first token of each line is dropped; the remaining tokens are
        # the vector components (converted to float32 below).
        features = line.strip().split(" ")[1:]
        embeddings.append(features)

embeddings = np.asarray(embeddings, dtype=np.float32)

#Normalization
# Divide every row by its Euclidean norm.
norm_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)


# Pairwise cosine similarity
cos_sim = cosine_similarity(embeddings)
cos_sim_norm = cosine_similarity(norm_embeddings)

# Print every row whose similarities differ after rounding to one decimal place.
for emb, n_emb in zip(cos_sim.round(decimals=1), cos_sim_norm.round(decimals=1)):
    if not np.array_equal(emb, n_emb):
        print("----------------------------------")
        print(emb)
        print(n_emb)
        print("----------------------------------")

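Normalising the embeddings to unit length does not change the cosine similarity distribution, because cosine similarity is invariant to the scale of each vector; it only affects the numerical precision, so any rows printed above differ solely through floating-point rounding. 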