In [108]:
import os
import pickle as pkl
import numpy as np
import pandas as pd
import logging
import re
from time import time
from gensim.corpora import Dictionary
from gensim.corpora.bleicorpus import BleiCorpus
from gensim.models.fasttext import load_facebook_vectors
from wmd import wmd
from dotenv import load_dotenv
from scipy.sparse import csc_matrix
from sklearn.metrics import recall_score, precision_score, f1_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load environment variables via python-dotenv. Other cells in this notebook read
# CORPUS, SLICE_TYPE, RESULTS and EMBEDDINGS through os.getenv.
load_dotenv()

True

# Load Data

In [3]:
# Directory holding the per-slice corpora and dictionaries.
corpus_dir = f'{os.getenv("CORPUS")}{os.getenv("SLICE_TYPE")}'
# Raw string so that `\d` is a regex digit class (not an invalid string escape), and an
# escaped dot so that only a literal ".mm" extension matches.
corpus_files = sorted(file for file in os.listdir(corpus_dir) if re.match(r"corpus_\d*\.mm$", file))
dict_files = sorted(file for file in os.listdir(corpus_dir) if ".dict" in file)
# Directory holding one HDP result folder per slice.
models_path = f'{os.getenv("RESULTS")}hdp/{os.getenv("SLICE_TYPE")}'
models_dir = sorted(os.listdir(models_path))
# NOTE(review): the three listings are paired purely by their sorted position. The sort is
# lexicographic, so un-padded numeric suffixes (e.g. "corpus_10" vs "corpus_2") would
# misalign slices once there are ten or more -- confirm the naming scheme.

# Slices are numbered from 1; data[slice_id] holds everything needed for that slice.
slices = range(1, len(models_dir)+1)
data = {}
for slice_id in slices:  # named slice_id so the builtin `slice` is not rebound at notebook scope
    # load dictionary {word->id}
    dict_path = f'{corpus_dir}/{dict_files[slice_id-1]}'
    token2id = Dictionary.load(dict_path).token2id

    # get term probability: total frequency of every word id, normalised to sum to 1
    corpus_path = f'{corpus_dir}/{corpus_files[slice_id-1]}'
    corpus = BleiCorpus(corpus_path)
    term_frequency = np.zeros(len(token2id))
    for doc in corpus:
        for (id_word, freq) in doc:
            term_frequency[id_word] += freq
    term_probability = term_frequency / term_frequency.sum()

    # load topics distributions: one line per topic, whitespace-separated integer values
    topics_path = f'{models_path}/{models_dir[slice_id-1]}/mode-topics.dat'
    with open(topics_path, "r") as f:
        topics = np.array([[int(word) for word in line.strip().split()] for line in f])
    # divide every row by its own sum so that each topic row sums to 1
    topics_dists = topics / topics.sum(axis=1, keepdims=True)

    # save data in a dict
    data[slice_id] = {"token2id": token2id, "topics_dists": topics_dists, "term_probability": term_probability}

In [4]:
# Load the word vectors from the path stored in the EMBEDDINGS environment variable.
# build_similarity_graph reads this module-level `embeddings` object and passes it to wmd.
embeddings = load_facebook_vectors(os.getenv("EMBEDDINGS"))

In [152]:
def get_sparse_matrix(data, size):
    """Build a binary CSC matrix with ones at the given 1-based positions.

    Parameters
    ----------
    data : iterable of (i, j) pairs
        1-based (row, column) positions to set to 1. Repeated pairs are
        harmless: the cell is still 1.
    size : tuple (n_rows, n_cols)
        Shape of the resulting matrix.

    Returns
    -------
    scipy.sparse.csc_matrix
        int8 matrix of shape ``size`` with 1 at every listed position.

    Raises
    ------
    IndexError
        If a pair falls outside ``1..n_rows`` x ``1..n_cols``. An index of 0
        is rejected explicitly; after the 1-based shift it would become -1
        and silently address the last row/column.
    """
    n_rows, n_cols = size
    # De-duplicate first: the coordinate constructor sums repeated entries,
    # whereas the intended value of a labelled cell is exactly 1.
    positions = sorted({(int(i), int(j)) for (i, j) in data})
    for (i, j) in positions:
        if not (1 <= i <= n_rows and 1 <= j <= n_cols):
            raise IndexError(f"position ({i}, {j}) is outside a matrix of shape ({n_rows}, {n_cols})")
    rows = np.array([i - 1 for (i, _) in positions], dtype=np.intp)
    cols = np.array([j - 1 for (_, j) in positions], dtype=np.intp)
    values = np.ones(len(positions), dtype=np.int8)
    # Building from coordinates in one go avoids per-item assignment into a
    # CSC matrix, which changes the sparsity structure on every write.
    return csc_matrix((values, (rows, cols)), shape=(n_rows, n_cols), dtype=np.int8)

In [153]:
# Hard-coded reference links between consecutive slices. labels[k] lists 1-based
# (i, j) pairs: topic i of slice k+1 is linked to topic j of slice k+2.
labels = [
    [(1,4), (2,2), (3,2), (4,1), (5,5), (6,3), (7,2)],
    [(1,4), (2,2), (2,3), (3,5), (4,1), (5,6)],
    [(1,4), (2,5), (3,2), (4,5), (5,1), (6,6)],
    [(1,2), (2,4), (4,5), (5,4), (5,7), (6,3)],
    [(1,3), (2,2), (3,7), (4,5), (5,6), (7,1)],
]

# One binary matrix per pair of consecutive slices, shaped
# (number of topics in the slice, number of topics in the following slice).
ground_truth = []
for slice_id in slices[:-1]:  # named slice_id so the builtin `slice` is not rebound here
    K1 = len(data[slice_id]["topics_dists"])
    K2 = len(data[slice_id+1]["topics_dists"])
    ground_truth.append(get_sparse_matrix(labels[slice_id-1], (K1, K2)))

In [154]:
def build_similarity_graph(data, q, alpha):
    """Compute topic-to-topic similarity matrices between consecutive slices.

    Parameters
    ----------
    data : dict
        Maps a slice id to a dict with the keys "token2id", "topics_dists"
        and "term_probability". Slices are paired in ascending key order.
    q, alpha
        Forwarded unchanged to ``wmd`` as keyword arguments.

    Returns
    -------
    list of numpy.ndarray
        One (K1, K2) matrix per pair of consecutive slices, where K1 and K2
        are the topic counts of the earlier and later slice. Entry (i, j) is
        ``1 / (1 + distance)`` with ``distance`` the value returned by ``wmd``
        for topic i of the earlier slice and topic j of the later one.

    Notes
    -----
    Reads the module-level ``embeddings`` object. The slice ids are taken
    from ``data`` itself instead of the notebook-level ``slices`` range, so
    the result depends only on the arguments (plus ``embeddings``).
    """
    slice_ids = sorted(data)
    similarity_graph = []
    # zip over (id, next id) yields exactly the consecutive pairs; the last
    # slice has no successor and therefore starts no pair.
    for current_id, next_id in zip(slice_ids, slice_ids[1:]):
        current, following = data[current_id], data[next_id]
        token2id1 = current["token2id"]
        token2id2 = following["token2id"]
        topics_dists1 = current["topics_dists"]
        topics_dists2 = following["topics_dists"]
        termp1 = current["term_probability"]
        termp2 = following["term_probability"]
        K1 = len(topics_dists1)
        K2 = len(topics_dists2)
        similarity_matrix = np.zeros((K1, K2))
        for i in range(K1):
            topic_i = topics_dists1[i]
            for j in range(K2):
                topic_j = topics_dists2[j]
                distance = wmd(embeddings, token2id1, token2id2, topic_i, topic_j,
                               termp1, termp2, q=q, alpha=alpha)
                # map a distance to a similarity: 0 -> 1, larger -> closer to 0
                similarity_matrix[i, j] = 1/(1+distance)
        similarity_graph.append(similarity_matrix)
    return similarity_graph

In [155]:
def pruning_graph(graph, zeta):
    """Binarise every matrix of ``graph`` against one global quantile cut-off.

    The cut-off is the ``zeta`` quantile of all entries of all matrices taken
    together. Each returned matrix is a CSC matrix holding 1 where the
    original entry is strictly greater than the cut-off and 0 elsewhere.
    """
    all_weights = np.concatenate([weights.flatten() for weights in graph])
    cutoff = np.quantile(all_weights, zeta)

    pruned = []
    for weights in graph:
        keep = (weights > cutoff).astype(int)
        pruned.append(csc_matrix(keep))
    return pruned

In [156]:
def metrics(ground_truth, prediction):
    """Score predicted graphs against the reference graphs.

    The first ``len(ground_truth)`` matrices of both sequences are densified,
    flattened and concatenated into one label vector each; the vectors are
    then scored with scikit-learn.

    Returns the tuple ``(recall, precision, f_score)``.
    """
    n_graphs = len(ground_truth)

    def _as_vector(graphs):
        # .toarray() densifies each sparse matrix before flattening
        return np.concatenate([graphs[k].toarray().flatten() for k in range(n_graphs)])

    y_true = _as_vector(ground_truth)
    y_pred = _as_vector(prediction)
    return (
        recall_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )

In [163]:
# Build the similarity graph once for a single (q, alpha) setting; the following cell
# prunes this same graph at several zeta values.
similarity_graph = build_similarity_graph(data, q=0.8, alpha=0.6)

In [164]:
# (recall, precision, f-score) of the pruned graph against ground_truth, one tuple per
# zeta from 0.05 up to (but excluding) 1 in steps of 0.05.
[metrics(ground_truth, pruning_graph(similarity_graph, zeta)) for zeta in np.arange(0.05, 1, 0.05).round(2)]

[(1.0, 0.08635097493036212, 0.15897435897435896),
 (1.0, 0.09117647058823529, 0.16711590296495957),
 (1.0, 0.09657320872274143, 0.17613636363636365),
 (1.0, 0.10264900662251655, 0.18618618618618618),
 (1.0, 0.10954063604240283, 0.19745222929936307),
 (1.0, 0.11742424242424243, 0.21016949152542375),
 (1.0, 0.12601626016260162, 0.22382671480144403),
 (1.0, 0.13656387665198239, 0.24031007751937986),
 (0.967741935483871, 0.14423076923076922, 0.2510460251046025),
 (0.967741935483871, 0.15873015873015872, 0.2727272727272727),
 (0.967741935483871, 0.17647058823529413, 0.2985074626865672),
 (0.967741935483871, 0.1986754966887417, 0.3296703296703297),
 (0.967741935483871, 0.22727272727272727, 0.36809815950920244),
 (0.967741935483871, 0.2631578947368421, 0.4137931034482759),
 (0.967741935483871, 0.3157894736842105, 0.47619047619047616),
 (0.967741935483871, 0.39473684210526316, 0.5607476635514018),
 (0.9032258064516129, 0.49122807017543857, 0.6363636363636364),
 (0.8064516129032258, 0.657894736

In [None]:
# Grid search: every (q, alpha) pair requires a full graph build; each built graph is then
# pruned and scored at every zeta.
grid = {
    "q": [0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 1.0],
    "alpha": np.linspace(0.2, 1, 5).round(1),
    "zeta": np.arange(0.05, 1, 0.05).round(2),
}
graphs = []       # one entry per (q, alpha): parameters, build time and the unpruned graph
performance = []  # one row per (q, alpha, zeta)
for q in grid["q"]:
    for alpha in grid["alpha"]:
        print(f"q:{q}, alpha: {alpha}")
        # time only the graph construction, in whole seconds
        ti = time()
        similarity_graph = build_similarity_graph(data, q, alpha)
        tf = time()
        delta_time = round(tf-ti)
        for zeta in grid["zeta"]:
            prediction = pruning_graph(similarity_graph, zeta)
            recall, precision, f_score = metrics(ground_truth, prediction)
            performance.append([q, alpha, delta_time, zeta, recall, precision, f_score])
        graphs.append({"q":q, "alpha":alpha, "time[s]":delta_time ,"graph":similarity_graph})
# Build the frame from `performance` (the list filled above); the column names follow the
# order of the 7 values appended per row, including the build time.
df_performance = pd.DataFrame(
    performance,
    columns=["q", "alpha", "time[s]", "zeta", "recall", "precision", "f-score"],
)

q:0.2, alpha: 0.2


In [None]:
# save results
# All three artefacts go to the same "graph/" folder under RESULTS; create it up front so
# the writes below do not fail on a missing directory.
graph_dir = f'{os.getenv("RESULTS")}graph/'
os.makedirs(graph_dir, exist_ok=True)

# reference links (list of sparse matrices)
ground_truth_path = f'{os.getenv("RESULTS")}graph/ground_truth.pkl'
with open(ground_truth_path, "wb") as f:
    pkl.dump(ground_truth, f, pkl.HIGHEST_PROTOCOL)

# unpruned similarity graphs with their (q, alpha) and build time
graph_path = f'{os.getenv("RESULTS")}graph/graphs.pkl'
with open(graph_path, "wb") as f:
    pkl.dump(graphs, f, pkl.HIGHEST_PROTOCOL)

# grid-search scores as a "|"-separated table without the index column
df_path = f'{os.getenv("RESULTS")}graph/performance.csv'
df_performance.to_csv(df_path, sep="|", index=False)

In [122]:
# Creates an empty DataFrame that only carries the metric column names.
# NOTE(review): no other visible cell reads `df_results`; this looks like a leftover
# scratch cell -- confirm before relying on or removing it.
df_results = pd.DataFrame(columns = ["q", "alpha", "zeta", "recall", "precision", "f-score"])