In [11]:
import os
import json
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors
from similarity_measures import compute_similarity, get_topic_topn
from dotenv import load_dotenv
from time import time

# 1. Load environment variables

In [3]:
# Read the experiment configuration from the process environment.
# load_dotenv() is called first so that values from a local .env file
# (if any) are visible through os.environ.
load_dotenv()
env = os.environ

# Input locations and the name of the similarity measure.
corpus = env.get("CORPUS")
model_path = env.get("MODEL_PATH")
similarity = env.get("SIMILARITY")
embeddings_path = env.get("EMBEDDINGS")

# Numeric settings arrive as strings and are converted here.
q = float(env.get("TOPIC_QUANTILE_THRESHOLD"))
topn = int(env.get("TOPN"))

# Directory prefix used to build the stored-graph file name.
graph_dir = env.get("GRAPH_PATH")

# 2. Load data

In [8]:
# One dictionary file and one model directory per epoch. Both listings are
# sorted so that position `epoch` in each one is used for the same epoch.
# NOTE(review): the pairing is purely positional -- confirm both directories
# hold the same number of entries in the same lexical order.
dict_files = sorted(file for file in os.listdir(corpus) if ".dict" in file)
models_dir = sorted(os.listdir(model_path))

epochs = range(0, len(models_dir))
data = {}
for epoch in epochs:
    # os.path.join works whether or not the configured directories end with
    # a path separator (plain string concatenation required one).
    epoch_dir = os.path.join(model_path, models_dir[epoch])

    # load dictionary {word->id}
    dict_path = os.path.join(corpus, dict_files[epoch])
    token2id = Dictionary.load(dict_path).token2id

    # load topics: one whitespace-separated row of integers per line
    topics_path = os.path.join(epoch_dir, "mode-topics.dat")
    with open(topics_path, "r") as f:
        topics = np.array([[int(word) for word in line.split()] for line in f])
    # divide every row by its own sum so each row adds up to one
    topics_dists = topics / topics.sum(axis=1, keepdims=True)

    # load word assignments and get mixture weights
    word_assignments_path = os.path.join(epoch_dir, "mode-word-assignments.dat")
    word_assignments = []
    with open(word_assignments_path, "r") as f:
        next(f, None)  # the first line is skipped (header)
        # (doc_id, word_id, topic_id, x)
        for line in f:  # stream the file instead of loading every line at once
            fields = line.split()
            if fields:  # tolerate blank lines, e.g. a trailing newline
                word_assignments.append(int(fields[2]))

    # Share of assigned tokens per distinct topic id found in the file.
    # NOTE(review): np.unique only reports ids that actually occur, so a topic
    # without assigned tokens gets no entry and the vector can be shorter than
    # `topics_dists` -- confirm this cannot happen for these models.
    tokens = len(word_assignments)
    mixture_weights = np.unique(word_assignments, return_counts=True)[1] / tokens

    # save data in a dict
    data[epoch] = {
        "token2id": token2id,
        "topics_dists": topics_dists,
        "mixture_weights": mixture_weights,
    }

In [49]:
# Word vectors are loaded only when the configured similarity is "wmd";
# for any other measure `embeddings` is None.
embeddings = (
    KeyedVectors.load_word2vec_format(embeddings_path)
    if similarity == "wmd"
    else None
)

In [80]:
# Read the edge list of a previously stored graph. The file name encodes the
# configured similarity measure and the threshold `q` as an integer percent.
graph_path = f'{graph_dir}graph_{similarity}_q{int(100*q)}.json'
with open(graph_path, "r") as graph_file:
    stored_graph = json.load(graph_file)
graph = stored_graph["edges"]

# 3. Compute similarity graph

In [99]:
def build_graph(data, q):
    """Build the similarity graph between topics of consecutive epochs.

    For every pair of consecutive epochs in ``data``, each topic of the
    earlier epoch is compared with each topic of the later epoch through
    ``compute_similarity`` and one edge is emitted per topic pair.

    The similarity measure and the word embeddings are not parameters: they
    are read from the module-level names ``similarity`` and ``embeddings``.

    Parameters
    ----------
    data : dict
        Maps an integer epoch to a dict holding at least ``"token2id"`` and
        ``"topics_dists"`` (an indexable collection with one entry per topic).
    q : float
        Forwarded unchanged to ``compute_similarity`` as keyword ``q``.

    Returns
    -------
    list of dict
        One ``{"s": ..., "t": ..., "w": ...}`` dict per topic pair, where
        ``"s"``/``"t"`` are ``"<epoch+1>-<topic+1>"`` labels (1-based) of the
        earlier/later topic and ``"w"`` is the computed similarity.
    """
    graph = []
    # Take the epochs from `data` itself instead of the notebook-level
    # `epochs` range, so the function does not depend on hidden cell state.
    ordered_epochs = sorted(data)
    for epoch, next_epoch in zip(ordered_epochs, ordered_epochs[1:]):
        # get vocabularies and topics distributions by epoch
        token2id1 = data[epoch]["token2id"]
        token2id2 = data[next_epoch]["token2id"]
        topics_dists1 = data[epoch]["topics_dists"]
        topics_dists2 = data[next_epoch]["topics_dists"]
        K1 = len(topics_dists1)
        K2 = len(topics_dists2)
        # save similarities in a matrix
        similarity_matrix = np.zeros((K1, K2))
        for i in range(K1):
            topic_i = topics_dists1[i]
            for j in range(K2):
                topic_j = topics_dists2[j]
                # get similarity
                similarity_matrix[i, j] = compute_similarity(
                    similarity,
                    embeddings,
                    token2id1,
                    token2id2,
                    topic_i,
                    topic_j,
                    q=q,
                )
                # update similarity graph with edge data
                graph.append({
                    "s": f"{epoch+1}-{i+1}",
                    "t": f"{next_epoch+1}-{j+1}",
                    "w": similarity_matrix[i, j],
                })

    return graph

def prunning_graph(graph, epsilon):
    """Keep only the edges whose weight reaches the ``epsilon``-quantile.

    Parameters
    ----------
    graph : list of dict
        Edges, each with a numeric weight under the key ``"w"``.
    epsilon : float
        Quantile level in [0, 1] applied to the edge weights; the resulting
        value is the pruning threshold.

    Returns
    -------
    list of dict
        The edges of ``graph`` (same dict objects, original order) whose
        weight is greater than or equal to the threshold. An empty ``graph``
        yields an empty list.
    """
    weights = pd.Series([edge["w"] for edge in graph], dtype=float)
    # Use the `epsilon` argument: the previous code read a global named `e`
    # and only worked when the caller's loop variable had that exact name.
    threshold = weights.quantile(epsilon)
    return [edge for edge in graph if edge["w"] >= threshold]

def heuristic_error(approx_graph, graph):
    """Relative disagreement between the edge sets of two graphs.

    Edges are identified by their ``("s", "t")`` pair only; weights are
    ignored. The result is the number of edges present in exactly one of the
    two graphs divided by the sum of both edge-set sizes.

    Parameters
    ----------
    approx_graph, graph : list of dict
        Edge lists whose items carry the keys ``"s"`` and ``"t"``.

    Returns
    -------
    float
        0.0 when both edge sets are equal, 1.0 when they are disjoint.
        Two empty graphs are treated as identical (0.0) instead of raising
        ``ZeroDivisionError``.
    """
    approx_edges = {(edge["s"], edge["t"]) for edge in approx_graph}
    edges = {(edge["s"], edge["t"]) for edge in graph}

    total = len(approx_edges) + len(edges)
    if total == 0:
        return 0.0
    # Symmetric difference = edges found in exactly one of the two sets.
    return len(approx_edges ^ edges) / total

In [101]:
# Values passed as `q` to build_graph; one timed graph construction per value.
quantiles = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1]
# Quantile levels passed to prunning_graph to threshold the edge weights.
epsilons = [0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

In [102]:
# Results, one list entry per tested quantile:
#   "q"        -> the tested quantile
#   "t"        -> wall-clock seconds spent in build_graph
#   str(e)     -> heuristic_error against the stored graph at pruning level e
speedup = {"q": quantiles, "t": []}

# The stored graph does not depend on the tested quantile, so it is pruned
# once per level here instead of once per (quantile, level) pair.
# A plain loop (not a comprehension) is used on purpose so that `e` stays a
# module-level name, exactly as in the original cell.
prunned_reference = {}
for e in epsilons:
    speedup[str(e)] = []
    prunned_reference[e] = prunning_graph(graph, e)

# The loop variable is deliberately not called `q`: that name holds the
# configured threshold used to locate the stored graph file, and overwriting
# it would make re-running the graph-loading cell pick a different file.
for quantile in quantiles:
    print(f"quantile: {quantile}")
    ti = time()
    approx_graph = build_graph(data, quantile)
    tf = time()
    speedup["t"].append(tf - ti)

    for e in epsilons:
        approx_prunned_graph = prunning_graph(approx_graph, e)
        error = heuristic_error(approx_prunned_graph, prunned_reference[e])
        speedup[str(e)].append(error)

# Speedup is expressed relative to the slowest run in the table.
df_speedup = pd.DataFrame(speedup)
max_time = df_speedup["t"].max()
df_speedup["speedup"] = max_time / df_speedup["t"]

quantile: 0.2
quantile: 0.3
quantile: 0.4
quantile: 0.5
quantile: 0.6
quantile: 0.7
quantile: 0.8
quantile: 0.9
quantile: 0.95
quantile: 0.99
quantile: 1


In [94]:
# Display the table of timings, per-level errors and speedups built above.
df_speedup

Unnamed: 0,q,t,0.6,0.7,0.8,0.9,0.95,0.99,speedup
0,0.2,2.385511,0.301205,0.298387,0.204819,0.095238,0.238095,0.2,1.467569
1,0.3,3.064166,0.198795,0.16129,0.120482,0.02381,0.190476,0.2,1.14253
2,0.4,3.500901,0.13253,0.096774,0.072289,0.02381,0.142857,0.4,1.0


In [103]:
# Store the results as a "|"-separated file without the index column;
# the "Speedup" section reads this file back for plotting.
df_speedup.to_csv("../../data/graph/speedup.csv", sep = "|", index = False)

# 4. Speedup

In [5]:
# Read the stored results back from disk (same "|" separator used when
# writing) and preview the first rows.
df = pd.read_csv("../../data/graph/speedup.csv", sep = "|")
df.head()

Unnamed: 0,q,t,0.6,0.7,0.8,0.9,0.95,0.99,speedup
0,0.2,2.485798,0.301205,0.298387,0.204819,0.095238,0.238095,0.2,1206.507253
1,0.3,3.087434,0.198795,0.16129,0.120482,0.02381,0.190476,0.2,971.400082
2,0.4,3.53702,0.13253,0.096774,0.072289,0.02381,0.142857,0.4,847.926641
3,0.5,5.935852,0.126506,0.08871,0.048193,0.02381,0.095238,0.0,505.257489
4,0.6,21.924542,0.10241,0.072581,0.048193,0.02381,0.095238,0.0,136.793438


In [104]:
# Dual-axis figure: speedup on the primary (log-scaled) y axis and the share
# of correct edges, 100 * (1 - error), on the secondary y axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.update_layout(template="plotly_white")

# speedup
fig.add_trace(
    go.Scatter(
        x=df["q"],
        y=df["speedup"],
        name=r"speedup",
        line=dict(color="#636EFA", width=2),
        marker_symbol="circle",
        marker_size=6,
    ),
    secondary_y=False,
)

# error: one curve per plotted pruning level, described as
# (results column, legend label, line colour, marker symbol)
error_curves = [
    ("0.6", r'$\zeta = 0.6$', "#EF553B", "triangle-up"),
    ("0.8", r'$\zeta = 0.8$', '#00CC96', "triangle-down"),
    ("0.9", r'$\zeta = 0.9$', '#AB63FA', "asterisk-open"),
    ("0.95", r'$\zeta = 0.95$', '#FFA15A', "x"),
    ("0.99", r'$\zeta = 0.99$', '#FECB52', "cross"),
]
for column, label, colour, symbol in error_curves:
    fig.add_trace(
        go.Scatter(
            x=df["q"],
            y=100*(1-df[column]),
            name=label,
            line=dict(color=colour, width=2),
            marker_symbol=symbol,
            marker_size=6,
        ),
        secondary_y=True,
    )

# axes
fig.update_xaxes(title_text="Porcentaje de la CDF utilizada del tópico")
fig.update_yaxes(type="log", title_text="Speedup", secondary_y=False)
fig.update_yaxes(range=(0, 110), title_text="Arcos correctos (%)", secondary_y=True)

# legend anchored at the bottom-left corner of the plotting area
legend_style = dict(
    yanchor="bottom", y=0.0,
    xanchor="left", x=0.05,
    bordercolor="Black", borderwidth=0.5,
)
fig.update_layout(legend=legend_style)
fig.show()

In [105]:
# Write the figure to disk as an EPS file.
fig.write_image("../../tesis/img/ch4/speedup.eps")