In [3]:
import os
import pickle as pkl
import numpy as np
import pandas as pd
import logging
import re
import plotly.graph_objects as go
from time import time
from gensim.corpora import Dictionary
from gensim.models.fasttext import load_facebook_vectors
from similarity_measures import compute_similarity
from dotenv import load_dotenv
from scipy.sparse import csc_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load environment variables so that the os.getenv(...) lookups in the cells below
# (CORPUS, SLICE_TYPE, RESULTS, EMBEDDINGS) can be resolved.
# Presumably they come from a local .env file -- confirm it exists before running.
load_dotenv()

True

# Load Data

In [19]:
# Locate the per-slice corpora, dictionaries and fitted HDP model directories.
corpus_dir = f'{os.getenv("CORPUS")}{os.getenv("SLICE_TYPE")}'
# Raw string with an escaped dot: "\d" inside a normal string literal is an invalid
# escape sequence, and a bare "." would match any character in front of "mm".
corpus_files = sorted(file for file in os.listdir(corpus_dir) if re.match(r"corpus_\d*\.mm$", file))
dict_files = sorted(file for file in os.listdir(corpus_dir) if ".dict" in file)
models_path = f'{os.getenv("RESULTS")}hdp/{os.getenv("SLICE_TYPE")}'
models_dir = sorted(os.listdir(models_path))

# Slices are numbered from 1; slice k is paired with the k-th dictionary file and
# the k-th model directory, both in sorted (lexicographic) order.
# NOTE(review): lexicographic order puts "..._10" before "..._2" -- confirm the
# file names are zero-padded if there can be ten or more slices.
slices = range(1, len(models_dir)+1)
data = {}
for slice_id in slices:  # not named `slice`, to avoid shadowing the builtin
    # load dictionary {word->id}
    dict_path = f'{corpus_dir}/{dict_files[slice_id-1]}'
    token2id = Dictionary.load(dict_path).token2id

    # load topics: one line per topic, whitespace-separated integer word counts
    topics_path = f'{models_path}/{models_dir[slice_id-1]}/mode-topics.dat'
    with open(topics_path, "r") as f:
        topics = np.array([[int(word) for word in line.strip().split()] for line in f])
    # normalise every row (topic) by its own total so that it sums to 1
    topics_dists = topics / topics.sum(axis=1, keepdims=True)

    # save data in a dict keyed by slice number
    data[slice_id] = {"token2id": token2id, "topics_dists": topics_dists}

# Build Similarity Graph

In [78]:
def get_sparse_matrix(data, size):
    """Build a binary sparse indicator matrix from 1-based (row, col) pairs.

    Parameters
    ----------
    data : iterable of (int, int)
        1-based coordinates; the pair ``(i, j)`` sets entry ``[i-1, j-1]`` to 1.
        Repeated pairs are stored once.
    size : tuple of (int, int)
        Shape ``(n_rows, n_cols)`` of the resulting matrix.

    Returns
    -------
    scipy.sparse.csc_matrix
        ``int8`` matrix of shape ``size`` holding 1 at every listed position.

    Raises
    ------
    IndexError
        If a pair lies outside ``1..n_rows`` x ``1..n_cols``. An index of 0 is
        rejected as well; with element-wise assignment it would silently wrap
        around to the last row/column.
    """
    n_rows, n_cols = size
    # De-duplicate first: the coordinate constructor sums repeated entries, whereas
    # the indicator matrix must hold a plain 1.
    coords = sorted({(i - 1, j - 1) for (i, j) in data})
    if not coords:
        return csc_matrix(size, dtype=np.int8)

    index = np.array(coords, dtype=np.intp)
    rows, cols = index[:, 0], index[:, 1]
    if rows.min() < 0 or rows.max() >= n_rows or cols.min() < 0 or cols.max() >= n_cols:
        raise IndexError(f"label outside the valid 1-based range for a matrix of shape {size}")

    # Build the matrix in one shot instead of assigning element by element, which
    # changes the sparsity structure on every write (SparseEfficiencyWarning).
    values = np.ones(len(coords), dtype=np.int8)
    return csc_matrix((values, (rows, cols)), shape=size, dtype=np.int8)


def build_similarity_graph(data, q):
    """Compute topic-to-topic similarity matrices between consecutive slices.

    Parameters
    ----------
    data : dict
        Maps a slice key to ``{"token2id": ..., "topics_dists": ...}``. Slices are
        paired in sorted key order (slice k with the next key), taken from ``data``
        itself rather than from a notebook-level variable.
    q : float
        Forwarded unchanged to ``compute_similarity`` as keyword ``q``.

    Returns
    -------
    list of numpy.ndarray
        One ``(K1, K2)`` matrix per pair of consecutive slices, where ``K1`` / ``K2``
        are the number of topics in the earlier / later slice. Empty when ``data``
        holds fewer than two slices.

    Notes
    -----
    The similarity measure is read from the ``SIMILARITY`` environment variable and
    the embeddings from the notebook-level ``embeddings`` variable; both are passed
    straight to ``compute_similarity``.
    """
    # Loop-invariant: look the measure up once instead of once per topic pair.
    measure = os.getenv("SIMILARITY")
    slice_keys = sorted(data)

    similarity_graph = []
    for current, following in zip(slice_keys, slice_keys[1:]):
        token2id1 = data[current]["token2id"]
        token2id2 = data[following]["token2id"]
        topics_dists1 = data[current]["topics_dists"]
        topics_dists2 = data[following]["topics_dists"]
        K1 = len(topics_dists1)
        K2 = len(topics_dists2)
        similarity_matrix = np.zeros((K1, K2))
        for i, topic_i in enumerate(topics_dists1):
            for j, topic_j in enumerate(topics_dists2):
                similarity_matrix[i, j] = compute_similarity(
                    measure, embeddings,
                    token2id1, token2id2, topic_i, topic_j,
                    q=q)
        similarity_graph.append(similarity_matrix)
    return similarity_graph


def pruning_graph(graph, zeta):
    """Binarise every similarity matrix with a single, graph-wide quantile cut-off.

    Parameters
    ----------
    graph : list of numpy.ndarray
        Similarity matrices, one per pair of consecutive slices.
    zeta : float
        Quantile level handed to ``np.quantile``; the cut-off is computed over the
        edge weights of all matrices pooled together.

    Returns
    -------
    list of scipy.sparse.csc_matrix
        One integer matrix per input matrix, 1 where the weight is greater than or
        equal to the cut-off and 0 elsewhere.
    """
    pooled_weights = np.concatenate([weights.flatten() for weights in graph])
    cutoff = np.quantile(pooled_weights, zeta)

    kept_edges = []
    for weights in graph:
        keep_mask = weights >= cutoff
        kept_edges.append(csc_matrix(keep_mask.astype(int)))
    return kept_edges

def metrics(ground_truth, prediction):
    """Score a pruned graph against the ground truth, edge by edge.

    Parameters
    ----------
    ground_truth : sequence of sparse matrices
        Reference adjacency matrices (anything exposing ``.toarray()``).
    prediction : sequence of sparse matrices
        Predicted adjacency matrices; must hold at least ``len(ground_truth)``
        entries whose shapes match the reference ones.

    Returns
    -------
    pandas.DataFrame
        The ``classification_report`` dictionary as a frame with one row per report
        entry; the entry name ends up in the ``index`` column.
    """
    N = len(ground_truth)
    # Flatten every matrix and concatenate them into one label vector per side.
    y_true = np.concatenate([ground_truth[i].toarray().flatten() for i in range(N)])
    y_pred = np.concatenate([prediction[i].toarray().flatten() for i in range(N)])
    report = classification_report(y_true, y_pred, output_dict=True)
    # Build the frame from the report just computed. The previous code passed the
    # notebook-level `performance` list here, discarding this report entirely.
    report = pd.DataFrame(report).transpose()
    report = report.reset_index()
    return report

In [None]:
# Hand-labelled ground truth. labels[k-1] belongs to the pair (slice k, slice k+1);
# each tuple (i, j) links the 1-based topic i of slice k to topic j of slice k+1.
labels = [
    [(1,4), (2,2), (3,2), (4,1), (5,5), (6,3), (7,2)],
    [(1,4), (2,2), (2,3), (3,5), (4,1), (5,6)],
    [(1,4), (2,5), (3,2), (4,5), (5,1), (6,6)],
    [(1,2), (2,4), (4,5), (5,4), (5,7), (6,3)],
    [(1,3), (2,2), (3,7), (4,5), (5,6), (7,1)],
]

# One sparse indicator matrix per pair of consecutive slices, sized by the number
# of topics found in each of the two slices.
ground_truth = []
for slice_id in slices[:-1]:
    n_topics_current = len(data[slice_id]["topics_dists"])
    n_topics_next = len(data[slice_id+1]["topics_dists"])
    ground_truth.append(
        get_sparse_matrix(labels[slice_id-1], (n_topics_current, n_topics_next))
    )

# Persist the ground truth next to the other graph results.
ground_truth_path = f'{os.getenv("RESULTS")}graph/ground_truth.pkl'
with open(ground_truth_path, "wb") as handle:
    pkl.dump(ground_truth, handle, pkl.HIGHEST_PROTOCOL)

In [None]:
# Pick the similarity measure; it is published through the SIMILARITY environment
# variable, which other cells read back with os.getenv.
os.environ["SIMILARITY"] = "cs"

# Embeddings are loaded only when the measure is "wmd"; otherwise they stay None.
embeddings = (
    load_facebook_vectors(os.getenv("EMBEDDINGS"))
    if os.getenv("SIMILARITY") == "wmd"
    else None
)

In [81]:
# Grid search: build the similarity graph once per q (the timed step), then prune it
# at every zeta and score each pruned graph against the ground truth.
grid = {"q": [0.2, 0.4, 0.6, 0.8, 0.9, 0.95], "zeta": np.arange(0.05, 1, 0.05).round(2)}
graphs = []
performance = []
for q in grid["q"]:
    print(f"q:{q}")
    ti = time()
    similarity_graph = build_similarity_graph(data, q)
    tf = time()
    delta_time = round(tf-ti)  # whole seconds spent building the graph for this q
    for zeta in grid["zeta"]:
        prediction = pruning_graph(similarity_graph, zeta)
        report = metrics(ground_truth, prediction)
        report["time[s]"] = delta_time
        report["q"] = q
        # Record the pruning level too: without it the rows of different zeta values
        # cannot be told apart, and performance_plot reads a "zeta" column.
        report["zeta"] = zeta
        performance.append(report)
    graphs.append({"q":q, "time[s]":delta_time ,"graph":similarity_graph})
# ignore_index gives the combined frame a fresh 0..n-1 index in one step
df_performance = pd.concat(performance, ignore_index=True)

q:0.2
q:0.4
q:0.6
q:0.8
q:0.9
q:0.95


In [82]:
# Persist the grid-search output: the raw similarity graphs as a pickle and the
# performance table as a "|"-separated CSV. File names carry the slice type and
# the similarity measure currently set in the environment.
graph_path = f'{os.getenv("RESULTS")}graph/graph_{os.getenv("SLICE_TYPE")}_{os.getenv("SIMILARITY")}.pkl'
df_path = f'{os.getenv("RESULTS")}graph/performance_{os.getenv("SLICE_TYPE")}_{os.getenv("SIMILARITY")}.csv'

with open(graph_path, "wb") as handle:
    pkl.dump(graphs, handle, protocol=pkl.HIGHEST_PROTOCOL)

df_performance.to_csv(df_path, sep="|", index=False)

In [50]:
# Reload the stored results for the slice type / similarity measure currently set
# in the environment; this replaces `graphs` and `df_performance`.
# NOTE: unpickling can execute arbitrary code -- only open files produced by this
# notebook or another trusted source.
graph_path = f'{os.getenv("RESULTS")}graph/graph_{os.getenv("SLICE_TYPE")}_{os.getenv("SIMILARITY")}.pkl'
df_path = f'{os.getenv("RESULTS")}graph/performance_{os.getenv("SLICE_TYPE")}_{os.getenv("SIMILARITY")}.csv'

with open(graph_path, "rb") as handle:
    graphs = pkl.load(handle)

df_performance = pd.read_csv(df_path, sep="|")

In [73]:
def performance_plot(df):
    """Plot f-score against zeta, one dotted line per q value present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``q``, ``zeta`` and ``f-score``.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with one trace per distinct ``q`` (ascending order), each labelled
        with its actual value.

    Notes
    -----
    Every q found in the frame is drawn and labelled from the data. Previously the
    q values were hard-coded, q=0.9 was never drawn and the q=0.95 line was
    labelled "q=1.0". Each trace uses the zeta values of its own rows, so x and y
    always stay aligned.
    """
    # First five colours match the original fixed traces; the palette is cycled
    # if the frame holds more q values than colours.
    palette = ['royalblue', 'green', 'yellow', 'orange', 'firebrick', 'purple']
    x = df["zeta"].unique()  # only used to size the x-axis ticks

    fig = go.Figure()
    fig.update_layout(template="plotly_white")

    for position, q in enumerate(sorted(df["q"].unique())):
        rows = df[df["q"]==q]
        fig.add_trace(go.Scatter(x=rows["zeta"], y=rows["f-score"], name=f'q={q}',
                                 marker=dict(size=4),
                                 line=dict(color=palette[position % len(palette)], width=2, dash='dot')))

    fig.update_xaxes(nticks=int(len(x)/2), title_text=f"zeta")

    fig.update_yaxes(nticks=10, range=[0,1], title_text=f"f-score")
    return fig

In [83]:
# wmd
# NOTE(review): this cell does not select a measure itself; it plots whatever
# df_performance currently holds. Presumably it was rendered while the "wmd"
# results were loaded -- confirm before relying on the label.
fig = performance_plot(df_performance)
fig.show()

In [74]:
# cs
# NOTE(review): this cell does not select a measure itself; it plots whatever
# df_performance currently holds. Presumably it was rendered while the "cs"
# results were loaded -- confirm before relying on the label.
fig = performance_plot(df_performance)
fig.show()

In [46]:
# js
# NOTE(review): this cell does not select a measure itself; it plots whatever
# df_performance currently holds. Presumably it was rendered while the "js"
# results were loaded -- confirm before relying on the label.
fig = performance_plot(df_performance)
fig.show()

In [693]:
# For every (q, alpha) group keep the row with the highest f-score, together with
# the zeta, recall and precision of that same row.
# NOTE(review): needs the columns q, alpha, f-score, zeta, recall and precision in
# df_performance; `alpha` is not written by the grid-search cell of this notebook,
# so the frame presumably comes from an earlier run -- confirm.
best_rows = df_performance.groupby(["q", "alpha"])["f-score"].idxmax()
# idxmax returns index *labels*, so the rows are selected with .loc; the positional
# .iloc only coincides with it while the frame keeps a default 0..n-1 index.
df_max = (
    df_performance
    .loc[best_rows, ["q", "alpha", "f-score", "zeta", "recall", "precision"]]
    .set_index(["q", "alpha"])
)
df_max

Unnamed: 0_level_0,Unnamed: 1_level_0,f-score,zeta,recall,precision
q,alpha,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.2,0.2,0.72,0.95,0.580645,0.947368
0.2,0.4,0.76,0.95,0.612903,1.0
0.2,0.6,0.76,0.95,0.612903,1.0
0.2,0.8,0.782609,0.9,0.870968,0.710526
0.2,1.0,0.782609,0.9,0.870968,0.710526
0.4,0.2,0.695652,0.9,0.774194,0.631579
0.4,0.4,0.72,0.95,0.580645,0.947368
0.4,0.6,0.76,0.95,0.612903,1.0
0.4,0.8,0.76,0.95,0.612903,1.0
0.4,1.0,0.76,0.95,0.612903,1.0


In [713]:
# Summary statistics of the graph-building time per q.
# The grid-search cell stores the timing under "time[s]"; fall back to "time" for
# result files that (apparently) used the older column name.
time_col = "time[s]" if "time[s]" in df_performance.columns else "time"
df_performance.groupby("q")[time_col].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
q,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.2,95.0,46.6,0.492497,46.0,46.0,47.0,47.0,47.0
0.4,95.0,60.0,1.101257,58.0,60.0,60.0,61.0,61.0
0.6,95.0,75.2,0.984994,74.0,75.0,75.0,75.0,77.0
0.8,95.0,464.6,99.902442,335.0,368.0,476.0,567.0,577.0
0.9,95.0,2983.4,235.63253,2619.0,2827.0,3027.0,3193.0,3251.0
0.95,95.0,8565.8,971.893968,7172.0,7671.0,8983.0,9453.0,9550.0


In [723]:
# Mean graph-building time per q and the speed-up relative to the slowest q.
# The grid-search cell stores the timing under "time[s]"; fall back to "time" for
# result files that (apparently) used the older column name. The result column is
# always called "time".
time_col = "time[s]" if "time[s]" in df_performance.columns else "time"
df_speedup = (
    df_performance
    .groupby("q")
    .agg({time_col: "mean"})
    .rename(columns={time_col: "time"})
)
df_speedup["speedup"] = df_speedup["time"].max()/df_speedup["time"]
df_speedup

Unnamed: 0_level_0,time,speedup
q,Unnamed: 1_level_1,Unnamed: 2_level_1
0.2,46.6,183.815451
0.4,60.0,142.763333
0.6,75.2,113.906915
0.8,464.6,18.436935
0.9,2983.4,2.871154
0.95,8565.8,1.0


In [715]:
# Speed-up (relative to the slowest q) as a function of q.
fig = (
    go.Figure()
    .update_layout(template="plotly_white")
    .add_trace(go.Scatter(x=df_speedup.index.values, y=df_speedup.speedup.values))
    .update_xaxes(title_text="q")
    .update_yaxes(title_text="speedup")
)
# A logarithmic y-axis is available via fig.update_layout(yaxis_type="log").
fig.show()