In [1]:
# How much do embedding algorithms agree with each other?
# Compared through the distribution of pairwise distances between embedded nodes.

In [1]:
import pickle
import warnings

import diptest
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import to_rgb, to_rgba
from numpy.random import default_rng
from scipy.stats import wasserstein_distance as EMD
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import utils
#%matplotlib inline

In [2]:
#graphNames = ["AS", "BTER-arbitrary", "BTER-from-BA", "enron", "facebook", "lastfm", "physics", "ppi", "wiki"]
#graphNames = ["wiki", "facebook", "ppi", "lastfm", "physics", "AS", "BTER-from-BA", "BTER-arbitrary"]

# Display title for each graph name, looked up when building plot titles.
graph_titles = dict([
    ("facebook", "Facebook"),
    ("AS", "AS"),
    ("BTER-arbitrary", "BTER (Arb)"),
    ("BTER-from-BA", "BTER (BA)"),
    ("enron", "Enron"),
    ("lastfm", "LastFM"),
    ("physics", "ca-HepTh"),
    ("ppi", "PPI"),
    ("wiki", "Wikipedia"),
    # Both ER variants and both BA variants share one display title.
    ("ER-2", "ER"),
    ("ER-4", "ER"),
    ("BA-5", "BA"),
    ("BA-10", "BA"),
])

In [3]:
# Global matplotlib font sizes, applied in one update instead of one rc() call per group.
plt.rcParams.update({
    "font.size": 24,         # default text size
    "axes.titlesize": 32,    # axes title
    #"axes.titleweight": "bold",  # axes title weight (disabled)
    "axes.labelsize": 28,    # x and y axis labels
    "xtick.labelsize": 24,   # x tick labels
    "ytick.labelsize": 24,   # y tick labels
})

In [4]:
# Column-oriented accumulator: every key is a column, every list grows by one entry per row.
emd_deltas_dict = {
    column: []
    for column in (
        "Algorithm Name",
        "Dimension",
        "Graph Name",
        "K",
        "EMD Delta Normalized",
    )
}

In [9]:
# One single-letter matplotlib colour code per plotted core: r, g, b, c, m, y.
colors = list("rgbcmy")

def get_distances(embeddings_list, max_samples=300, rng=None):
    '''
        Compute the Euclidean distance between every unordered pair of embeddings.

        Input:
            embeddings_list: sequence or array holding one embedding per node
            max_samples: largest number of nodes to use; bigger inputs are randomly
                subsampled without replacement so the number of pairs stays bounded
            rng: optional seed or numpy Generator driving the subsampling; pass one to
                make the result reproducible (the default draws fresh entropy each call)
        Output: a list with one distance per pair (i, j), i < j, ordered by i and then j;
                empty when fewer than two embeddings are available
    '''
    points = np.asarray(embeddings_list)
    if len(points) > max_samples:
        # default_rng accepts None, a seed, or an existing Generator
        generator = default_rng(rng)
        keep = generator.choice(len(points), size=max_samples, replace=False)
        points = points[keep]
    if len(points) < 2:
        return []

    # Flatten each embedding to a vector so the row-wise norm below is always well defined
    points = points.reshape(len(points), -1)

    distances = []
    for i in range(len(points) - 1):
        # Distances from node i to every later node, computed in one vectorised call
        distances.extend(np.linalg.norm(points[i + 1:] - points[i], axis=1))
    return distances

def get_degenerate_subset(core, degenerateCore):
    '''
        Build a membership mask of core against degenerateCore.

        Input: core and degenerateCore are two nx.Graph objects
        Output: a list of booleans, one per node of core in iteration order,
                True where that node is also contained in degenerateCore
    '''
    mask = []
    for node in core:
        mask.append(node in degenerateCore)
    return mask

In [5]:
def degenerate_core_pairwise(embeddings, embeddings_sdne, cores, graphName):
    '''
        For every embedding dimension and algorithm, plot the pairwise-distance distribution
        of the degenerate-core nodes as embedded inside a subset of the k-cores, plus the
        Earth Mover Distance (EMD) between those distributions.

        Input:
            embeddings: mapping from a dimension string ("2", "10", "20", optionally "128") to
                a sequence whose element [1] maps algorithm name -> list of
                (k, embedding array) entries indexed by k-1
                (structure inferred from how it is indexed below -- TODO confirm)
            embeddings_sdne: not used here; kept so existing callers keep working
            cores: sequence of cores; cores[0] is treated as the first core and cores[-1]
                as the degenerate core
            graphName: key into graph_titles, also used in the output file names
        Output: the module-level emd_deltas_dict, extended in place with one row per
                (algorithm, dimension, k) holding the normalized EMD to the previous core
        Side effects: saves figs/pairwise/<graphName>-pairwise-wide.png and
                figs/pairwise/<graphName>-emd-wide.png
    '''
    degenerateCore = cores[-1]

    algs = ["pca", "lap", "HOPE", "n2v", "hgcn", "sdne"]
    alg_titles = ["PCA", "Laplacian Eigenmap", "HOPE", "Node2Vec", "HGCN", "SDNE"]

    hasSDNE = "128" in embeddings  # a "128" entry is only expected for SDNE embeddings

    ncols = len(algs)
    # One row per d = 2, 10, 20. A fourth row is still allocated when d = 128 exists, but it
    # stays empty: the code that filled it was permanently disabled (`and False`) and has
    # been removed as unreachable.
    nrows = 4 if hasSDNE else 3
    fig_dist, axs_dist = plt.subplots(nrows=nrows, ncols=ncols, figsize=(17*ncols, 10*nrows))
    fig_emd, axs_emd = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15*ncols, 10*nrows))

    # The (k, core) pairs to plot are the same for every algorithm and dimension. This used
    # to rely on a `cores_subset` name that was never assigned on the reachable code path.
    cores_subset = utils.get_cores_subset(cores)

    for r, d_str in enumerate(["2", "10", "20"]):  # r is the subplot row
        if d_str not in embeddings:
            continue
        embeddings_by_alg = embeddings[d_str][1]

        for c, (alg, alg_title) in enumerate(zip(algs, alg_titles)):  # c is the subplot column
            if alg not in embeddings_by_alg:
                continue

            ax_dist = axs_dist[r][c]
            ax_emd = axs_emd[r][c]
            ax_emd_prev = ax_emd.twinx()

            first_core_embeddings = embeddings_by_alg[alg][0][1]
            first_core_distances = get_distances(
                first_core_embeddings[get_degenerate_subset(cores[0], degenerateCore)])
            prev_distances = first_core_distances
            k_s, emd_by_core, emd_prev = [], [], []
            for color_idx, (k, core) in enumerate(cores_subset):
                core_idx = k - 1
                k, core_embeddings = embeddings_by_alg[alg][core_idx]
                distances = get_distances(core_embeddings[get_degenerate_subset(core, degenerateCore)])
                # Cycle through the palette so extra cores are still drawn
                color = colors[color_idx % len(colors)]
                try:
                    sns.kdeplot(distances, ax=ax_dist, color=color, label="k="+str(k), linewidth=4)
                    emd_from_first = EMD(first_core_distances, distances)
                    emd_from_prev = EMD(prev_distances, distances)
                except Exception as err:
                    # Best effort: one core that cannot be plotted or compared must not abort
                    # the whole figure, but the skip is reported instead of hidden.
                    warnings.warn("Skipping k={} for {} (d = {}): {!r}".format(k, alg, d_str, err))
                    continue

                # Append only after everything succeeded so the three lists stay aligned
                k_s.append(k)
                emd_by_core.append(emd_from_first)
                emd_prev.append(emd_from_prev)
                prev_distances = distances

            ax_dist.set_title("Graph: {} Alg: {} d = {}".format(
                graph_titles[graphName], alg_title, d_str))
            ax_dist.set_xlabel("Pairwise Distance")
            ax_dist.legend()
            if r == 0:
                # The top row only shows the algorithm name, in a large font
                ax_dist.set_title(alg_title, fontsize=72)

            # First Core Distribution
            emd_handle, = ax_emd.plot(k_s, emd_by_core, "b-", label='First Core')
            ax_emd.set_title("Earth Mover Distance Among Degenerate-Core Pairwise Distributions \n Algorithm: {} d = {}".format(alg, d_str))
            ax_emd.set_xlabel("k")
            ax_emd.set_ylabel("EMD from First-Core Distribution")

            # Deltas only exist when at least two cores were plotted
            has_deltas = len(k_s) >= 2

            if has_deltas:
                # Dashed line at the k just before the largest increase. The position is read
                # from k_s because the plotted k values need not be 1, 2, 3, ...
                increase_by_k = np.diff(emd_by_core)
                threshold = k_s[int(np.argmax(increase_by_k))]
                ax_emd.vlines(x=threshold, ymin=0, ymax=max(emd_by_core), color="red", linestyles="dashed")
                ax_emd.text(threshold, 0, str(threshold))

            # Prev Core Distribution
            prev_handle, = ax_emd_prev.plot(k_s, emd_prev, "m-", label='Previous Core')
            ax_emd_prev.set_ylabel("EMD for Previous Core Distribution")

            ax_emd.legend(handles=[emd_handle, prev_handle])

            if has_deltas:
                # Save values into dict (the first core has no predecessor, so it is dropped)
                increase_by_k_normalized = utils.normalize(emd_prev)[1:]
                n_rows = len(increase_by_k_normalized)
                emd_deltas_dict["EMD Delta Normalized"].extend(increase_by_k_normalized)
                emd_deltas_dict["K"].extend(k_s[1:])
                emd_deltas_dict["Graph Name"].extend([graphName] * n_rows)
                emd_deltas_dict["Dimension"].extend([d_str] * n_rows)
                emd_deltas_dict["Algorithm Name"].extend([alg] * n_rows)

    fig_dist.suptitle(graphName)
    fig_emd.suptitle(graphName)

    fig_dist.savefig("figs/pairwise/{}-pairwise-wide.png".format(graphName))
    fig_emd.savefig("figs/pairwise/{}-emd-wide.png".format(graphName))

    return emd_deltas_dict

In [6]:
def pattern1(embeddings, embeddings_sdne, cores, graphName):
    '''
        For each embedding dimension, draw two stacked bar charts over k with one bar segment
        per algorithm: the EMD between the degenerate-core pairwise distributions of
        consecutive cores (scaled by each algorithm's maximum), and the dip statistic of
        that distribution.

        Input:
            embeddings: mapping from a dimension string ("2", "10", "20", optionally "128") to
                a sequence whose element [1] maps algorithm name -> list of
                (k, embedding array) entries, where entry i must have k == i + 1
            embeddings_sdne: not used here; kept so existing callers keep working
            cores: sequence of cores aligned with the embedding entries; cores[-1] is the
                degenerate core
            graphName: key into graph_titles
        Output: None. The figures are created but not written to disk (saving is disabled).
    '''
    degenerateCore = cores[-1]

    algs = ["pca", "lap", "HOPE", "n2v", "hgcn", "sdne"]
    alg_titles = ["PCA", "Laplacian Eigenmap", "HOPE", "Node2Vec", "HGCN", "SDNE"]

    hasSDNE = "128" in embeddings  # a "128" entry is only expected for SDNE embeddings

    d_strs = ["2", "10", "20"]
    if hasSDNE:
        d_strs.append("128")
    ncols = len(d_strs)  # one subplot column per dimension
    fig_emd, axs_emd = plt.subplots(ncols=ncols, figsize=(15*ncols, 10))
    fig_dip, axs_dip = plt.subplots(ncols=ncols, figsize=(15*ncols, 10))

    # Bars sit at positions 0..len(cores)-1 and stand for k = 1..len(cores). Roughly 15 ticks
    # are kept, and their labels are set explicitly so they do not depend on whichever
    # formatter pandas installed while drawing the bars.
    tick_gap = int(len(cores) / 15) + 1
    tick_positions = np.arange(0, len(cores), tick_gap)
    tick_labels = tick_positions + 1

    for c, d_str in enumerate(d_strs):  # c is the subplot column
        if d_str not in embeddings:
            continue
        embeddings_by_alg = embeddings[d_str][1]

        ax_emd = axs_emd[c]
        ax_emd.set_title("Impact of removing k-shells on the \nDegenerate-Core Pairwise Distribution (DCPD) \n{} Graph d = {}".format(
                            graph_titles[graphName],
                            d_str))
        ax_emd.set_xlabel("k")
        ax_emd.set_ylabel("EMD from k-1 DCPD")

        ax_dip = axs_dip[c]
        ax_dip.set_title("Modality of the Degenerate-Core Pairwise Distribution\n{} Graph d = {}".format(
                            graph_titles[graphName],
                            d_str))
        ax_dip.set_xlabel("k")
        ax_dip.set_ylabel("Dip Statistic")

        results_emd = {"k": np.arange(1, len(cores) + 1)}
        results_dip = {"k": np.arange(1, len(cores) + 1)}
        for alg, alg_title in zip(algs, alg_titles):
            if alg not in embeddings_by_alg:
                continue

            first_core_embeddings = embeddings_by_alg[alg][0][1]
            prev_distances = get_distances(
                first_core_embeddings[get_degenerate_subset(cores[0], degenerateCore)])

            emd_steps, dip_stats = [], []
            for core_idx, core in enumerate(cores):
                k, core_embeddings = embeddings_by_alg[alg][core_idx]
                assert k == core_idx + 1

                distances = get_distances(core_embeddings[get_degenerate_subset(core, degenerateCore)])
                emd_steps.append(EMD(distances, prev_distances))
                prev_distances = distances

                dip_stats.append(diptest.dipstat(distances))

            # Scale by the largest step; leave the values alone when every step is zero
            # rather than dividing zero by zero.
            max_emd = max(emd_steps)
            if max_emd > 0:
                emd_steps = [emd / max_emd for emd in emd_steps]

            results_emd[alg_title] = emd_steps
            results_dip[alg_title] = dip_stats

        if len(results_emd) == 1:
            # Only the "k" column: no algorithm has embeddings for this dimension
            continue

        results_emd_pd = pd.DataFrame(results_emd).set_index("k")
        results_emd_pd.plot(kind="bar", stacked=True, ax=ax_emd)
        ax_emd.set_xticks(tick_positions)
        ax_emd.set_xticklabels(tick_labels)

        results_dip_pd = pd.DataFrame(results_dip).set_index("k")
        results_dip_pd.plot(kind="bar", stacked=True, ax=ax_dip)
        ax_dip.set_xticks(tick_positions)
        ax_dip.set_xticklabels(tick_labels)

        ax_emd.legend()
        ax_dip.legend()

    # Saving is currently disabled; re-enable to write the figures to disk.
#     file_title = "figs/pairwise/{}-emd-pattern1.png".format(graphName)
#     fig_emd.savefig(file_title)

#     file_title = "figs/pairwise/{}-dip-pattern3.png".format(graphName)
#     fig_dip.savefig(file_title)

    return

## Analyze Graphs Independently

In [7]:
def analyze_embedded_cores(graphName):
    '''
        Load the pickled embeddings and cores of one graph and run pattern1 on them.

        Input: graphName is the file-name prefix shared by the three pickle files
        Output: None
    '''
    def _unpickle(path):
        # NOTE: pickle.load can execute arbitrary code; only use it on trusted files
        with open(path, "rb") as pickleFile:
            return pickle.load(pickleFile)

    embeddings = _unpickle("embeddings_all_cores/{}_embeddings.pickle".format(graphName))
    embeddings_sdne = _unpickle("embeddings/{}_embeddings.pickle".format(graphName))
    cores = _unpickle("cores/{}_cores.pickle".format(graphName))

    #degenerate_core_pairwise(embeddings, embeddings_sdne, cores, graphName)
    pattern1(embeddings, embeddings_sdne, cores, graphName)

In [None]:
#graphNames = ["karate"]
def sdne_lambda(core, d):
    '''Placeholder with no behaviour yet; ignores its arguments and returns None.'''
    return None

def hope_lambda(core, d):
    '''Placeholder with no behaviour yet; ignores its arguments and returns None.'''
    return None

def lap_lambda(core, d):
    '''Placeholder with no behaviour yet; ignores its arguments and returns None.'''
    return None

def n2v_lambda(core, d):
    '''Placeholder with no behaviour yet; ignores its arguments and returns None.'''
    return None

def pca_lambda(core, d):
    '''Placeholder with no behaviour yet; ignores its arguments and returns None.'''
    return None

# Alternative selections kept for quick experiments:
#graphNames = ["BA-5"]
#graphNames = ["ER-2", "ER-4", "BA-5", "BA-10"]
#graphNames = ["AS", "BTER-arbitrary", "BTER-from-BA", "ER-2", "ER-4", "BA-5", "BA-10"]
#graphNames = ["wiki"]
#graphNames = ["facebook"]

# Full run over every graph, analysed one at a time in this order.
graphNames = [
    "wiki", "facebook", "ppi", "physics", "lastfm", "AS",
    "ER-2", "ER-4", "BA-5", "BA-10",
    "BTER-from-BA", "BTER-arbitrary",
]
for graphName in graphNames:
    analyze_embedded_cores(graphName)

In [20]:
# emd_deltas_pd = pd.DataFrame(emd_deltas_dict)
# with open("pickles/emd_deltas_pd.pickle", "wb") as pickleFile:
#     description = "EMD between DCPD before and after removal\
#     EMD Delta_k = EMD(k,k-1)"
#     pickle.dump((description, emd_deltas_pd), pickleFile)

In [3]:
# graphName='facebook'
# with open("embeddings_all_cores/{}_embeddings.pickle".format(graphName), "rb") as pickleFile:
#         embeddings = pickle.load(pickleFile)

AttributeError: 'numpy.ndarray' object has no attribute 'append'