In [1]:
import numpy as np
import umap.umap_ as umap
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.cluster import DBSCAN

import sys
sys.path.append("/nobackup/gogandhi/alt_means_sans_k/")
from scripts.nets_and_embeddings import *

params = {
    "N": 10000,
    "k": 5,
    "maxk": 1000,
    "minc": 20,
    "maxc": 1000,
    "tau": 3.0,
    "tau2": 1.0,
    "mu": 0.2,
}
emb_params = {
    "method": "node2vec",
    "window_length": 10,
    "walk_length": 80,
    "num_walks": 10,
    "dim": 64,
}

#path_name = "/nobackup/gogandhi/alt_means_sans_k/data/experiment_changing_negative_samples"
net, community_table, emb = create_and_save_network_and_embedding(params, emb_params, path_name=None, save_file=False)
print(len(set(community_table['community_id'])))

34


In [2]:
# We are visualising how different the communities produced by
# Kmeans and the algorithm are compared to the ground truth

# First get clusters for both methods,

from scripts.similarity_scores import *
from sklearn.cluster import KMeans 


X = np.einsum("ij,i->ij", emb, 1 / np.maximum(np.linalg.norm(emb, axis=1), 1e-24))
X = emb.copy()
kmeans = KMeans(n_clusters= len(set(community_table["community_id"])), random_state=0).fit(X)
kmeans_labels = kmeans.labels_
esim_k = calc_esim(community_table["community_id"], kmeans_labels)
print(esim_k)

  super()._check_params_vs_input(X, default_n_init=10)


0.8977837361873756


In [15]:
# Untouched method to get clusters
# Increase num_neighbors to reduce communities with 1/2 members

def proposed_method_labels(emb,device_name):
        
        rpos, cpos, vpos = find_knn_edges(emb, num_neighbors=350, device = device_name) # this might involve distance metrics
        cneg = np.random.choice(emb.shape[0], len(cpos))
        vneg = np.array(np.sum(emb[rpos, :] * emb[cneg, :], axis=1)).reshape(-1)

        model = LogisticRegression()
        model.fit(
            np.concatenate([vpos, vneg]).reshape((-1, 1)),
            np.concatenate([np.ones_like(vpos), np.zeros_like(vneg)]),
                )
        w1, b0 = model.coef_[0, 0], -model.intercept_[0] 
        return louvain(emb, w1, b0, device = device_name) # this might involve distance metrics

proposed_labels = proposed_method_labels(emb,"cuda:3")
esim_p = calc_esim(community_table["community_id"], proposed_labels)
print(esim_p)

(0.9865913017586591, array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]))


In [6]:
import numpy as np
from scipy import sparse
import copy

def calc_esim(y, ypred):
    ylab, y = np.unique(y, return_inverse=True)
    ypredlab, ypred = np.unique(ypred, return_inverse=True)
    
    Ka, Kb = len(ylab), len(ypredlab)
    K = np.maximum(Ka, Kb)
    N = len(y)
    
    UA = sparse.csr_matrix((np.ones_like(y), (np.arange(y.size), y)), shape=(N,K))
    UB = sparse.csr_matrix((np.ones_like(ypred), (np.arange(ypred.size), ypred)), shape=(N, K))    
    
    nA = np.array(UA.sum(axis=0)).reshape(-1)
    nB = np.array(UB.sum(axis=0)).reshape(-1)
    nAB = (UA.T @ UB).toarray()
    nAB_rand = np.outer(nA, nB) / N
    
    Q = np.maximum(nA[:, None] @ np.ones((1, K)), np.ones((K, 1)) @ nB[None, :]) 
    Q = 1 / np.maximum(Q, 1)
    S = np.sum(np.multiply(Q, (nAB**2))) / N
   
    #Q = np.maximum(nA[:, None] @ np.ones((1, K)), np.ones((K, 1)) @ nB[None, :]) 
    #Q = 1 / np.maximum(Q, 1)
    Srand = np.sum(np.multiply(Q, (nAB_rand**2))) / N
    Scorrected = (S - Srand) / (1 - Srand)
    return Scorrected, nAB

# Example usage:
#y_true = np.array([59, 59, 101, 10, 101, 101])
#y_pred = np.array([1, 1, 2, 3, 2, 3])

y_true= copy.deepcopy(community_table['community_id'])
y_pred = copy.deepcopy(kmeans_labels)
y_prop = copy.deepcopy(proposed_labels)

esim_score, nAB = calc_esim(y_true, y_pred)
esim_p = calc_esim(community_table["community_id"], proposed_labels)

print("Element-centric similarity:", esim_score)

Element-centric similarity: 0.8977837361873756


In [43]:
# y_true= copy.deepcopy(community_table['community_id'])
# y_pred = copy.deepcopy(kmeans_labels)
# y_prop = copy.deepcopy(proposed_labels)

from scipy.spatial.distance import cosine

def metric(arrays ,metric):
    if len(arrays) ==2:
        vector1 = arrays[0]
        vector2 = arrays[1]

    if metric=='centroid':
        stacked_arrays = np.vstack(arrays)
        centroid = np.mean(stacked_arrays, axis=0)
        return centroid
    
    if metric=='euclidean':
        euclidean_distance = np.linalg.norm(vector1 - vector2)
        return euclidean_distance
    
    if metric=='cosine':
        return  1 - cosine(vector1, vector2)
    
    if metric == 'dot':
        return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    
    

def misclassification(net, community_table, emb,y_true, y_pred, algorithm_labels, nAB):

    yclab, yc = np.unique(y_true, return_inverse=True)
    ypclab, ypc = np.unique(y_pred, return_inverse=True)
    print(len(yclab),len(ypclab))
    misclass_dict = {} # Key is ground truth label, values are predicted labels which were classified as given ground truth
    for i in range(len(nAB)):
        x = np.where(nAB[i]!=0)[0]
        if len(x)!=1:
            misclass_dict[yclab[i]] = ypclab[x]
            print(yclab[i],ypclab[x])
    #         set_combined = ()
    #         for j in ypclab[x]:
    #             set_combined = set(set_combined).union(set(list(kmeans_cluster_nodes[j][0])))
    #         print(yclab[i],ypclab[x], set(list(ground_truth_cluster_nodes[yclab[i]]))==set_combined )
    
    # For each key in dict, we take the centroid of that vectors in that cluster.
    # we do the same for each of the values. The difference comes from the nodes belonging to that cluster.
    # Once done, we can calculate euclidean distances, cosine, degree etc,
    ground_truth_cluster_nodes={}
    for key in misclass_dict.keys():
        ground_truth_cluster_nodes[key] = list(community_table[community_table['community_id']==key]['node_id'])

    algorithm_cluster_nodes={}
    for values in misclass_dict.values():
        for value in values:
            algorithm_cluster_nodes[value] = (np.where(algorithm_labels==value))
            

    ground_truth_centroids = {}
    for key in ground_truth_cluster_nodes.keys():
        vectors_of_nodes = [emb[i] for i in ground_truth_cluster_nodes[key]]
        ground_truth_centroids[key] = metric(vectors_of_nodes, 'centroid')


    algorithm_centroids={}

    for key in algorithm_cluster_nodes.keys():
        vectors_of_nodes = [emb[i] for i in algorithm_cluster_nodes[key]]
        algorithm_centroids[key] = metric(vectors_of_nodes, 'centroid')



    # Distance between the misclassified kmeans and ground truth:
    distances_mis={'euclidean':[], 'cosine':[], 'dot':[]}
    for key, values in misclass_dict.items():
        for metrics in ['euclidean','cosine','dot']:
            dists_temp = []
            for value in values:
                dists_temp.append(round(metric([ground_truth_centroids[key],algorithm_centroids[value]],metrics),4))
            distances_mis[metrics].append(dists_temp)
    for key,values in distances_mis.items():
        print(key)
        for value in values:
            print(value)
        
    for key, values in misclass_dict.items():
        temp_gt=[]
        temp_algo = []
        avg_deg_gt = round(np.mean([np.sum(net[node]) - 1 for node in ground_truth_cluster_nodes[key]]),2)
        nodes = [algorithm_cluster_nodes[value][0] for value in values]
        avg_deg_km = []
        for nodeclus in nodes:
            arr=[]
            for node in nodeclus:
                arr.append(np.sum(net[node])-1)
            avg_deg_km.append(round(np.mean(arr),2))
        #avg_deg_km = 
        print("gt: ", avg_deg_gt , "algo: ", avg_deg_km)
        temp_gt = temp_gt.append(avg_deg_gt)
        temp_algo = temp_algo.append(np.mean(avg_deg_km))

    for key, values in misclass_dict.items():
        print("gt: ", len(ground_truth_cluster_nodes[key]), "algo: ", [len(algorithm_cluster_nodes[elm][0]) for elm in values])        
    
    return distances_mis

In [44]:
y_true= copy.deepcopy(community_table['community_id'])
y_pred = copy.deepcopy(proposed_labels)
esim_score, nAB = calc_esim(y_true, y_pred)
distances = misclassification(net, community_table, emb, y_true, y_pred, proposed_labels, nAB)

319 305
4 [  9. 176. 275.]
20 [ 37. 225. 274.]
30 [193. 280.]
31 [105. 190.]
39 [ 61.  88. 135. 181. 210. 214.]
43 [122. 279.]
45 [164. 183.]
59 [103. 151. 175. 272.]
66 [ 64. 108. 114. 224.]
73 [ 20. 173. 187. 210.]
77 [17. 47.]
95 [138. 268.]
102 [ 92. 249.]
110 [ 12.  67.  70. 158. 226.]
148 [268. 298.]
161 [ 15. 129. 172.]
178 [ 35.  89. 184.]
199 [14. 65.]
203 [ 25.  28.  29.  86.  91. 101.]
230 [111. 206.]
235 [ 31. 145. 183.]
257 [ 17. 111.]
285 [ 92. 118. 166.]
297 [ 73. 157.]
315 [ 24. 268.]
euclidean
[3.4983, 3.4598, 2.4221]
[3.3985, 3.4014, 0.0525]
[3.2501, 2.2824]
[3.4161, 1.9708]
[0.0697, 3.295, 3.589, 3.6719, 3.4156, 3.5417]
[3.9307, 2.3645]
[3.5704, 1.7067]
[3.2943, 3.3765, 0.0436, 3.3212]
[3.6493, 3.3018, 0.0489, 3.2763]
[3.3951, 3.639, 0.0461, 3.4258]
[1.5174, 3.3579]
[2.1469, 3.6119]
[3.3997, 0.026]
[3.4244, 3.4985, 0.0736, 3.2313, 3.399]
[3.8174, 0.0287]
[3.6207, 3.6532, 0.0336]
[3.4961, 0.0411, 3.4533]
[3.5272, 1.9439]
[3.3078, 3.3498, 3.713, 3.3916, 3.3898, 0.0841]

In [45]:
y_true= copy.deepcopy(community_table['community_id'])
y_pred = copy.deepcopy(kmeans_labels)
esim_score, nAB = calc_esim(y_true, y_pred)
distances_km= misclassification(net, community_table, emb, y_true, y_pred, kmeans_labels, nAB)

319 319
34 [ 65 258]
54 [128 219]
56 [ 3 18]
66 [ 22  68 128 218 265 275]
119 [ 77 267]
139 [ 87 265]
153 [152 232]
159 [ 46 273]
165 [ 39 233]
171 [166 256]
177 [223 276]
193 [ 38 213]
194 [ 10 197]
196 [  6 263]
197 [ 25 297]
200 [ 97 311]
203 [32 41]
211 [239 298]
220 [ 50 303]
225 [ 58 189]
246 [115 307]
247 [111 314]
264 [154 310]
266 [ 47 295]
284 [ 22 248]
291 [ 86 318]
295 [107 242]
306 [ 68 127]
309 [ 35 130]
310 [ 26 140]
euclidean
[0.2152, 0.222]
[0.3595, 0.1742]
[0.2705, 0.1746]
[3.2251, 2.8655, 3.591, 3.3347, 3.2528, 3.7557]
[0.4094, 0.1304]
[0.1858, 0.2599]
[0.5339, 0.1016]
[0.2447, 0.2094]
[0.2179, 0.2164]
[0.2731, 0.2511]
[0.2646, 0.1923]
[0.264, 0.1886]
[0.2307, 0.2054]
[0.2711, 0.1664]
[0.2353, 0.2071]
[0.2359, 0.2088]
[2.6197, 3.3858]
[0.2534, 0.184]
[0.2205, 0.2218]
[0.2367, 0.2062]
[0.346, 0.1621]
[0.2698, 0.2086]
[0.3168, 0.1993]
[0.3115, 0.166]
[0.2725, 0.1926]
[0.2363, 0.2075]
[0.2274, 0.2595]
[2.698, 3.2035]
[0.2867, 0.1523]
[0.2375, 0.1955]
cosine
[0.9974, 0.9

In [48]:
for key in ['euclidean','cosine']:
    temp1=[]
    temp2=[]
    for value1 in (distances[key]):
        temp1.append(np.mean(value1))
    for value2 in distances_km[key]:
        temp2.append(np.mean(value2))
    print(f"Average {key} distance from ground truth for misclassified belonging to \n alt_means = {round(np.mean(temp1),3)} and kmeans = {round(np.mean(temp2),3)}")

Average euclidean distance from ground truth for misclassified belonging to 
 alt_means = 2.521 and kmeans = 0.521
Average cosine distance from ground truth for misclassified belonging to 
 alt_means = 0.639 and kmeans = 0.952


In [32]:
for value1 in distances['dot']:
    print(value1)

[0.9974, 0.9973]
[0.9939, 0.9986]
[0.996, 0.9983]
[0.5343, 0.5968, 0.4447, 0.4937, 0.5232, 0.3591]
[0.9918, 0.9992]
[0.9982, 0.9964]
[0.9862, 0.9995]
[0.9968, 0.9977]
[0.9974, 0.9974]
[0.9964, 0.9969]
[0.9962, 0.998]
[0.9961, 0.998]
[0.9971, 0.9977]
[0.9961, 0.9985]
[0.997, 0.9977]
[0.997, 0.9976]
[0.6831, 0.4525]
[0.9964, 0.9981]
[0.9974, 0.9973]
[0.997, 0.9977]
[0.9939, 0.9987]
[0.9963, 0.9978]
[0.9953, 0.9981]
[0.9949, 0.9985]
[0.9961, 0.9981]
[0.9971, 0.9978]
[0.9974, 0.9966]
[0.6403, 0.5223]
[0.9954, 0.9987]
[0.9969, 0.9979]


TypeError: 'list' object cannot be interpreted as an integer

In [52]:
# Why does Kmeans not work? Let's look at GT:  34 Kmeans: [ 65 258]
# let's visualize the vectors belonging to gt cluster 34
ground_truth_cluster_nodes[34]

NameError: name 'ground_truth_cluster_nodes' is not defined

In [51]:
print("wdwd")

wdwd
