In [1]:
import networkx as nx

# LOADS GRAPH FOR THE 1ST QUESTION

# Load the weighted edge list for question 1; nodetype=int converts node labels to integers.
G = nx.read_weighted_edgelist('clustering1.txt', nodetype=int)

# Report the size of the graph that was just loaded.
edge_total = len(G.edges(data=True))
node_total = len(G.nodes(data=True))
print('Edges: ' + str(edge_total))
print('Nodes: ' + str(node_total))

print("----------------------------------------")

# LOADS NODE LIST FOR THE 2ND QUESTION

# Only the first MAX_NODES data rows of the file are kept.
MAX_NODES = 1000

# The context manager guarantees the file is closed even if reading raises.
with open("clustering_big.txt", "r") as a_file:
#with open("input_random_72_16384_24.txt", "r") as a_file:
    raw_rows = [line.split() for line in a_file]

# The first line is dropped (presumably a header; confirm against the data file).
# Slicing instead of pop(0) also tolerates an empty file, and truncating before
# the int conversion avoids parsing rows that are thrown away anyway.
bit_rows = [list(map(int, row)) for row in raw_rows[1:MAX_NODES + 1]]

# Key every row by its 0-based position so the position serves as the node id.
NodeList = dict(enumerate(bit_rows))
print("Nodes: "+str(len(NodeList.keys())))

Edges: 124750
Nodes: 500
----------------------------------------
Nodes: 1000


In [2]:
# USEFUL FUNCTIONS FOR THE 1ST QUESTION 

def UnionFind_Initialize1(graph):
    # Builds the starting Union-Find of a graph, in which every node is its own cluster.
    # INPUT: A graph object exposing nodes()
    # OUTPUT: A dictionary with one key per node; each key is a "leader" whose value is a set holding only that node.
    return {node: {node} for node in graph.nodes()}

def UnionFind_Fuse1(dic,leader1,leader2):
    # Merges the clusters of two leaders of the Union-Find. The smaller cluster is absorbed
    # into the larger one; on a tie leader1 survives.
    # INPUT: a dictionary describing the Union-Find (leader -> set of members) and the two leaders to be merged.
    # OUTPUT: the same dictionary object, updated in place, with the absorbed leader removed.
    # RAISES: KeyError if either leader is not a key of the dictionary.
    if leader1==leader2:
        # Merging a cluster with itself must be a no-op. Without this guard the
        # cluster would be updated with itself and then popped, losing all its nodes.
        dic[leader1]  # still raise KeyError for an unknown leader
        return dic
    if len(dic[leader2])>len(dic[leader1]):
        survivor,absorbed=leader2,leader1
    else:
        survivor,absorbed=leader1,leader2
    dic[survivor].update(dic.pop(absorbed))
    return dic

def Closest_Pair_Separated_Points1(dic,graph):
    # Returns the lightest edge whose two endpoints belong to different clusters of the Union-Find.
    # INPUT: A dictionary describing the current state of the Union-Find (leader -> set of members)
    #        and a graph whose edges(nbunch, data=True) yields (u, v, {'weight': w}) tuples.
    # RETURNS: The lightest crossing edge as (u, v, data). On equal weights the first edge met wins,
    #          scanning clusters in dictionary order and edges in the order the graph yields them.
    # RAISES: ValueError if no edge joins two different clusters.
    best=None
    for members in dic.values():
        for edge in graph.edges(members,data=True):
            # An edge crosses the cluster boundary only when exactly one endpoint is a member.
            if (edge[0] in members)==(edge[1] in members):
                continue
            if best is None or edge[2]['weight']<best[2]['weight']:
                best=edge
    # A cluster without any crossing edge (isolated node, disconnected component) is simply
    # skipped above instead of making min() fail on an empty sequence.
    if best is None:
        raise ValueError("no edge connects two different clusters")
    return best

def Find_Leaders1(edge,dic):
    # Looks up which clusters of the Union-Find contain the endpoints of an edge.
    # INPUT: The edge (its first two items are the nodes) and the current Union-Find structure
    # OUTPUT: A list, in dictionary order, of every leader whose cluster holds at least one endpoint
    return [leader for leader,members in dic.items() if edge[0] in members or edge[1] in members]

In [3]:
# K-CLUSTERING - 1ST QUESTION

def kClustering_Maximun_Spacing(k,Graph):
    # Greedily merges the two closest clusters until only k remain, then reports the spacing.
    # INPUT: Number of clusters k and the weighted Graph
    # OUTPUT: Weight (converted with int()) of the lightest edge still joining two different clusters
    clusters=UnionFind_Initialize1(Graph)

    while len(clusters)>k:
        # The lightest crossing edge tells which two clusters to merge next.
        lightest=Closest_Pair_Separated_Points1(clusters,Graph)
        found=Find_Leaders1(lightest,clusters)
        clusters=UnionFind_Fuse1(clusters,found[0],found[1])

    spacing_edge=Closest_Pair_Separated_Points1(clusters,Graph)
    return int(spacing_edge[2]['weight'])

# Question 1: spacing of the 4-clustering of G; the value is displayed as the cell output.
kClustering_Maximun_Spacing(4,G)

106

In [4]:
# USEFUL FUNCTIONS FOR THE 2ND QUESTION

from scipy.spatial import distance

def UnionFind_Initialize2(lista): 
    # Builds the starting Union-Find for a node dictionary, in which every node is its own cluster.
    # INPUT: Node dictionary (node id -> bit list)
    # OUTPUT: A dictionary with one key per node id; each key is a "leader" whose value is a set holding only that id.
    # The real keys are used rather than range(len(...)), so the result stays valid
    # for dictionaries that are not keyed 0..n-1.
    return {node: {node} for node in lista.keys()}

def UnionFind_Fuse2(dic,leader1,leader2):
    # Merges the clusters of two leaders of the Union-Find. The smaller cluster is absorbed
    # into the larger one; on a tie leader1 survives.
    # INPUT: a dictionary describing the Union-Find (leader -> set of members) and the two leaders to be merged.
    # OUTPUT: the same dictionary object, updated in place, with the absorbed leader removed.
    # RAISES: KeyError if either leader is not a key of the dictionary.
    if leader1==leader2:
        # Merging a cluster with itself must be a no-op. Without this guard the
        # cluster would be updated with itself and then popped, losing all its nodes.
        dic[leader1]  # still raise KeyError for an unknown leader
        return dic
    if len(dic[leader2])>len(dic[leader1]):
        survivor,absorbed=leader2,leader1
    else:
        survivor,absorbed=leader1,leader2
    dic[survivor].update(dic.pop(absorbed))
    return dic

def Closest_Pair_List(union_find,node_list,distancia_cutoff):
    # Lists every pair of nodes from different clusters whose Hamming distance is below the cutoff.
    # INPUT: The current Union-Find (leader -> set of node ids), a node_list dictionary
    #        (node id -> bit list, all of the same length) and the distance cutoff.
    # RETURNS: A list of (node1, node2, Hamming distance) tuples sorted by increasing distance;
    #          node1 belongs to the cluster with the smaller leader key.
    if not node_list:
        return []
    # Bit length taken from any entry, so the dictionary does not need a key 0.
    m=len(next(iter(node_list.values())))

    # Number of set bits of every clustered node, computed once instead of once per pair.
    popcount={}
    for members in union_find.values():
        for node in members:
            popcount[node]=sum(node_list[node])

    Lista=[]
    leaders=list(union_find.keys())
    for i in leaders:
        for j in leaders:
            if not i<j:
                continue
            for x in union_find[i]:
                for y in union_find[j]:
                    # Two vectors differ in at least |popcount difference| positions, so this
                    # cheap test prunes pairs that cannot be close. It is applied to the actual
                    # pair of nodes, not to the leaders, so it is valid for merged clusters too.
                    if abs(popcount[y]-popcount[x])>=distancia_cutoff:
                        continue
                    # distance.hamming returns the fraction of differing positions; scale it by
                    # the real bit length m (not a hard-coded 24) to get the count.
                    d=distance.hamming(node_list[x],node_list[y])*m
                    if d<distancia_cutoff:
                        Lista.append((x,y,d))
    return sorted(Lista,key=lambda pair: pair[2])

def Find_Leaders2(edge,dic):
    # Looks up which clusters of the Union-Find contain the endpoints of an edge.
    # INPUT: The edge (its first two items are the nodes) and the current Union-Find structure
    # OUTPUT: A list, in dictionary order, of every leader whose cluster holds at least one endpoint
    return [leader for leader,members in dic.items() if edge[0] in members or edge[1] in members]

In [5]:
# K-CLUSTERING V1 - 2ND QUESTION (USING MY UNION-FIND IMPLEMENTATION)

# This function merges every pair of nodes whose Hamming distance is below the cutoff and counts the clusters that remain.
# INPUT: A dictionary containing the list of nodes and the Hamming distance cutoff below which two nodes are merged
# OUTPUT: Returns k, the number of clusters (leaders) left once all such pairs have been merged

def kClustering_Findingkv1(node_list,Distance_Cutoff):
    # Merges all pairs of nodes closer than Distance_Cutoff, closest pairs first, and counts the clusters left.
    # INPUT: Node dictionary and the Hamming distance cutoff
    # OUTPUT: Number of clusters remaining after all merges
    clusters=UnionFind_Initialize2(node_list)
    close_pairs=Closest_Pair_List(clusters,node_list,Distance_Cutoff)
    for pair in close_pairs:
        found=Find_Leaders2(pair,clusters)
        # Anything other than two leaders means there is nothing to merge for this pair
        # (a single leader: both endpoints already share a cluster).
        if len(found)!=2:
            continue
        clusters=UnionFind_Fuse2(clusters,found[0],found[1])
    return len(clusters)

#----------------------------------------
import time
# perf_counter is the clock with the highest available resolution, meant for
# measuring short durations; time.time() is a wall clock and may be adjusted mid-run.
start_time = time.perf_counter()
print(kClustering_Findingkv1(NodeList,3))
print(time.perf_counter() - start_time)

989
12.652002096176147


In [6]:
# K-CLUSTERING V2 - 2ND QUESTION (USING NETWORKX'S UNION-FIND)

# This function merges every pair of nodes whose Hamming distance is below the cutoff and counts the clusters that remain.
# INPUT: A dictionary containing the list of nodes and the Hamming distance cutoff below which two nodes are merged
# OUTPUT: Returns k, the number of clusters (leaders) left once all such pairs have been merged

from networkx.utils.union_find import UnionFind
from scipy.spatial import distance

def kClustering_Findingkv2(node_list,Distance_Cutoff):
    # Merges all pairs of nodes closer than Distance_Cutoff using networkx's UnionFind and counts the clusters left.
    # INPUT: Node dictionary (node id -> bit list, all of the same length) and the Hamming distance cutoff
    # OUTPUT: Number of clusters remaining after all merges
    if not node_list:
        return 0
    # Bit length taken from any entry, so the dictionary does not need a key 0.
    nbits=len(next(iter(node_list.values())))

    Union_Find=UnionFind(node_list.keys()) # Initializes Union_Find

    # Leaders are collected once, before any union; at that point every node is its own
    # leader, so the loops below visit every pair of nodes.
    Leaders_List=set(Union_Find[x] for x in Union_Find)

    # Number of set bits per node, computed once instead of once per pair.
    popcount={leader: sum(node_list[leader]) for leader in Leaders_List}

    for i in Leaders_List:
        for j in Leaders_List:
            if not i<j:
                continue
            # Two vectors differ in at least |popcount difference| positions, so this
            # cheap test skips pairs that cannot be closer than the cutoff.
            if abs(popcount[i]-popcount[j])>=Distance_Cutoff:
                continue
            # distance.hamming returns the fraction of differing positions.
            if nbits*distance.hamming(node_list[i],node_list[j])<Distance_Cutoff:
                Union_Find.union(i,j)

    return len(list(Union_Find.to_sets()))

#----------------------------------------
import time
# perf_counter is the clock with the highest available resolution, meant for
# measuring short durations; time.time() is a wall clock and may be adjusted mid-run.
start_time = time.perf_counter()
print(kClustering_Findingkv2(NodeList,3))
print(time.perf_counter() - start_time)

989
10.186846017837524
