In [1]:
# LOADS NODE LIST FOR THE 2ND QUESTION

# Reads the file and stores the bit pattern of every node in a list called txt.
# Each entry of txt is one node's bits with the separating spaces removed
# (a 24-character binary string for the expected input).

from pathlib import Path

# splitlines() handles both "\n" and "\r\n" line endings and yields no trailing empty
# entry, so the node count does not need to be hard-coded just to drop a final blank line.
raw_lines = Path('clustering_big.txt').read_text().splitlines()

# The first line is skipped because it is not a node (presumably a header giving the node
# count and bit width -- confirm against the data file). Blank lines are ignored.
txt = [line.replace(' ', '') for line in raw_lines[1:] if line.strip()]
len(txt)

200000

In [2]:
# ALL UNIQUE BIT SEQUENCES

# Builds NodeList, a dictionary keyed by the integer value of each node's binary string;
# the value stored under a key is the binary string itself.
# Nodes that share the same binary string collapse onto a single key, which takes care of
# all pairs of nodes at Hamming distance 0.
# NodeSet holds just the integer keys, for fast membership tests.

NodeList = {int(bits, 2): bits for bits in txt}
NodeSet = set(NodeList)
len(NodeSet)

198788

In [3]:
# GENERATES ALL NODES A HAMMING DISTANCE 1 AWAY

# Distance1 maps every node in NodeList (integer key) to the set of integers whose bit
# patterns differ from that node in exactly one position.
# For 24-bit nodes there are 24 such neighbours per node.

# Flipping a single bit is an XOR with a one-bit mask, so no string has to be rebuilt and
# re-parsed for each neighbour. The bit width is taken from the node's own string rather
# than a literal 24.
# A dict comprehension also gives every key its own set; dict.fromkeys(keys, set()) would
# make all keys share one set object until each value is overwritten.
Distance1 = {
    node: {node ^ (1 << bit) for bit in range(len(bits))}
    for node, bits in NodeList.items()
}

In [4]:
# GENERATES ALL NODES A HAMMING DISTANCE 2 AWAY

# Distance2 maps every node in NodeList (integer key) to the set of integers whose bit
# patterns differ from that node in exactly two positions.
# For 24-bit nodes there are 276 (= 24 * 23 / 2) such neighbours per node.

# Each neighbour is the node XOR-ed with two distinct one-bit masks. The inner range stops
# below the outer index, so every unordered pair of positions is visited exactly once
# instead of scanning all ordered pairs and filtering. The bit width is taken from the
# node's own string rather than a literal 24.
# NOTE: this materialises one set per node, which is memory-hungry for large inputs.
Distance2 = {
    node: {
        node ^ (1 << high) ^ (1 << low)
        for high in range(1, len(bits))
        for low in range(high)
    }
    for node, bits in NodeList.items()
}

In [6]:
# THE UNION-FIND

from networkx.utils.union_find import UnionFind

# Start with every node of NodeSet in its own singleton cluster.
UF = UnionFind(NodeSet)

# Walk the candidate neighbours of every node: first those at Hamming distance 1, then
# those at Hamming distance 2. A candidate that is itself a node of the graph (i.e. it is
# in NodeSet) is merged into the same cluster as the node it was generated from.
for neighbours in (Distance1, Distance2):
    for node, candidates in neighbours.items():
        for candidate in candidates:
            if candidate in NodeSet:
                UF.union(node, candidate)

# Final answer: the number of clusters left once all nodes at Hamming distance 2 or less
# have been merged, i.e. nodes in different clusters are at Hamming distance 3 or more.
len(tuple(UF.to_sets()))

6118