**Investigating Graph Node Embeddings**

Gian Favero | ECSE 556 | December 1st, 2023

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import pearsonr, spearmanr
from helpers import *

First we load the .edge file associated with the “HumanNet Co-Expression of Human Genes (hn_HS_CX) network. The file will be cleaned up in such a way that only the relevant columns and values are kept.

In [2]:
edges_df = pd.read_csv('9606.hn_HS_CX.edge', sep='\t', header=None)
edges_df = edges_df.iloc[:, :3]
edges_df.columns = ['Node 1', 'Node 2', 'Weight']
print(edges_df.head())

            Node 1           Node 2    Weight
0  ENSG00000284589  ENSG00000276821  0.000008
1  ENSG00000284589  ENSG00000267534  0.000006
2  ENSG00000284589  ENSG00000178802  0.000006
3  ENSG00000284589  ENSG00000172772  0.000007
4  ENSG00000284589  ENSG00000167751  0.000008


Now we have to start forming the adjacency matrix that represents the network. We can get a set of every node in the network and then augment the dataset to ensure the network is undirected.

In [3]:
# Get set of all nodes
nodes = set(edges_df['Node 1'])
nodes = nodes.union(set(edges_df['Node 2']))

print('Number of nodes:', len(nodes))
print('Number of edges:', len(edges_df))

# Convert nodes to indices in edges_df
nodes_dict = dict(zip(nodes, range(len(nodes))))
edge_id_df = edges_df.copy()
edge_id_df['Node 1'] = edge_id_df['Node 1'].map(nodes_dict)
edge_id_df['Node 2'] = edge_id_df['Node 2'].map(nodes_dict)

# Initialize adjacency matrix
adj_mat = np.zeros((len(nodes), len(nodes)))

# Fill adjacency matrix
for i in tqdm(range(len(edge_id_df))):
    row = edge_id_df.iloc[i]
    adj_mat[int(row['Node 1']), int(row['Node 2'])] = row['Weight']

# If there are any self-loops, remove them
np.fill_diagonal(adj_mat, 0)

Number of nodes: 10938
Number of edges: 154387


 65%|██████▌   | 100555/154387 [00:08<00:04, 11875.11it/s]

In [None]:
# Adjust adjacency matrix to be symmetric, undirected
adj_mat = process_symmetric_entries(adj_mat)

# Find all subgraphs and get list of nodes that belong to subgraphs with less than 5 nodes
subgraphs = find_subgraphs(adj_mat)
nodes_remove = nodes_to_remove(subgraphs, 5)

# Remove nodes from adjacency matrix
adj_mat = np.delete(adj_mat, nodes_remove, axis=0)
adj_mat = np.delete(adj_mat, nodes_remove, axis=1)

# Remove nodes from nodes list
nodes = list(nodes)
nodes = [nodes[i] for i in range(len(nodes)) if i not in nodes_remove]
nodes_dict = dict(zip(nodes, range(len(nodes))))

# Remove rows from edges_df that contain nodes that were removed
edges_df = edges_df[edges_df['Node 1'].isin(nodes)]
edges_df = edges_df[edges_df['Node 2'].isin(nodes)]

edges_df['Node 1'] = edges_df['Node 1'].map(nodes_dict)
edges_df['Node 2'] = edges_df['Node 2'].map(nodes_dict)

# Normalize adjacency matrix by row
tr_mat = adj_mat / adj_mat.sum(axis=1, keepdims=True)

# Print size of network
print('Number of nodes: ', len(nodes))
print('Number of edges: ', len(edges_df))

# Choose 5000 random nodes to remove from network
nodes_remove = np.random.choice(range(len(nodes)), size=9500, replace=False)

# Remove nodes from adjacency matrix
adj_mat = np.delete(adj_mat, nodes_remove, axis=0)
adj_mat = np.delete(adj_mat, nodes_remove, axis=1)

Number of nodes:  10825
Number of edges:  154328


**Community Detection**

Using the same network, we can perform a community detection with various algorithms.

In [None]:
import networkx as nx
import community as community_louvain
from networkx.algorithms import community

import pickle

# Define graph
G = nx.from_numpy_matrix(adj_mat)

# Apply Clauset-Newman-Moore algorithm
clauset_newman_communities = list(community.greedy_modularity_communities(G))
clauset_newman_clusters = {f'Community_{i+1}': list(community_set) for i, community_set in enumerate(clauset_newman_communities)}

# Apply Louvain algorithm
partition = community_louvain.best_partition(G)
louvain_clusters = {f'Community_{i+1}': [node for node, community_id in partition.items() if community_id == i] for i in set(partition.values())}

# Apply Girvan-Newman algorithm
girvan_newman_communities = next(community.girvan_newman(G))
girvan_newman_clusters = {f'Community_{i+1}': list(community_set) for i, community_set in enumerate(girvan_newman_communities)}

# Save the clusters to a dictionary
all_clusters = {
    'Clauset_Newman_Moore': clauset_newman_clusters,
    'Louvain': louvain_clusters,
    'Girvan_Newman': girvan_newman_clusters
}

# Save the dictionary to a pickle file
with open('community_clusters.pkl', 'wb') as file:
    pickle.dump(all_clusters, file)

print("Community clusters saved to 'community_clusters.pkl'")

AttributeError: module 'community' has no attribute 'best_partition'

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import itertools
import pickle

# Load the clusters from the pickle file
with open('Communities/community_clusters.pkl', 'rb') as file:
    all_clusters = pickle.load(file)

'''greedy_clusters = all_clusters['Clauset_Newman_Moore']
greedy_color_map = {node: i for i, cluster in enumerate(greedy_clusters) for node in cluster}'''

louvain_clusters = all_clusters['Louvain']
louvain_color_map = {node: i for i, cluster in enumerate(louvain_clusters) for node in cluster}

girvan_clusters = all_clusters['Girvan_Newman']
girvan_color_map = {node: i for i, cluster in enumerate(girvan_clusters) for node in cluster}

# Draw the graph with different colors for each cluster
pos = nx.spring_layout(G)  # You can use different layout algorithms
plt.figure(figsize=(12, 8))

# Girvan-Newman
plt.subplot(131)
nx.draw(G, pos, node_color=[girvan_color_map[node] for node in G.nodes], with_labels=True, cmap='viridis')
plt.title('Girvan-Newman')

# Greedy Modularity
plt.subplot(132)
nx.draw(G, pos, node_color=[greedy_color_map[node] for node in G.nodes], with_labels=True, cmap='viridis')
plt.title('Greedy Modularity')

# Louvain
plt.subplot(133)
nx.draw(G, pos, node_color=[louvain_color_map[node] for node in G.nodes], with_labels=True, cmap='viridis')
plt.title('Louvain')

plt.tight_layout()
plt.show()