In [1]:
import networkx as nx
import scipy.sparse as sp
import numpy as np
import math
import matplotlib.pyplot as plt

In [2]:
# Parse the tab-separated edge list into an undirected graph. The context
# manager guarantees the file handle is closed once parsing is done.
with open('yeast.edgelist','r') as raw_edge_list:
    G = nx.parse_edgelist(raw_edge_list, delimiter='\t', create_using=nx.Graph(), nodetype=str, data=(('weight', float),))
# Get graph edges and nodes from networkx graph object
nodes = G.nodes
edges = G.edges

print("Graph's Nodes : {} / Edges : {}".format(len(nodes), len(edges)))

Graph's Nodes : 6526 / Edges : 532180


In [3]:
#Graph Visualization

def visualize_graph(file_name, graph):
    """Draw `graph` with a Graphviz layout, save it as a PNG and display it.

    Parameters
    ----------
    file_name : str
        Used as the prefix of the figure title and as the base name of the
        saved image, which is written to ``images/<file_name>.png``.
    graph : networkx.Graph
        Graph to draw.
    """
    import os  # local import: only needed to make sure the output folder exists

    title = file_name
    plt.figure(figsize=(20,20))
    node_size = 15
    node_alpha = .6
    node_color = "#1E90FF"  # blue, as announced in the figure title

    edge_alpha = 0.15
    edge_width = 2

    pos = nx.drawing.nx_agraph.graphviz_layout(graph)
    nx.draw_networkx_nodes(graph, pos, node_size=node_size, node_color=node_color, alpha=node_alpha)
    nx.draw_networkx_edges(graph, pos, width=edge_width, alpha=edge_alpha)
    plt.axis('off')
    # The title must be set before saving, otherwise the PNG is written without it.
    plt.title("{} / Blue - Protein".format(title))
    # savefig does not create missing folders.
    os.makedirs("images", exist_ok=True)
    plt.savefig("images/{}.png".format(title))
    plt.show()

In [4]:
# Build the adjacency matrix of G.
adj = nx.adjacency_matrix(G)
# Display it in dense form (this materialises the full n x n matrix).
adj.todense()

matrix([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
# Drop self-loops: subtract a matrix holding only adj's main diagonal, then
# purge the explicit zeros the subtraction leaves in the sparse structure.
_diagonal = adj.diagonal()[np.newaxis, :]
adj = adj - sp.dia_matrix((_diagonal, [0]), shape=adj.shape)
adj.eliminate_zeros()
adj.todense()

matrix([[0, 1, 1, ..., 0, 0, 0],
        [1, 0, 1, ..., 0, 0, 0],
        [1, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [7]:
# Edge budget per split: 5% validation, 25% test (both rounded down),
# and whatever remains goes to training.
num_edges = len(edges)
num_val_edges = int(num_edges * 0.05)
num_test_edges = int(num_edges * 0.25)
num_train_edges = num_edges - num_val_edges - num_test_edges

The following two functions are taken from 
http://snap.stanford.edu/deepnetbio-ismb/ipynb/Graph+Convolutional+Prediction+of+Protein+Interactions+in+Yeast.html and are used to partition the dataset. I have also tried a second approach, shown further below. 


In [8]:
def sparse_to_tuple(sparse_mx):
    """Unpack a SciPy sparse matrix into ``(coords, values, shape)``.

    ``coords`` has one ``[row, col]`` pair per stored entry, ``values`` holds
    the matching stored data and ``shape`` is the matrix shape. Inputs that are
    not already in COO format are converted first.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    row_col_pairs = np.vstack([coo.row, coo.col]).T
    return row_col_pairs, coo.data, coo.shape

In [9]:
def _sample_negative_edges(num_samples, num_nodes, real_pairs, rng):
    """Draw `num_samples` distinct node pairs that are not edges of the graph.

    `real_pairs` is a set of ``(low, high)`` node-index tuples, one per real
    undirected edge. A candidate is rejected when it is a self-pair, a real
    edge in either orientation, or a pair that was already drawn.
    """
    max_pairs = num_nodes * (num_nodes - 1) // 2
    if num_samples > max_pairs - len(real_pairs):
        # Without this guard the rejection loop below would never terminate.
        raise ValueError(
            "cannot sample {} negative edges: only {} node pairs are not edges".format(
                num_samples, max_pairs - len(real_pairs)))

    negatives = []
    used_pairs = set()
    while len(negatives) < num_samples:
        # Draw as many candidates as are still missing, then filter them.
        n_rnd = num_samples - len(negatives)
        rnd = rng.randint(0, num_nodes, size=2 * n_rnd)
        for idx_i, idx_j in zip(rnd[:n_rnd], rnd[n_rnd:]):
            if idx_i == idx_j:
                continue
            # Orientation-free key: one lookup covers (i, j) and (j, i).
            pair = (int(min(idx_i, idx_j)), int(max(idx_i, idx_j)))
            if pair in real_pairs or pair in used_pairs:
                continue
            used_pairs.add(pair)
            negatives.append([idx_i, idx_j])
    return negatives


def mask_test_edges(adj, seed=None):
    """Split the edges of `adj` into train / validation / test sets.

    The validation and test sets each receive 2% of the undirected edges
    (rounded down) as positive links, plus the same number of sampled
    negative links, i.e. node pairs that are not edges of the graph.

    Parameters
    ----------
    adj : scipy sparse matrix
        Adjacency matrix; its diagonal is ignored. Positive edges are read
        from the upper triangle.
    seed : int, optional
        When given, a private ``np.random.RandomState(seed)`` drives the split
        so that it is reproducible. When omitted, NumPy's global generator is
        used.

    Returns
    -------
    adj_train : scipy.sparse.csr_matrix
        Symmetric adjacency matrix built from the training edges only.
    train_edges, val_edges, test_edges : numpy.ndarray
        Arrays of ``[row, col]`` pairs, one row per positive edge.
    val_edges_false, test_edges_false : list of [int, int]
        Sampled negative pairs, as many as the matching positive set.

    Raises
    ------
    ValueError
        If the graph has fewer non-edges than negative samples are required.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()

    # Upper triangle: each undirected edge exactly once.
    edges = sparse_to_tuple(sp.triu(adj))[0]
    edges_all = sparse_to_tuple(adj)[0]
    num_test = int(np.floor(edges.shape[0] / 50.))
    num_val = int(np.floor(edges.shape[0] / 50.))

    all_edge_idx = list(range(edges.shape[0]))
    rng.shuffle(all_edge_idx)
    val_edge_idx = all_edge_idx[:num_val]
    test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
    test_edges = edges[test_edge_idx]
    val_edges = edges[val_edge_idx]
    # Explicit integer dtype: an empty index list would otherwise become an
    # empty float array, which np.delete cannot use as an index.
    held_out_idx = np.asarray(test_edge_idx + val_edge_idx, dtype=int)
    train_edges = np.delete(edges, held_out_idx, axis=0)

    # Every real edge as an orientation-free (low, high) tuple, for O(1) lookups.
    real_pairs = set(zip(edges_all.min(axis=1).tolist(), edges_all.max(axis=1).tolist()))

    # Negatives are rejected against ALL real edges, so a held-out test edge
    # can never be handed out as a validation negative.
    num_nodes = adj.shape[0]
    test_edges_false = _sample_negative_edges(len(test_edges), num_nodes, real_pairs, rng)
    val_edges_false = _sample_negative_edges(len(val_edges), num_nodes, real_pairs, rng)

    # Re-build adj matrix
    data = np.ones(train_edges.shape[0])
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T

    return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false

In [None]:
# Seed NumPy's global generator so the random split below is reproducible
# from one run to the next.
np.random.seed(42)
adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)

In [10]:
# Build a networkx graph from the training adjacency matrix.
# NOTE(review): `from_scipy_sparse_matrix` was removed in NetworkX 3.0 in favour
# of `from_scipy_sparse_array` - confirm against the installed version.
gf = nx.from_scipy_sparse_matrix(adj_train)

The following cells also partition the dataset, but by splitting the edge array directly rather than picking edges from the adjacency matrix. I will train a GCN model with both approaches and see which split works best.

In [None]:
#Prepare train graph
random_train_edge_ind = np.random.choice(arr_edges.shape[0],num_train_edges,replace=False)
train_arr = arr_edges[random_train_edge_ind,:]
g_train = nx.Graph()
g_train.add_edges_from(train_arr)
visualize_graph("",g_train)

In [None]:
#Prepare test graph
random_test_edge_ind = np.random.choice(arr_edges.shape[0],num_test_edges,replace=False)
test_arr = arr_edges[random_test_edge_ind,:]
g_test = nx.Graph()
g_test.add_edges_from(test_arr)
visualize_graph("",g_test)

In [None]:
#prepare validation graph
random_val_edge_ind = np.random.choice(arr_edges.shape[0],num_val_edges,replace=False)
val_arr = arr_edges[random_val_edge_ind,:]
g_val = nx.Graph()
g_val.add_edges_from(val_arr)
visualize_graph("",g_val)

In [None]:
# imshow needs a dense array; a SciPy sparse matrix cannot be rendered
# directly, so convert it first.
plt.imshow(adj.toarray(), cmap='hot', interpolation='nearest')
plt.show()
