In [4]:
import networkx as nx
import numpy as np
import scipy as sc
import os
import re
import csv
from scipy import sparse, io

In [5]:

def read_graphfile(datadir, dataname, max_nodes=None):
    ''' Read a graph-kernel benchmark dataset into a list of NetworkX graphs.

    Data format: https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets
    Node ids and graph ids start at 1 in the files.

    Files are looked up as <datadir>/<dataname>/<dataname><suffix>:
        _graph_indicator.txt   graph id of each node, one per line (required)
        _graph_labels.txt      integer label of each graph, one per line (required)
        _A.txt                 one "src, dst" edge per line (required)
        _node_labels.txt       integer label of each node, one per line (optional)
        _node_attributes.txt   comma/whitespace separated floats per node (optional)

    Args:
        datadir: directory that contains the dataset folder.
        dataname: dataset name (folder name and file prefix).
        max_nodes: if not None, graphs with more than this many nodes are dropped.

    Returns:
        (graphs, num_unique_node_labels)
        graphs: list of networkx graphs with nodes relabelled to 0..n-1.
            G.graph['label'] holds the graph label remapped to 0..k-1 in order
            of first appearance in the label file. Nodes carry 'label'
            (shifted so the smallest label is 0) and/or 'feat' when the
            matching file exists; G.graph['feat_dim'] is set with 'feat'.
        num_unique_node_labels: max - min + 1 over the node labels (labels are
            assumed consecutive), or 0 when there is no node label file.

    Graphs are built from the edge list only, so nodes without any edge are
    not part of the result and a graph without edges comes back empty.
    Requires NetworkX >= 2.0 (uses the G.nodes[u] attribute view).
    '''
    prefix = os.path.join(datadir, dataname, dataname)

    # graph_indic[node_id] -> id of the graph the node belongs to.
    # The node id is the 1-based line number.
    graph_indic = {}
    with open(prefix + '_graph_indicator.txt') as f:
        for node_id, line in enumerate(f, start=1):
            graph_indic[node_id] = int(line)

    # Optional node labels, shifted so that the smallest label becomes 0.
    node_labels = []
    # Defined up front so the return below works when the file is missing.
    num_unique_node_labels = 0
    try:
        with open(prefix + '_node_labels.txt') as f:
            node_labels = [int(line) for line in f]
    except IOError:
        print('No node labels')
    if node_labels:
        min_label_val = min(node_labels)
        # assume that node labels are consecutive
        num_unique_node_labels = max(node_labels) - min_label_val + 1
        node_labels = [l - min_label_val for l in node_labels]

    # Optional node attributes: one float vector per node.
    node_attrs = []
    try:
        with open(prefix + '_node_attributes.txt') as f:
            for line in f:
                fields = re.split(r"[,\s]+", line.strip())
                attrs = [float(attr) for attr in fields if attr != '']
                node_attrs.append(np.array(attrs))
    except IOError:
        print('No node attributes')

    # Graph labels, remapped to 0..k-1 in order of first appearance.
    with open(prefix + '_graph_labels.txt') as f:
        raw_graph_labels = [int(line) for line in f]
    label_map_to_int = {}
    for val in raw_graph_labels:
        label_map_to_int.setdefault(val, len(label_map_to_int))
    graph_labels = np.array([label_map_to_int[val] for val in raw_graph_labels])

    # Edges grouped per graph; an edge is filed under the graph of its source node.
    adj_list = {i: [] for i in range(1, len(graph_labels) + 1)}
    with open(prefix + '_A.txt') as f:
        for line in f:
            fields = line.split(",")
            e0, e1 = int(fields[0]), int(fields[1])
            adj_list[graph_indic[e0]].append((e0, e1))

    graphs = []
    for i in range(1, 1 + len(adj_list)):
        # indexed from 1 here
        G = nx.from_edgelist(adj_list[i])
        if max_nodes is not None and G.number_of_nodes() > max_nodes:
            continue

        # add features and labels
        G.graph['label'] = graph_labels[i - 1]
        for u in G.nodes():
            if len(node_labels) > 0:
                G.nodes[u]['label'] = node_labels[u - 1]
            if len(node_attrs) > 0:
                G.nodes[u]['feat'] = node_attrs[u - 1]
        if len(node_attrs) > 0:
            G.graph['feat_dim'] = node_attrs[0].shape[0]

        # relabeling: indexed from 0, in the graph's own node order
        mapping = {n: new_id for new_id, n in enumerate(G.nodes())}
        graphs.append(nx.relabel_nodes(G, mapping))
    return graphs, num_unique_node_labels

In [6]:
# Dataset location: read_graphfile reads <datadir>/<dataname>/<dataname>_*.txt
datadir = ""
dataname = "MSRC_9"
# Returns the graphs and the size of the node-label range (max - min + 1).
Gs, nb_unique_node_labels = read_graphfile(datadir, dataname, max_nodes=None)

No node attributes


In [7]:
# Keep graphs with more than 5 and fewer than 128 nodes.
Gs = [G for G in Gs if 5 < len(G.nodes) < 128]

In [8]:
# Number of graphs left after the size filter.
len(Gs)

221

In [9]:
# Sanity check: node attributes, edges and graph label of the first graph.
Gs[0].nodes(data=True), Gs[0].edges, Gs[0].graph['label']

(NodeDataView({0: {'label': 2}, 1: {'label': 2}, 2: {'label': 2}, 3: {'label': 2}, 4: {'label': 2}, 5: {'label': 2}, 6: {'label': 2}, 7: {'label': 2}, 8: {'label': 2}, 9: {'label': 2}, 10: {'label': 2}, 11: {'label': 2}, 12: {'label': 2}, 13: {'label': 2}, 14: {'label': 4}, 15: {'label': 2}, 16: {'label': 2}, 17: {'label': 2}, 18: {'label': 4}, 19: {'label': 2}, 20: {'label': 2}, 21: {'label': 2}, 22: {'label': 2}, 23: {'label': 2}, 24: {'label': 4}, 25: {'label': 2}, 26: {'label': 2}, 27: {'label': 2}, 28: {'label': 2}, 29: {'label': 2}, 30: {'label': 2}, 31: {'label': 2}, 32: {'label': 2}, 33: {'label': 2}, 34: {'label': 2}, 35: {'label': 2}, 36: {'label': 2}, 37: {'label': 2}, 38: {'label': 2}, 39: {'label': 2}, 40: {'label': 2}, 41: {'label': 2}, 42: {'label': 2}, 43: {'label': 2}, 44: {'label': 2}, 45: {'label': 2}, 46: {'label': 2}}),
 EdgeView([(0, 3), (0, 5), (1, 9), (1, 2), (1, 4), (1, 6), (2, 3), (2, 5), (2, 6), (2, 7), (2, 12), (2, 14), (3, 5), (4, 9), (5, 7), (5, 10), (5, 1

In [10]:
# Indices of graphs with zero nodes; an empty array means there are none.
np.where(np.asarray([len(G.nodes) for G in Gs]) == 0)

(array([], dtype=int64),)

In [11]:
# Size of the node-label range returned by read_graphfile.
nb_unique_node_labels

10

In [12]:
# Number of graphs per graph label.
c = {}
for G in Gs:
    label = G.graph['label']
    # dict.get with a default replaces the `== None` test and the two branches.
    c[label] = c.get(label, 0) + 1

In [13]:
# Graphs per label: {label: count}.
c

{0: 23, 1: 30, 2: 30, 3: 29, 4: 30, 5: 30, 6: 19, 7: 30}

In [14]:
# Hold out 20% of the graphs for validation.
# A fixed seed keeps the split reproducible: every file written from
# train_idx / test_idx then describes the same partition after a kernel restart.
SPLIT_SEED = 0
rng = np.random.RandomState(SPLIT_SEED)

idx = range(len(Gs))
shuffled = rng.permutation(len(Gs))
pivot = int(len(Gs)*0.2)
test_idx = shuffled[:pivot]
train_idx = shuffled[pivot:]

In [15]:
# Labels: one graph label per line. Training graphs go to <dataname>_label.txt,
# all other graphs to <dataname>_label_val.txt, both in the order of Gs.

# A set gives O(1) membership tests instead of scanning the index array per graph.
train_members = {int(i) for i in train_idx}

train_labels, test_labels = [], []
# `graph_idx` rather than `idx`, so the `idx` range of the split cell is not overwritten.
for graph_idx, G in enumerate(Gs):
    label = G.graph['label']
    if graph_idx in train_members:
        train_labels.append(label)
    else:
        test_labels.append(label)

with open(os.path.join(datadir, dataname+'_label.txt'), 'w') as f:
    f.writelines(str(label)+'\n' for label in train_labels)

with open(os.path.join(datadir, dataname+'_label_val.txt'), 'w') as f:
    f.writelines(str(label)+'\n' for label in test_labels)

In [27]:
# TK: token sequences. Each graph becomes one row: '[CLS]' followed by one
# token per node, in the graph's node order. A token is the node label, or the
# first node attribute cast to int when node_labels is False.
node_labels = True  # also read by the vocab cell

# A set gives O(1) membership tests instead of scanning the index array per graph.
train_members = {int(i) for i in train_idx}

train_tk, test_tk = [], []
node_vocab = []      # distinct tokens, in order of first appearance
seen_tokens = set()  # same content as node_vocab, for fast lookup
for graph_idx, G in enumerate(Gs):
    if node_labels:
        tokens = [attrs['label'] for _, attrs in G.nodes(data=True)]
    else:
        tokens = [int(attrs['feat'][0]) for _, attrs in G.nodes(data=True)]
    for tok in tokens:
        if tok not in seen_tokens:
            seen_tokens.add(tok)
            node_vocab.append(tok)
    row = ['[CLS]'] + tokens
    if graph_idx in train_members:
        train_tk.append(row)
    else:
        test_tk.append(row)


def _write_token_rows(path, rows):
    '''Write each token row space-separated, followed by an empty row.'''
    # newline='' is what the csv module asks for: the writer emits its own
    # line terminators and the file object must not translate them again.
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in rows:
            writer.writerow(row)
            writer.writerow([])


_write_token_rows(os.path.join(datadir, dataname+'_tk.txt'), train_tk)
_write_token_rows(os.path.join(datadir, dataname+'_tk_val.txt'), test_tk)

In [17]:
# ADJ: padded adjacency matrices, one Matrix Market file per graph

In [28]:
# Side length of the square matrices the adjacency cell writes; index 0 is
# set to all ones there, so a graph may have at most max_len - 1 nodes.
max_len = 128

In [29]:
# Write one max_len x max_len matrix per graph to <datadir>/adj in Matrix
# Market format. Layout of each matrix:
#   - rows/columns 1..n hold the graph's adjacency matrix (node i -> index i+1)
#   - the identity is added on the whole diagonal, padding included
#   - row 0 and column 0 are set to 1
# Files are numbered separately for the training and the held-out graphs.
sparsedir = os.path.join(datadir, 'adj')
os.makedirs(sparsedir, exist_ok=True)

# A set gives O(1) membership tests instead of scanning the index array per graph.
train_members = {int(i) for i in train_idx}

train_count, test_count = 0, 0
for graph_idx, G in enumerate(Gs):
    in_train = graph_idx in train_members
    if in_train:
        fname = str(train_count)+'_'+dataname+"_adj.mtx"
    else:
        fname = str(test_count)+'_'+dataname+"_adj_val.mtx"

    # nx.adj_matrix was removed in NetworkX 3.0; adjacency_matrix is the
    # supported name. toarray() returns a plain ndarray whether NetworkX
    # hands back a sparse matrix or a sparse array.
    adj = nx.adjacency_matrix(G.to_undirected()).toarray()
    n_nodes = adj.shape[0]
    if n_nodes + 1 > max_len:
        raise ValueError(
            "graph %d has %d nodes; at most %d fit into max_len=%d"
            % (graph_idx, n_nodes, max_len - 1, max_len))

    final = np.zeros((max_len, max_len), dtype=int)
    final[1:n_nodes+1, 1:n_nodes+1] = adj
    final += np.eye(max_len, dtype=int)
    final[:, 0] = 1
    final[0, :] = 1

    io.mmwrite(os.path.join(sparsedir, fname), sparse.csr_matrix(final))

    if in_train:
        train_count += 1
    else:
        test_count += 1

In [30]:
# Files written per split next to the size of each index array; the pairs should match.
train_count,len(train_idx), test_count,len(test_idx) 

(177, 177, 44, 44)

In [21]:
# Distinct node tokens in order of first appearance.
node_vocab

[2, 4, 7, 0, 5, 1, 6, 8, 9, 3]

In [22]:
# VOCAB: node-token vocabulary file

In [23]:
# Node-token vocabulary: the ids 0 .. nb_unique_node_labels-1, one per line,
# followed by the special tokens [CLS] and [MASK].
with open(os.path.join(datadir, dataname+'-vocab.txt'), 'w') as f:
    if node_labels is False:
        # Tokens came from node attributes: size the vocabulary from the largest token seen.
        nb_unique_node_labels = np.max(node_vocab)+1

    vocab_lines = [str(token_id)+'\n' for token_id in range(nb_unique_node_labels)]
    vocab_lines += ["[CLS]"+'\n', "[MASK]"+'\n']
    f.writelines(vocab_lines)

In [24]:
# VOCAB LABEL: graph-label vocabulary file

In [25]:
# Distinct graph labels, in the order they were first counted.
c.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7])

In [26]:
# Graph-label vocabulary: one distinct graph label per line, in the key order of c.
label_vocab_path = os.path.join(datadir, dataname+'-vocab_label.txt')
with open(label_vocab_path, 'w') as f:
    f.writelines(str(label)+'\n' for label in c)