In [89]:
import numpy as np
import scipy.sparse as sp
from scipy import io
import networkx as nx
import os
import csv

In [12]:
def normalize(mx):
    """Row-normalize a sparse matrix.

    Each row is divided by its sum, so every row with a non-zero sum adds up
    to 1 afterwards. Rows whose sum is zero come out as all-zero rows.

    Parameters
    ----------
    mx : scipy sparse matrix
        Matrix to normalize. It is not modified.

    Returns
    -------
    scipy sparse matrix
        ``diag(1 / rowsum) @ mx``.
    """
    rowsum = np.array(mx.sum(1)).flatten()
    # np.power rejects negative exponents on integer arrays, so integer/bool
    # sums are promoted to float first. Float inputs keep their own dtype.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # A zero row sum yields inf here on purpose (reset to 0 just below), so
    # the divide-by-zero RuntimeWarning is only noise.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)

In [59]:
# Class-label strings (looked up from the last column of the .content file in
# load_data) mapped to integer class ids. The position in the list is the id.
label_map = {
    name: class_id
    for class_id, name in enumerate([
        "Case_Based",
        "Genetic_Algorithms",
        "Neural_Networks",
        "Probabilistic_Methods",
        "Reinforcement_Learning",
        "Rule_Learning",
        "Theory",
    ])
}

In [70]:
def load_data(path="cora/", dataset="cora"):
    """Load a citation-network dataset (cora only for now).

    Reads ``<dataset>.content`` (per row: paper id, feature columns, class
    label) and ``<dataset>.cites`` (per row: two paper ids) from ``path``.

    Parameters
    ----------
    path : str
        Directory containing the dataset files. A trailing separator is
        optional.
    dataset : str
        Base name of the ``.content`` / ``.cites`` files.

    Returns
    -------
    adj : scipy sparse matrix
        Symmetrized adjacency matrix. It is returned un-normalized and no
        self-loops are added here.
    features : scipy sparse matrix
        Row-normalized float32 feature matrix.
    labels_int : list of int
        Class id per node, looked up in the module-level ``label_map``.
    idx_train : range
        ``range(1000)``.
    idx_val : None
        No validation split is produced.
    idx_test : range
        ``range(1000, 1500)``.

    Raises
    ------
    ValueError
        If the ``.cites`` file mentions a paper id that does not appear in the
        ``.content`` file.
    KeyError
        If a class label is not a key of ``label_map``.
    """
    print('Loading {} dataset...'.format(dataset))

    # os.path.join makes the trailing separator on `path` optional.
    content_file = os.path.join(path, "{}.content".format(dataset))
    cites_file = os.path.join(path, "{}.cites".format(dataset))

    idx_features_labels = np.genfromtxt(content_file, dtype=np.dtype(str))
    n_nodes = idx_features_labels.shape[0]

    # Parse the text feature columns to float32 explicitly before handing
    # them to scipy.
    features = sp.csr_matrix(idx_features_labels[:, 1:-1].astype(np.float32))

    # build graph: map the raw paper ids to consecutive row indices
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt(cites_file, dtype=np.int32)
    try:
        mapped = [idx_map[j] for j in edges_unordered.flatten()]
    except KeyError as exc:
        raise ValueError(
            "{} references paper id {} which is not listed in {}".format(
                cites_file, exc.args[0], content_file)) from exc
    edges = np.array(mapped, dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(n_nodes, n_nodes),
                        dtype=np.float32)

    # build symmetric adjacency matrix: wherever adj.T holds the larger entry,
    # take it, so every edge is present in both directions
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)
    labels_int = [label_map[f] for f in idx_features_labels[:, -1]]

    # Fixed split by row position in the .content file.
    idx_train = range(1000)
    idx_val = None
    idx_test = range(1000, 1500)

    return adj, features, labels_int, idx_train, idx_val, idx_test

In [71]:
# Load the dataset with load_data's defaults (path="cora/", dataset="cora").
# `labels` receives the list of integer class ids that load_data returns.
adj, features, labels, idx_train, idx_val, idx_test = load_data()

Loading cora dataset...


In [66]:
# networkx 3.0 removed from_scipy_sparse_matrix; from_scipy_sparse_array
# (available since 2.7) replaces it. Fall back to the old name on older
# installs so the cell runs on either side of the change.
_graph_from_sparse = (getattr(nx, "from_scipy_sparse_array", None)
                      or nx.from_scipy_sparse_matrix)
G = _graph_from_sparse(adj)

In [100]:
# Side length of every padded adjacency matrix written by the export loop;
# neighbourhoods with more than max_len-1 nodes are skipped there.
max_len = 128

In [101]:
# dataname is embedded in every generated file name; datadir is the directory
# the generated files are written to.
dataname = "CORA"
datadir  = "cora"

In [106]:
# For each of the first 1500 nodes: take its 3-hop neighbourhood, write the
# padded adjacency matrix to <datadir>/adj/, and collect the neighbourhood's
# class labels as a token sequence for the train or test split.
train_count, test_count = 0, 0
train_tk, test_tk = [], []
node_vocab = []

# Output directory for the adjacency files; created once instead of being
# re-checked on every iteration.
sparsedir = os.path.join(datadir, 'adj')
os.makedirs(sparsedir, exist_ok=True)

# range(1500) covers idx_train (0-999) and idx_test (1000-1499); every index
# that is not in idx_train is treated as test.
for idx in range(1500):
    is_train = idx in idx_train

    G_sub = nx.ego_graph(G, idx, radius=3)
    # Despite the name, these are the class labels of the neighbourhood nodes.
    node_ids = [labels[n] for n in G_sub.nodes]
    if len(node_ids) > max_len-1:
        # Would not fit once row/column 0 is reserved for [CLS].
        continue

    # ADJ
    if is_train:
        fname = str(train_count)+'_'+dataname+"_adj.mtx"
    else:
        fname = str(test_count)+'_'+dataname+"_adj_val.mtx"

    G_u = G_sub.to_undirected()
    # nx.adj_matrix was removed in networkx 3.0; adjacency_matrix exists in
    # old and new releases, and .toarray() yields a plain ndarray either way.
    # A separate name is used so the full `adj` from load_data is not
    # overwritten by a sub-matrix.
    sub_adj = nx.adjacency_matrix(G_u).toarray()
    final = np.zeros((max_len, max_len), dtype=int)
    # Neighbourhood occupies rows/cols 1..n; the rest stays zero padding.
    final[1:sub_adj.shape[0]+1, 1:sub_adj.shape[1]+1] = sub_adj
    final += np.eye(max_len, dtype=int)
    # Row and column 0 belong to the [CLS] slot and are set to all ones.
    final[:, 0] = 1
    final[0, :] = 1

    io.mmwrite(os.path.join(sparsedir, fname), sp.csr_matrix(final))

    if is_train:
        train_count += 1
    else:
        test_count += 1

    # node_vocab stays a list because it is later passed to np.max.
    for label in node_ids:
        if label not in node_vocab:
            node_vocab.append(label)

    node_ids.insert(0, '[CLS]')
    if is_train:
        train_tk.append(node_ids)
    else:
        test_tk.append(node_ids)


In [107]:
# Number of train / test neighbourhoods that were kept and written to disk.
train_count, test_count

(541, 322)

In [104]:
# Vocabulary file: one line per integer id from 0 to max(node_vocab),
# followed by the two special tokens.
vocab_file = os.path.join(datadir, dataname+'-vocab.txt')
with open(vocab_file, 'w') as f:
    vocab_lines = [str(token_id)+'\n' for token_id in range(np.max(node_vocab)+1)]
    vocab_lines.append("[CLS]"+'\n')
    vocab_lines.append("[MASK]"+'\n')
    f.writelines(vocab_lines)

In [105]:
# Write the token sequences for both splits: one space-separated row per
# sequence, each followed by an empty row as a separator.
for suffix, sequences in (('_tk.txt', train_tk), ('_tk_val.txt', test_tk)):
    # newline='' is what the csv module asks for; without it the writer's
    # '\r\n' terminator gets translated again on platforms that rewrite '\n'.
    with open(os.path.join(datadir, dataname+suffix), 'w', newline='') as f:
        writer = csv.writer(f, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for tk in sequences:
            writer.writerow(tk)
            writer.writerow([])