In [1]:
import numpy as np
import scipy.sparse as sp

In [2]:
# Dataset name and directory prefix; the file read below is "<path><dataset>.content".
dataset="cora"
path="../data/cora/"
# Every column is read as a string; later cells slice ids (column 0),
# features (columns 1..-2) and labels (last column) out of this array.
idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset), dtype=np.dtype(str))

In [3]:
# All columns except the first and the last are converted to float32 and stored as CSR.
features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
# onehot_labels = encode_onehot(idx_features_labels[:, -1])
# features.shape

In [4]:
# The last column holds the class label of each row.
labels = idx_features_labels[:,-1]
# Sort the distinct labels so the class -> column assignment built from
# `classes` is reproducible across kernel restarts (the iteration order of a
# set of strings depends on the interpreter's hash seed).
classes = sorted(set(labels))
# len(labels)

In [5]:
# Map each class to its one-hot row. The classes are sorted so the mapping is
# reproducible, and the identity matrix is built once rather than once per class.
class_dict = dict(zip(sorted(classes), np.identity(len(classes))))

In [6]:
# Look up the one-hot row of every label and stack the rows into an int32 array.
labels_onehot = np.array(list(map(class_dict.get,labels)),dtype=np.int32)

In [7]:
# Column 0 of the content file, parsed as int32 (used as node ids when the graph is built).
idx=np.array(idx_features_labels[:,0],dtype=np.int32)

In [8]:
def encode_onehot(labels):
    """One-hot encode a sequence of class labels.

    The distinct labels are sorted before columns are assigned, so a given
    label always maps to the same column regardless of hash randomisation
    or of the order in which labels appear.

    Args:
        labels: 1-D iterable of hashable, mutually comparable labels.

    Returns:
        np.ndarray of dtype int32 with one row per input label and one
        column per distinct label; each row contains a single 1.
    """
    classes = sorted(set(labels))
    # Build the identity once; row i is the one-hot vector of classes[i].
    identity = np.identity(len(classes))
    classes_dict = {c: identity[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    return labels_onehot

In [9]:
# Re-read the content file and rebuild features / one-hot labels
# (repeats the loading done in the earlier cells, this time via encode_onehot).
idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset), dtype=np.dtype(str))
features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
onehot_labels = encode_onehot(idx_features_labels[:, -1])

# build graph
idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
# Map each id from column 0 to its row position in the content file.
idx_map = {j: i for i, j in enumerate(idx)}
edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset), dtype=np.int32)
# Translate every id in the .cites file to a row position, keeping the original array shape.
# An id missing from idx_map makes idx_map.get return None, which cannot be cast to int32.
edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                 dtype=np.int32).reshape(edges_unordered.shape)
# adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
#                     shape=(onehot_labels.shape[0], onehot_labels.shape[0]), dtype=np.float32)

# build symmetric adjacency matrix
# adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
# adj = convert_symmetric(adj, )

In [10]:
# Replace the encode_onehot() result from the previous cell with the array built by hand in cell 6.
onehot_labels = labels_onehot

In [11]:
# One entry of weight 1 per row of `edges`, placed at (edges[:, 0], edges[:, 1]);
# the matrix is square with one row/column per labelled node.
adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(onehot_labels.shape[0], onehot_labels.shape[0]), dtype=np.float32)

In [12]:
def convert_symmetric(X, sparse=True):
    """Add the transpose of ``X`` to ``X`` while counting the diagonal only once.

    Args:
        X: Square matrix (scipy sparse when ``sparse`` is true, otherwise a
            numpy array).
        sparse: Selects the scipy or the numpy routine used to build the
            diagonal correction term.

    Returns:
        The result of ``X += X.T - diag(X)``.
    """
    # Without this correction the diagonal would be doubled by adding X.T.
    diagonal = sp.diags(X.diagonal()) if sparse else np.diag(X.diagonal())
    X += X.T - diagonal
    return X

In [13]:
# Symmetrise the adjacency matrix. Not idempotent: convert_symmetric adds X.T to X,
# so re-running this cell on the already-symmetric result changes the values again.
adj = convert_symmetric(adj, )

In [14]:
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn.manifold import TSNE

import os
import pickle as pkl
import sys


def get_splits(y, train_per_class=20, val_size=500, test_size=1000):
    """Split one-hot labels into train / validation / test targets and masks.

    The first ``train_per_class`` rows of every class (in row order) form the
    training set. The remaining rows, in ascending index order, supply the
    validation set (first ``val_size``) and the test set (next ``test_size``).

    Args:
        y: 2-D numpy array of one-hot labels, one row per sample.
        train_per_class: Maximum number of training rows taken per class.
        val_size: Number of validation rows.
        test_size: Number of test rows.

    Returns:
        Tuple ``(y_train, y_val, y_test, train_mask, val_mask, test_mask)``.
        Each ``y_*`` is an int32 array shaped like ``y`` that is zero outside
        its split; each mask comes from ``sample_mask`` and has ``y.shape[0]``
        entries.
    """
    idx_train = []
    label_count = {}
    for i, label in enumerate(y):
        label = int(np.argmax(label))
        if label_count.get(label, 0) < train_per_class:
            idx_train.append(i)
            label_count[label] = label_count.get(label, 0) + 1

    # Keep the remaining indices in ascending order explicitly instead of
    # relying on the iteration order of a set difference.
    train_set = set(idx_train)
    idx_val_test = [i for i in range(len(y)) if i not in train_set]
    idx_val = idx_val_test[:val_size]
    idx_test = idx_val_test[val_size:val_size + test_size]

    y_train = np.zeros(y.shape, dtype=np.int32)
    y_val = np.zeros(y.shape, dtype=np.int32)
    y_test = np.zeros(y.shape, dtype=np.int32)
    y_train[idx_train] = y[idx_train]
    # Prints the number of training samples (each one-hot row sums to 1).
    print(y_train.sum())
    y_val[idx_val] = y[idx_val]
    y_test[idx_test] = y[idx_test]
    train_mask = sample_mask(idx_train, y.shape[0])
    val_mask = sample_mask(idx_val, y.shape[0])
    test_mask = sample_mask(idx_test, y.shape[0])

    return y_train, y_val, y_test,train_mask, val_mask, test_mask


def load_data_v1(dataset="cora", path="../data/cora/",):
    """Load ``<path><dataset>.content`` / ``.cites`` and build the model inputs.

    Args:
        dataset: Base name of the two data files.
        path: Directory prefix prepended verbatim to the file names.

    Returns:
        Tuple ``(adj, features, y_train, y_val, y_test, train_mask, val_mask,
        test_mask)``: the symmetrised adjacency matrix, the CSR feature
        matrix and the six values produced by ``get_splits``.
    """
    content_file = "{}{}.content".format(path, dataset)
    cites_file = "{}{}.cites".format(path, dataset)

    # Column 0: node id, last column: class label, everything between: features.
    raw_table = np.genfromtxt(content_file, dtype=np.dtype(str))
    features = sp.csr_matrix(raw_table[:, 1:-1], dtype=np.float32)
    onehot_labels = encode_onehot(raw_table[:, -1])
    n_nodes = onehot_labels.shape[0]

    # Translate the ids used in the .cites file into row positions.
    node_ids = np.array(raw_table[:, 0], dtype=np.int32)
    position_of = {node_id: row for row, node_id in enumerate(node_ids)}
    raw_edges = np.genfromtxt(cites_file, dtype=np.int32)
    edges = np.array([position_of.get(node_id) for node_id in raw_edges.flatten()],
                     dtype=np.int32).reshape(raw_edges.shape)

    # One unit-weight entry per edge row, then symmetrise.
    weights = np.ones(edges.shape[0])
    adj = sp.coo_matrix((weights, (edges[:, 0], edges[:, 1])),
                        shape=(n_nodes, n_nodes), dtype=np.float32)
    adj = convert_symmetric(adj, )

    print('Dataset has {} nodes, {} edges, {} features.'.format(adj.shape[0], edges.shape[0], features.shape[1]))

    splits = get_splits(onehot_labels)
    y_train, y_val, y_test, train_mask, val_mask, test_mask = splits

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask



def parse_index_file(filename):
    """Parse an index file containing one integer per line.

    Args:
        filename: Path of the text file to read.

    Returns:
        List of ints in file order. Blank lines are skipped.

    Raises:
        ValueError: If a non-blank line is not a valid integer.
    """
    # The context manager closes the handle even if int() raises.
    with open(filename) as handle:
        return [int(line) for line in map(str.strip, handle) if line]

def load_data(dataset_str, data_path="../data/cora/"):
    """Load a dataset stored as pickled ``ind.<dataset_str>.*`` files.

    Args:
        dataset_str: Dataset name embedded in the file names
            (``'citeseer'`` gets extra handling for missing test rows).
        data_path: Directory prefix prepended verbatim to every file name.
            Defaults to the location that used to be hard-coded here.

    Returns:
        Tuple ``(adj, features, y_train, y_val, y_test, train_mask, val_mask,
        test_mask)``. ``adj`` is a CSR matrix, ``features`` a LIL matrix, the
        ``y_*`` arrays are shaped like the full label matrix and zero outside
        their split, and the masks come from ``sample_mask``.
    """
    # The previous version first derived a directory from ``__file__`` (which is
    # undefined inside a notebook kernel and raised NameError) and then
    # overwrote it with a constant; only the constant is kept, as a default.
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("{}ind.{}.{}".format(data_path, dataset_str, name), 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file; only point data_path at files from a trusted source.
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("{}ind.{}.test.index".format(data_path, dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack the non-test and test rows, then move the test rows to the
    # positions listed in the index file.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    # Train rows are the first len(y) rows; validation is the next 500.
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return sp.csr_matrix(adj), features, y_train, y_val, y_test, train_mask, val_mask, test_mask



def sample_mask(idx, l):
    """Build a boolean mask of length ``l`` that is True at the positions in ``idx``.

    Args:
        idx: Index or sequence of indices to switch on.
        l: Length of the mask.

    Returns:
        1-D numpy array of dtype bool.
    """
    # The builtin ``bool`` replaces the ``np.bool`` alias, which was removed
    # in NumPy 1.24 and raised AttributeError there.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask


def convert_symmetric(X, sparse=True):
    """Return ``X + X.T`` with the diagonal of ``X`` counted only once.

    Args:
        X: Square matrix; a scipy sparse matrix when ``sparse`` is true,
            otherwise a numpy array.
        sparse: Chooses ``sp.diags`` or ``np.diag`` for the diagonal term.

    Returns:
        The value of ``X`` after ``X += X.T - diag(X)``.
    """
    if sparse:
        diagonal = sp.diags(X.diagonal())
    else:
        diagonal = np.diag(X.diagonal())
    # Subtracting the diagonal keeps self-entries from being doubled.
    X += X.T - diagonal
    return X


def encode_onehot(labels):
    """One-hot encode a sequence of class labels.

    Columns are assigned in sorted label order so the encoding is the same
    on every run; iterating a bare ``set`` of strings is not reproducible
    across interpreter sessions.

    Args:
        labels: 1-D iterable of hashable, mutually comparable labels.

    Returns:
        np.ndarray of dtype int32 with one row per input label and one
        column per distinct label; each row contains a single 1.
    """
    classes = sorted(set(labels))
    # A single identity matrix supplies all one-hot rows.
    identity = np.identity(len(classes))
    classes_dict = {c: identity[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    return labels_onehot


def normalize_adj(adj, symmetric=True):
    """Degree-normalise a sparse adjacency matrix.

    Args:
        adj: scipy sparse square matrix.
        symmetric: If true compute ``(adj . D^-1/2)^T . D^-1/2``; otherwise
            compute ``D^-1 . adj``. ``D`` is the diagonal matrix of row sums.

    Returns:
        The normalised matrix in CSR format. Rows whose sum is zero get a
        scaling factor of 0 instead of inf.
    """
    degree = np.array(adj.sum(1)).flatten()
    # Integer matrices cannot be raised to a negative power by NumPy
    # (ValueError), so promote them to float first; float dtypes are kept.
    if not np.issubdtype(degree.dtype, np.floating):
        degree = degree.astype(np.float64)
    exponent = -0.5 if symmetric else -1
    with np.errstate(divide='ignore'):
        scale = np.power(degree, exponent)
    # Zero-degree rows produce inf above; use 0 so they stay all-zero.
    scale[np.isinf(scale)] = 0.
    d = sp.diags(scale, 0)
    if symmetric:
        a_norm = adj.dot(d).transpose().dot(d).tocsr()
    else:
        a_norm = d.dot(adj).tocsr()
    return a_norm


def preprocess_adj(adj, symmetric=True):
    """Add self-loops to ``adj`` and degree-normalise the result.

    Args:
        adj: Square scipy sparse adjacency matrix.
        symmetric: Forwarded to ``normalize_adj``.

    Returns:
        Whatever ``normalize_adj`` returns for ``adj + I``.
    """
    with_self_loops = adj + sp.eye(adj.shape[0])
    return normalize_adj(with_self_loops, symmetric)


def plot_embeddings(embeddings, X, Y):
    """Project embeddings to 2-D with t-SNE and scatter-plot them per label.

    Args:
        embeddings: Container indexed by the elements of ``X``.
        X: Sequence of keys selecting which embeddings to plot.
        Y: Sequence aligned with ``X``; ``Y[i][:]`` is used as the group
            label of point ``i`` and must be hashable.
    """
    selected = np.array([embeddings[key] for key in X])
    node_pos = TSNE(n_components=2).fit_transform(selected)

    # Group point positions by label so each label gets one scatter call.
    groups = {}
    for position in range(len(X)):
        groups.setdefault(Y[position][:], []).append(position)

    for label, members in groups.items():
        plt.scatter(node_pos[members, 0], node_pos[members, 1], label=label)
    plt.legend()
    plt.show()


def preprocess_features(features):
    """Row-normalize a sparse feature matrix and return it in dense form.

    Args:
        features: scipy sparse matrix, one row per sample.

    Returns:
        Dense matrix (result of ``.todense()``) in which every non-empty row
        sums to 1; rows that sum to zero are left as all zeros.
    """
    rowsum = np.array(features.sum(1)).flatten()
    # NumPy refuses to raise integer arrays to a negative power (ValueError),
    # so promote integer row sums to float; float dtypes are kept as they are.
    if not np.issubdtype(rowsum.dtype, np.floating):
        rowsum = rowsum.astype(np.float64)
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1)
    # Empty rows give inf above; scale them by 0 instead.
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features.todense()


In [15]:
# Reload everything through load_data_v1(); this rebinds `adj` and `features`
# that the earlier cells built by hand.
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data_v1()

Dataset has 2708 nodes, 5429 edges, 1433 features.
140


In [16]:
# y_train
features
# adj

<2708x1433 sparse matrix of type '<class 'numpy.float32'>'
	with 3880564 stored elements in Compressed Sparse Row format>

In [17]:
# features

In [18]:
len(y_train),len(y_test),len(train_mask),len(val_mask),len(test_mask)

(2708, 2708, 2708, 2708, 2708)

In [19]:
features.shape

(2708, 1433)

In [20]:
# Divide every row by its sum. `features` displays as a CSR sparse matrix before this
# cell (cell 16) and as a dense numpy matrix afterwards (cell 34).
# NOTE(review): a row whose sum is 0 would be divided by zero here; preprocess_features()
# in this notebook guards that case - confirm whether it should be used instead.
features /= features.sum(axis=1).reshape(-1,1)

In [21]:
# features.shape

A = adj
A.shape

(2708, 2708)

In [22]:
import networkx as nx

In [23]:
# nx.from_scipy_sparse_matrix was removed in NetworkX 3.0; prefer its replacement
# from_scipy_sparse_array when it exists and fall back to the old name otherwise.
_graph_from_sparse = getattr(nx, "from_scipy_sparse_array", None) or nx.from_scipy_sparse_matrix
# Directed graph with one edge per stored entry of the adjacency matrix.
G = _graph_from_sparse(adj,create_using=nx.DiGraph())

In [24]:
A = preprocess_adj(A)

In [25]:
A.shape

(2708, 2708)

In [26]:
def sample_neighs(G, nodes, sample_num=None, self_loop=False, shuffle=True):
    """Collect, and optionally sample, the neighbours of each node.

    Args:
        G: Graph-like object where ``G[int(node)]`` iterates over the
            neighbours of ``node``.
        nodes: Sequence of node ids.
        sample_num: Number of neighbours per node. When falsy (``None`` or 0)
            all neighbours are returned unsampled.
        self_loop: If true, one sampled slot is replaced by the node itself,
            so every row still has ``sample_num`` entries.
        shuffle: If true, each sampled row is randomly permuted.

    Returns:
        Tuple ``(neighbours, lengths)``. ``neighbours`` is a 2-D array when
        every row has the same length; otherwise it is a 1-D object array
        holding one list per node. ``lengths`` holds the row lengths.

    Raises:
        ValueError: From ``np.random.choice`` when sampling is requested for
            a node that has no neighbours.
    """
    _sample = np.random.choice
    neighs = [list(G[int(node)]) for node in nodes]  # The neighbors of each node in nodes
    if sample_num:
        if self_loop:
            sample_num -= 1

        # Sample without replacement when enough neighbours exist, with
        # replacement otherwise, so every row has exactly sample_num entries.
        samp_neighs = [
            list(_sample(neigh, sample_num, replace=len(neigh) < sample_num))
            for neigh in neighs]
        if self_loop:
            # gcn neighbors have to add themselves
            samp_neighs = [
                samp_neigh + [nodes[i]] for i, samp_neigh in enumerate(samp_neighs)]

        if shuffle:
            samp_neighs = [list(np.random.permutation(x)) for x in samp_neighs]
    else:
        samp_neighs = neighs

    lengths = np.asarray(list(map(len, samp_neighs)))
    if len(set(lengths.tolist())) > 1:
        # Rows of different length: NumPy >= 1.24 raises on np.asarray(ragged),
        # so build the 1-D object array of lists explicitly.
        neigh_array = np.empty(len(samp_neighs), dtype=object)
        for i, row in enumerate(samp_neighs):
            neigh_array[i] = row
    else:
        neigh_array = np.asarray(samp_neighs)
    return neigh_array, lengths

In [27]:
# Node positions 0..N-1, one per row of A.
indexs = np.arange(A.shape[0])
# Number of neighbours to sample for each sampling round.
neigh_number = [10, 25]
neigh_maxlen = []

# model_input = [features, node indices, sampled neighbours (round 1), sampled neighbours (round 2)]
model_input = [features, np.asarray(indexs, dtype=np.int32)]
for num in neigh_number:
    # Sampling uses np.random without a fixed seed, so the neighbour arrays differ between runs.
    sample_neigh, sample_neigh_len = sample_neighs(
        G, indexs, num, self_loop=False)
    model_input.extend([sample_neigh])
    neigh_maxlen.append(max(sample_neigh_len))
neigh_maxlen

[10, 25]

In [28]:
model_input[0].shape,model_input[1].shape,model_input[2].shape

((2708, 1433), (2708,), (2708, 10))

In [29]:
neigh_maxlen

[10, 25]

In [30]:
from tensorflow.keras.initializers import glorot_uniform,zeros
from tensorflow.keras.layers import Input,Dense,Dropout,Layer,LSTM
from tensorflow.keras.regularizers import l2

In [31]:
A = adj

In [32]:
indexes = np.arange(A.shape[0])

In [33]:
# Same row normalisation as cell 20, applied a second time; rows that already sum to 1
# are unchanged up to floating-point rounding.
features/=features.sum(axis=1,).reshape(-1,1)

In [34]:
features

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [35]:
features.shape

(2708, 1433)

In [36]:
neigh_maxlen

[10, 25]

In [37]:
model_input[0]

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [38]:
model_input[1]

array([   0,    1,    2, ..., 2705, 2706, 2707])

In [39]:
model_input[2]

array([[ 435,   14,   14, ...,   14,  544,   14],
       [ 344,  344,  344, ...,  344,  344,  344],
       [ 565,  471,  471, ...,  565,  410,  552],
       ...,
       [2216, 1784, 2216, ..., 1839, 2216, 1839],
       [1752, 1752, 1640, ..., 1752, 1138, 1752],
       [ 774,  774, 1389, ..., 1389, 2344,  774]], dtype=int32)

In [40]:
model_input[3]

array([[ 544,  435,  435, ...,    8,  544,  544],
       [ 344,  344,  344, ...,  344,  344,  344],
       [ 565,  471,  552, ...,  410,  410,  565],
       ...,
       [1784, 2216, 1840, ..., 1784, 1840, 1840],
       [1752, 1138, 1046, ..., 1752, 1640, 1138],
       [2344, 2344,  774, ..., 2344, 1389, 2344]], dtype=int32)

In [41]:
model_input[0].shape

(2708, 1433)

In [42]:
A.shape

(2708, 2708)

In [43]:
import tensorflow as tf

In [44]:
from tensorflow.keras.layers import Embedding

In [45]:
emb_layer = Embedding(A.shape[0],12)

In [46]:
# NOTE(review): emb_layer was created with input_dim = A.shape[0] (the node count), which
# suggests it expects node indices (model_input[1]); model_input[0] is the float feature
# matrix - confirm which input is intended.
model_input_1 = emb_layer(model_input[0])

In [47]:
type(model_input)

list

In [48]:
type(model_input[0]),type(model_input[1]),type(model_input[2])

(numpy.matrix, numpy.ndarray, numpy.ndarray)

In [49]:
model_input[1]

array([   0,    1,    2, ..., 2705, 2706, 2707])

In [50]:
model_input

[matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([   0,    1,    2, ..., 2705, 2706, 2707]),
 array([[ 435,   14,   14, ...,   14,  544,   14],
        [ 344,  344,  344, ...,  344,  344,  344],
        [ 565,  471,  471, ...,  565,  410,  552],
        ...,
        [2216, 1784, 2216, ..., 1839, 2216, 1839],
        [1752, 1752, 1640, ..., 1752, 1138, 1752],
        [ 774,  774, 1389, ..., 1389, 2344,  774]], dtype=int32),
 array([[ 544,  435,  435, ...,    8,  544,  544],
        [ 344,  344,  344, ...,  344,  344,  344],
        [ 565,  471,  552, ...,  410,  410,  565],
        ...,
        [1784, 2216, 1840, ..., 1784, 1840, 1840],
        [1752, 1138, 1046, ..., 1752, 1640, 1138],
        [2344, 2344,  774, ..., 2344, 1389, 2344]], dtype=int32)]

In [51]:
a=np.arange(30)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [52]:
np.asarray(a,dtype=np.int32)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [53]:
from tensorflow.keras.models import Model

In [55]:
type(y_train)

numpy.ndarray

In [56]:
# NOTE(review): system-level install run from the notebook; the recorded output shows it
# failing because `sudo` is unavailable on this machine. Consider moving environment
# setup out of the notebook.
!sudo apt install nvidia-cuda-toolkit


'sudo' is not recognized as an internal or external command,
operable program or batch file.
