### This notebook contains my personal learning notes, based on the graph prediction tutorial (custom_dataset.py) from [Spektral](https://github.com/danielegrattarola/spektral/blob/master/examples/graph_prediction/custom_dataset.py)

In [None]:
"""
My main objective is to learn how to prepare my own graph dataset and load it for a GNN.
"""

In [None]:
"""
This example shows how to define your own dataset and use it to train a
non-trivial GNN with message-passing and pooling layers.
The script also shows how to implement fast training and evaluation functions
in disjoint mode, with early stopping and accuracy monitoring.

The dataset that we create is a simple synthetic task in which we have random
graphs with randomly-colored nodes. The goal is to classify each graph with the
color that occurs the most on its nodes. For example, given a graph with 2
colors and 3 nodes:

x = [[1, 0],
     [1, 0],
     [0, 1]],

the corresponding target will be [1, 0].
"""

In [9]:
!pip install spektral

In [10]:
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Spektral building blocks: dataset/loader containers, graph layers, and the
# adjacency-normalization transform.
from spektral.data import Dataset, DisjointLoader, Graph
from spektral.layers import GCNConv, GCSConv, GlobalAvgPool
from spektral.transforms.normalize_adj import NormalizeAdj

In [24]:

################################################################################
# Config
################################################################################
# Hyperparameters shared by the optimizer, the data loaders and the training loop.
learning_rate = 0.01  # step size handed to the Adam optimizer
epochs = 400  # epoch budget given to the training loader
es_patience = 10  # epochs without a better validation loss before stopping
batch_size = 32  # batch size used by every data loader


##### create custom dataset

In [11]:
################################################################################
# Load data
################################################################################
class MyDataset(Dataset):
    """
    A dataset of random colored graphs.

    The task is to classify each graph with the color that occurs the most in
    its nodes. Every graph has between `n_min` and `n_max` nodes (both
    inclusive) and every node carries one of `n_colors` colors as a one-hot
    feature vector. Each entry of the adjacency matrix is first sampled with
    probability `p`, and the matrix is then made symmetric.

    Note for my own data: the node features and edges are already known there,
    so `n_min`, `n_max` and `p` would not be needed in that case.

    Arguments:
        n_samples: number of graphs to generate.
        n_colors: number of distinct colors (node feature size and label size).
        n_min: minimum number of nodes in a graph.
        n_max: maximum number of nodes in a graph (inclusive).
        p: probability used to sample each adjacency entry.
        **kwargs: forwarded to the parent `Dataset` constructor
            (e.g. `transforms`).
    """

    def __init__(self, n_samples, n_colors=3, n_min=10, n_max=100, p=0.1, **kwargs):
        self.n_samples = n_samples
        self.n_colors = n_colors
        self.n_min = n_min
        self.n_max = n_max
        self.p = p
        # This call must not be removed: per the Spektral docs the parent
        # `Dataset` constructor is what loads the graphs through `read()`,
        # which is also why the attributes above are assigned first.
        super().__init__(**kwargs)

    def read(self):
        """Generate and return a list of `n_samples` random `Graph` objects."""

        def make_graph():
            # Pick a random number of nodes. `np.random.randint` excludes its
            # upper bound, so `n_max + 1` is needed for graphs to be able to
            # reach the documented maximum of `n_max` nodes.
            n = np.random.randint(self.n_min, self.n_max + 1)
            colors = np.random.randint(0, self.n_colors, size=n)

            # Node features: one row per node, one column per color.
            x = np.zeros((n, self.n_colors))
            # `np.arange(n)` is the array 0..n-1, so this sets, for every
            # node, the column of its color to 1 (one-hot encoding).
            x[np.arange(n), colors] = 1

            # Edges: draw uniform values in [0, 1) and threshold them with `p`
            # to obtain a boolean array (each element is True or False).
            a = np.random.rand(n, n) <= self.p
            # An adjacency matrix of an undirected graph is symmetric: taking
            # the element-wise maximum of `a` and its transpose mirrors every
            # edge across the diagonal. `astype(int)` maps True/False to 1/0.
            a = np.maximum(a, a.T).astype(int)
            # Compressed Sparse Row (CSR) format is designed for highly sparse
            # matrices: it saves memory and computation time by storing only
            # the non-zero values and their indices.
            a = sp.csr_matrix(a)

            # Labels: one-hot vector marking the most frequent color.
            y = np.zeros((self.n_colors,))
            # Column sums of `x` count how many nodes have each color.
            color_counts = x.sum(0)
            y[np.argmax(color_counts)] = 1

            return Graph(x=x, a=a, y=y)

        # We must return a list of Graph objects: the comprehension calls
        # `make_graph()` `n_samples` times and collects the generated graphs.
        return [make_graph() for _ in range(self.n_samples)]

##### detailed test for some code above

In [None]:
# NOTE: `n` is only assigned inside make_graph() above, so it has to be defined
# in the session (e.g. n = 35) before this scratch cell can run.
np.arange(n) # evenly spaced integers 0, 1, ..., n-1 (the stop value n is excluded)
# variation 1: np.arange(1, 5) -> array([1, 2, 3, 4])
# variation 2: np.arange(1, 10, 2) -> array([1, 3, 5, 7, 9])

In [None]:
# Boolean n-by-n mask: an entry is True when its uniform [0, 1) sample is <= 0.1.
a= np.random.rand(n, n) <= 0.1
a


In [None]:
# Symmetrize the mask (an entry is set if it or its mirrored entry was set),
# then cast True/False to 1/0.
a= np.maximum(a, a.T).astype(int)
a

In [42]:
# Convert the dense adjacency matrix to Compressed Sparse Row (CSR) format.
sp.csr_matrix(a)

<35x35 sparse matrix of type '<class 'numpy.int64'>'
	with 218 stored elements in Compressed Sparse Row format>

In [8]:
# Summing over axis 0 adds the rows together, giving one total per column.
x = np.array([[2, 3], [1, 1]])
x.sum(axis=0)

array([3, 4])

In [7]:
# Rebuild the label step from make_graph() on the small `x` defined in the
# column-sum cell: start from an all-zero vector, then set the position of
# the largest column total to 1.
y = np.zeros(3)
color_counts = x.sum(axis=0)
y[color_counts.argmax()] = 1
y

array([0., 1., 0.])

#### load the designed dataset 

In [18]:
# Instantiate MyDataset to generate 1000 random graphs.
# NormalizeAdj() is a Spektral transform handed to the dataset through the
# `transforms` argument; per the Spektral docs it normalizes each adjacency
# matrix as D^-1/2 A D^-1/2 (TODO confirm for the installed version).
data = MyDataset(1000,transforms=NormalizeAdj())

In [22]:
# Integer indexing returns a single Graph; the cell output is its summary.
data[0]

Graph(n_nodes=82, n_node_features=3, n_edge_features=None, n_labels=3)

In [None]:
# Alternative to passing `transforms=` to the constructor: build the raw
# dataset first, then normalize the adjacency matrix of every graph afterwards.
# Spektral's documented method for this is `Dataset.apply(transform)`, which
# replaces the graphs of the dataset in place; `data[1]` is a single Graph and
# offers no `apply_transform` method.
data1 = MyDataset(1000)
data1.apply(NormalizeAdj())
normalized_dataset = data1

In [17]:
"""
The variable `data` behaves like a list: to check or retrieve any graph from it, index it as you would index a list.
We can also take a different approach and use an array of indices to retrieve several graphs at once.

We do not need the NormalizeAdj() transform if our dataset's adjacency matrix is not binary.
"""

Graph(n_nodes=48, n_node_features=3, n_edge_features=None, n_labels=3)

In [25]:
# Train/valid/test split: shuffle the graph indices, then cut them at 80% and
# 90% to obtain the training, validation and test subsets.
n_graphs = len(data)
idxs = np.random.permutation(n_graphs)
split_va = int(0.8 * n_graphs)
split_te = int(0.9 * n_graphs)
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
data_tr, data_va, data_te = data[idx_tr], data[idx_va], data[idx_te]

# Data loaders: only the training loader receives the `epochs` budget; the
# validation and test loaders are created with the batch size alone.
loader_tr = DisjointLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va, loader_te = (
    DisjointLoader(subset, batch_size=batch_size) for subset in (data_va, data_te)
)


In [None]:
# Layers for the model cell (GCNConv is imported but not used in the cells shown here).
from spektral.layers import GCNConv, GCSConv, GlobalAvgPool

In [None]:
################################################################################
# Build model
################################################################################
class Net(Model):
    """Graph classifier: GCSConv layers, global average pooling, softmax output.

    `call` expects `inputs` to unpack into `(x, a, i)`: node features,
    adjacency matrix, and (presumably, in disjoint mode) the index that maps
    each node to its graph.
    """

    def __init__(self):
        super().__init__()
        # Message-passing layers. Only conv1 and conv2 are applied in call();
        # conv3 is created but currently left out of the forward pass.
        self.conv1 = GCSConv(32, activation="relu")
        self.conv2 = GCSConv(32, activation="relu")
        self.conv3 = GCSConv(32, activation="relu")
        # Readout that combines the node representations after message passing.
        self.global_pool = GlobalAvgPool()
        # Output layer with one softmax unit per label of the dataset.
        self.dense = Dense(data.n_labels, activation="softmax")

    def call(self, inputs):
        x, a, i = inputs
        # Two rounds of message passing over the same adjacency matrix.
        for conv in (self.conv1, self.conv2):
            x = conv([x, a])
        pooled = self.global_pool([x, i])
        return self.dense(pooled)

# Instantiate the network together with the optimizer and the loss function
# that train_step() and evaluate() read as globals.
model = Net()
optimizer = Adam(learning_rate=learning_rate) # optimizer
loss_fn = CategoricalCrossentropy() # loss function

In [None]:


################################################################################
# Fit model
################################################################################
@tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True)
def train_step(inputs, target):
    """Run one optimization step on a batch.

    Arguments:
        inputs: model inputs of the batch, as produced by the training loader.
        target: one-hot targets of the batch.

    Returns:
        A `(loss, acc)` pair: the training loss and the mean categorical
        accuracy on this batch.
    """
    with tf.GradientTape() as tape:
        preds = model(inputs, training=True)
        # Data loss plus any additional losses collected in `model.losses`.
        loss = loss_fn(target, preds) + sum(model.losses)
    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
    acc = tf.reduce_mean(categorical_accuracy(target, preds))
    return loss, acc


def evaluate(loader):
    """Compute the average loss and accuracy over one epoch of `loader`.

    Pulls `loader.steps_per_epoch` batches from the loader, evaluates the
    global `model` on each of them, and averages the per-batch loss and
    accuracy weighted by the batch size.

    Returns:
        An array `[loss, accuracy]`, or None when the loader has no steps.
    """
    records = []
    for _ in range(loader.steps_per_epoch):
        inputs, target = next(loader)
        pred = model(inputs, training=False)
        batch_loss = loss_fn(target, pred)
        batch_acc = tf.reduce_mean(categorical_accuracy(target, pred))
        # The last column keeps track of the batch size for the weighting.
        records.append((batch_loss, batch_acc, len(target)))

    if not records:
        return None

    records = np.array(records)
    metrics, batch_sizes = records[:, :-1], records[:, -1]
    return np.average(metrics, 0, weights=batch_sizes)


# Training loop with early stopping: batches are consumed one at a time and the
# validation metrics are computed whenever a full epoch of steps has been run.
epoch = 0
step = 0
best_val_loss = np.inf
best_weights = None
patience = es_patience
results = []
for batch in loader_tr:
    step += 1
    loss, acc = train_step(*batch)
    results.append((loss, acc))
    if step != loader_tr.steps_per_epoch:
        continue  # still in the middle of the current epoch

    # End of an epoch: restart the step counter and report the metrics.
    step = 0
    epoch += 1

    # Compute validation loss and accuracy
    val_loss, val_acc = evaluate(loader_va)
    train_loss, train_acc = np.mean(results, 0)
    print(
        "Ep. {} - Loss: {:.3f} - Acc: {:.3f} - Val loss: {:.3f} - Val acc: {:.3f}".format(
            epoch, train_loss, train_acc, val_loss, val_acc
        )
    )

    # Early stopping: spend patience while the validation loss does not
    # improve, and remember the weights of the best epoch seen so far.
    improved = val_loss < best_val_loss
    if not improved:
        patience -= 1
        if patience == 0:
            print("Early stopping (best val_loss: {})".format(best_val_loss))
            break
    else:
        best_val_loss = val_loss
        patience = es_patience
        print("New best val_loss {:.3f}".format(val_loss))
        best_weights = model.get_weights()
    results = []

################################################################################
# Evaluate model
################################################################################
# Restore the weights saved at the best validation loss, then score the test split.
# NOTE(review): best_weights is still None if the training loop never recorded
# an improving epoch, in which case set_weights() cannot succeed.
model.set_weights(best_weights)  # Load best model
test_loss, test_acc = evaluate(loader_te)
print("Done. Test loss: {:.4f}. Test acc: {:.2f}".format(test_loss, test_acc))