# Using a GNN with the Cora dataset

Tutorial and code taken from https://www.youtube.com/watch?v=8owQBFAHw7E

https://relational.fit.cvut.cz/dataset/CORA
"The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words."

In [34]:
!pip install spektral
!pip install tensorflow
!pip install numpy





In [35]:
import tensorflow as tf
import numpy as np
import spektral as spektral

In [36]:
# Load the Cora citation dataset through spektral.
cora_dataset = spektral.datasets.citation.Citation(name='cora')

# Node masks selecting the training / validation / test entries.
train_mask = cora_dataset.mask_tr
val_mask = cora_dataset.mask_va
test_mask = cora_dataset.mask_te

graph = cora_dataset.graphs[0]  # the dataset holds a single citation graph

features = graph.x  # per-document word-indicator vectors (1433 dictionary words)
adj = graph.a  # sparse adjacency matrix: a nonzero entry (i, j) marks a citation link between docs i and j
labels = graph.y  # one-hot encoding of the topics (7 topics)

# Densify with `toarray()` so we get a regular 2-D ndarray; `todense()` would
# return the legacy `np.matrix` type, which NumPy no longer recommends.
adj = adj.toarray()
# Add the identity matrix so that every node is also a neighbour of itself.
adj = adj + np.eye(adj.shape[0])

# Use float32 throughout so the arrays match the dtype of the model weights.
adj = adj.astype('float32')
features = features.astype('float32')

# Number of entries selected by each split.
print(np.sum(train_mask))
print(np.sum(val_mask))
print(np.sum(test_mask))


140
500
1000


In [37]:
def masked_softmax_cross_entropy(logits, labels, mask):
    """Return the mean softmax cross-entropy over the entries selected by `mask`.

    Args:
        logits: Unnormalized class scores (not probabilities); the softmax is
            applied inside ``tf.nn.softmax_cross_entropy_with_logits``.
        labels: Target class distribution for every entry (e.g. one-hot).
        mask: Per-entry indicator; only entries where it is nonzero/True
            contribute to the result.

    Returns:
        A scalar tensor with the average loss of the masked entries. An
        all-zero mask yields 0.0 instead of NaN.
    """
    per_entry_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    weights = tf.cast(mask, dtype=tf.float32)
    # Rescale the mask so that the mean over *all* entries below equals the
    # mean over the masked entries only. `divide_no_nan` returns 0 where the
    # denominator is 0, which guards against an empty (all-zero) mask.
    weights = tf.math.divide_no_nan(weights, tf.reduce_mean(weights))
    return tf.reduce_mean(per_entry_loss * weights)

def masked_accuracy(logits, labels, mask):
    """Return the classification accuracy over the entries selected by `mask`.

    Args:
        logits: Per-entry class scores; the predicted class is the argmax
            along axis 1.
        labels: Per-entry target scores (e.g. one-hot); the true class is the
            argmax along axis 1.
        mask: Per-entry indicator; only entries where it is nonzero/True
            contribute to the result.

    Returns:
        A scalar tensor with the fraction of masked entries whose predicted
        class matches the true class. An all-zero mask yields 0.0 instead of
        NaN.
    """
    is_correct = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy_all = tf.cast(is_correct, dtype=tf.float32)
    weights = tf.cast(mask, dtype=tf.float32)
    # Rescale the mask so that the mean over *all* entries below equals the
    # mean over the masked entries only. `divide_no_nan` returns 0 where the
    # denominator is 0, which guards against an empty (all-zero) mask.
    weights = tf.math.divide_no_nan(weights, tf.reduce_mean(weights))
    return tf.reduce_mean(accuracy_all * weights)

In [38]:
def gnn(fts, adj, transform, activation):
    """Apply one basic GNN layer: transform, aggregate over neighbours, activate.

    Args:
        fts: Node feature matrix passed to `transform`.
        adj: Adjacency matrix that is left-multiplied onto the transformed
            features to combine them across neighbourhoods.
        transform: Callable applied to the node features (each node is
            transformed individually).
        activation: Callable applied to the aggregated features.

    Returns:
        The result of ``activation(adj @ transform(fts))``.
    """
    return activation(tf.matmul(adj, transform(fts)))

In [39]:
def train_cora(fts, adj, gnn_fn, units, epochs, lr):
    """Train a two-layer GNN and print progress whenever validation accuracy improves.

    The targets and split masks are not parameters: the function reads the
    module-level names `labels`, `train_mask`, `val_mask` and `test_mask`, and
    it calls `masked_softmax_cross_entropy` and `masked_accuracy`.

    Args:
        fts: Node feature matrix fed to the first layer.
        adj: Adjacency matrix handed to `gnn_fn` for both layers.
        gnn_fn: Layer function called as ``gnn_fn(fts, adj, transform, activation)``.
        units: Width of the hidden dense layer.
        epochs: Upper bound of the epoch counter; the loop runs for
            ``ep = 0 .. epochs`` inclusive, i.e. ``epochs + 1`` update steps.
        lr: Learning rate of the Adam optimizer.

    Returns:
        None. Results are reported through `print` only.
    """
    hidden_layer = tf.keras.layers.Dense(units)
    output_layer = tf.keras.layers.Dense(7)  # 7 topics == 7 output logits

    def forward(node_fts, node_adj):
        # Hidden layer with ReLU, then a linear output layer producing logits.
        hidden = gnn_fn(node_fts, node_adj, hidden_layer, tf.nn.relu)
        return gnn_fn(hidden, node_adj, output_layer, tf.identity)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    best_accuracy = 0.0

    for ep in range(epochs + 1):
        # Record the forward pass so the loss can be differentiated.
        with tf.GradientTape() as tape:
            loss = masked_softmax_cross_entropy(forward(fts, adj), labels, train_mask)

        # The dense layers create their weights on first call inside the tape,
        # so the tape's watched variables are the trainable parameters.
        variables = tape.watched_variables()
        optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))

        # Re-run the model with the updated weights for evaluation.
        eval_logits = forward(fts, adj)
        val_accuracy = masked_accuracy(eval_logits, labels, val_mask)
        test_accuracy = masked_accuracy(eval_logits, labels, test_mask)

        # Only report epochs that set a new best validation accuracy.
        if not val_accuracy > best_accuracy:
            continue
        best_accuracy = val_accuracy
        print('Epoch', ep,
              '| Training loss:', loss.numpy(),
              '| Val acuracy:', val_accuracy.numpy(),
              '| Test accuracy:', test_accuracy.numpy())

In [40]:
# Run training with the hyperparameters spelled out by keyword.
train_cora(features, adj, gnn, units=32, epochs=200, lr=0.01)

Epoch 0 | Training loss: 5.0037503 | Val acuracy: 0.42599997 | Test accuracy: 0.43299994
Epoch 2 | Training loss: 3.3168325 | Val acuracy: 0.42799997 | Test accuracy: 0.447
Epoch 3 | Training loss: 1.7218795 | Val acuracy: 0.528 | Test accuracy: 0.54099995
Epoch 4 | Training loss: 1.4378206 | Val acuracy: 0.63 | Test accuracy: 0.6319999
Epoch 5 | Training loss: 0.84919816 | Val acuracy: 0.6779999 | Test accuracy: 0.678
Epoch 6 | Training loss: 0.56004775 | Val acuracy: 0.688 | Test accuracy: 0.6929999
Epoch 9 | Training loss: 0.35979638 | Val acuracy: 0.70399994 | Test accuracy: 0.69799995
Epoch 10 | Training loss: 0.27656454 | Val acuracy: 0.71199995 | Test accuracy: 0.70699996
Epoch 21 | Training loss: 0.048452117 | Val acuracy: 0.71799994 | Test accuracy: 0.71699995
Epoch 22 | Training loss: 0.04216026 | Val acuracy: 0.72199994 | Test accuracy: 0.71799994
Epoch 23 | Training loss: 0.037084024 | Val acuracy: 0.7239999 | Test accuracy: 0.7199999
Epoch 24 | Training loss: 0.033030905 |