In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import scipy.sparse as sp
from sklearn.metrics import f1_score

## Data Preparation

### Cora Dataset

We first test the pipeline on the smaller Cora dataset before moving on to the larger OGB dataset.

In [None]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    Args:
        sparse_mx: a scipy sparse matrix in any format; it is converted to
            COO format and cast to ``np.float32`` before the conversion.

    Returns:
        A torch sparse COO tensor with the same shape and the same stored
        entries as ``sparse_mx``.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # 2 x nnz int64 index matrix: first row holds row ids, second row column ids
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    # the stored (non-zero) values, in the same order as the indices
    values = torch.from_numpy(coo.data)
    shape = torch.Size(coo.shape)

    # torch.sparse.FloatTensor is a deprecated legacy constructor;
    # torch.sparse_coo_tensor is the supported factory function.
    return torch.sparse_coo_tensor(indices, values, shape)

In [None]:
def parse_index_file(filename):
    """Read a text file that stores one integer per line.

    Args:
        filename: path of the index file.

    Returns:
        List of the integers in file order. Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line is not a valid integer.
    """
    # The context manager closes the handle; the previous bare open() leaked it.
    with open(filename) as index_file:
        # int() tolerates surrounding whitespace, so no explicit strip is needed
        return [int(line) for line in index_file if line.strip()]

In [None]:
import pickle
import scipy.sparse as sp
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
import sys

def load_data(dataset_str, data_dir="/kaggle/input/cora-data"):
    """Load a citation dataset stored as ``ind.<dataset>.<part>`` pickle files.

    Args:
        dataset_str: dataset name used in the file names (e.g. ``'cora'``).
        data_dir: directory holding the ``ind.*`` files. Defaults to the
            Kaggle input path this notebook was written against.

    Returns:
        Tuple ``(laplacian, feature_matrix, labels, idx_train, idx_val, idx_test)``:
        the degree-normalised adjacency as a torch sparse tensor, the dense
        float feature matrix, integer class labels, and three LongTensors of
        node indices for the train / validation / test splits.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("{}/ind.{}.{}".format(data_dir, dataset_str, name), 'rb') as f:
            # NOTE: pickle.load can execute arbitrary code embedded in the file;
            # only point data_dir at dataset files from a trusted source.
            if sys.version_info > (3, 0):
                objects.append(pickle.load(f, encoding='latin1'))
            else:  # python2
                objects.append(pickle.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)

    test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(data_dir, dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    # construct labels vector
    onehot_labels = np.vstack((ally, ty))
    # The test rows are stored in shuffled order. They must be permuted exactly
    # like the feature rows below, otherwise labels[i] does not belong to node i
    # for the test nodes.
    onehot_labels[test_idx_reorder, :] = onehot_labels[test_idx_range, :]
    # argmax yields exactly one label per row; np.where(...)[1] would return a
    # vector of the wrong length if any row were not exactly one-hot.
    labels = torch.LongTensor(onehot_labels.argmax(axis=1))

    # construct feature matrix (test rows permuted back into node-id order)
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    feature_matrix = torch.FloatTensor(np.array(features.todense()))

    # construct adjacency matrix; both directions are written so it is symmetric
    num_nodes = features.shape[0]
    adj = sp.lil_matrix((num_nodes, num_nodes))
    for i in range(num_nodes):
        neighbours = graph[i]
        if len(neighbours) == 0:
            continue  # nothing to write for a node without neighbours
        adj[i, neighbours] = 1
        adj[neighbours, i] = 1

    # construct the degree-normalised adjacency (A D^-1/2)^T D^-1/2
    # NOTE(review): unlike the OGB cell, no self-loops are added here -- confirm
    # whether that difference is intended.
    rowsum = np.array(adj.sum(1)).flatten()
    with np.errstate(divide='ignore'):
        d_inv_sqrt = np.power(rowsum, -0.5)
    # zero-degree nodes would give inf and turn the product into NaN
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    degree_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    laplacian = adj.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()

    # convert scipy sparse adjacency matrix to torch sparse tensor
    laplacian = sparse_mx_to_torch_sparse_tensor(laplacian)

    # construct train, validation, test split:
    # first len(y) nodes for training, the following 500 for validation,
    # the indices listed in the test index file for testing
    n_train = len(y)
    idx_train = torch.arange(n_train, dtype=torch.long)
    idx_val = torch.arange(n_train, n_train + 500, dtype=torch.long)
    idx_test = torch.LongTensor(test_idx_range.tolist())

    return laplacian, feature_matrix, labels, idx_train, idx_val, idx_test


# Load the 'cora' dataset. These notebook-level names are read by later cells
# (the samplers' batch helpers read `laplacian` as a global); note that the OGB
# cells further down assign the same names again.
laplacian, feature_matrix, labels, idx_train, idx_val, idx_test = load_data('cora')

## OGB Dataset

In [None]:
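# Install PyTorch Geometric and its compiled extensions (torch-scatter, torch-sparse) from the wheel index matching the installed torch version; this combination installed successfully.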
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [None]:
!pip install ogb

In [None]:
from ogb.nodeproppred import PygNodePropPredDataset

In [None]:
# Load the 'ogbn-arxiv' node-property-prediction dataset through the OGB loader.
dataset = PygNodePropPredDataset(name='ogbn-arxiv')

# Get features and labels from the first (index 0) graph object of the dataset.
# NOTE: `labels` replaces the value assigned by the Cora cell above.

data = dataset[0]
features = data.x
# squeeze(1) drops dimension 1 -- presumably data.y is (num_nodes, 1); TODO confirm
labels = data.y.squeeze(1)

In [None]:
# Basic size bookkeeping for the loaded graph.
num_nodes = data.num_nodes
# edge_index has one column per stored edge, so its second dimension is the edge count
num_edges = data.edge_index.shape[1]
# width of the node feature matrix
num_features = features.shape[1]

In [None]:


# create adjacency matrix as a scipy sparse matrix: one entry of value 1 per
# column of edge_index, at (edge_index[0], edge_index[1])
# NOTE(review): edges are used exactly as stored; no reverse edges are added
# here, so `adj` is only symmetric if edge_index already is -- confirm.
adj = sp.coo_matrix((np.ones(data.edge_index.shape[1]), (data.edge_index[0], data.edge_index[1])), shape=(data.num_nodes, data.num_nodes), dtype=np.float32)

# add self-loop edges, then normalize with the row sums: (A D^-1/2)^T D^-1/2
adj = adj + sp.eye(data.num_nodes)
rowsum = np.array(adj.sum(1))
degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
laplacian = adj.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()

# convert scipy sparse adjacency matrix to torch sparse tensor
# (this replaces the `laplacian` produced by the Cora cell)
laplacian = sparse_mx_to_torch_sparse_tensor(laplacian)

# features
num_features = features.shape[1]
feature_matrix = features # the features are already in the form of a tensor

# labels are already in the form of a long tensor with integer encoded labels
# (self-assignment: this line has no effect)
labels = labels

# get index splits provided by the dataset object
idx_split = dataset.get_idx_split()
idx_train = idx_split['train']
idx_val = idx_split['valid']
idx_test = idx_split['test']

# NOTE(review): earlier cells read num_nodes from `data`, this line reads it
# from `dataset` -- confirm the dataset object exposes `num_nodes`.
num_nodes = dataset.num_nodes

# get boolean masks over all nodes, True at the indices of each split
# (not read by any other cell visible in this notebook)
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[idx_train] = True
val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
val_mask[idx_val] = True
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask[idx_test] = True


## Sampler

In [None]:
def ladies_sampling(laplacian, n_sample, batch_nodes, layers):
    """Layer-dependent importance sampling of per-layer mini-batch Laplacians.

    Starting from the batch nodes, each iteration selects the Laplacian rows
    of the current nodes, derives a sampling distribution from the squared
    column norms of those rows, samples the nodes of the layer below and
    stores the re-weighted sub-matrix connecting the two node sets.

    Args:
        laplacian: (n_nodes, n_nodes) torch sparse tensor.
        n_sample: maximum number of nodes to sample per layer.
        batch_nodes: indices of the output (batch) nodes; a tensor, array or list.
        layers: number of layers to sample for.

    Returns:
        Tuple ``(modified_laplacians, sampled_nodes, batch_nodes)``:
        ``modified_laplacians`` is a list of dense tensors ordered from the
        input layer to the output layer, ``sampled_nodes`` is the numpy array
        of node ids sampled for the input layer (``batch_nodes`` itself when
        ``layers`` is 0), and ``batch_nodes`` is returned unchanged.

    Raises:
        ValueError: if the selected Laplacian rows are all zero (for example
            an empty batch), so no sampling distribution exists.
    """
    n_nodes = laplacian.shape[0]

    # nodes whose outputs the current layer has to produce
    upper_nodes = torch.as_tensor(batch_nodes, dtype=torch.long)
    sampled_nodes = batch_nodes
    modified_laplacians = []

    for _ in range(layers):
        n_upper = len(upper_nodes)

        # row selection matrix Q: row k is the one-hot vector of upper_nodes[k]
        Q = torch.zeros(n_upper, n_nodes, dtype=laplacian.dtype)
        Q[torch.arange(n_upper), upper_nodes] = 1

        # (L Q^T)^T: one dense row per upper node
        layer_laplacian = torch.sparse.mm(laplacian, Q.t()).t()

        # sampling probability of a node is proportional to the squared norm of
        # its column; computed in float64 and renormalised because
        # np.random.choice checks that the probabilities sum to 1
        col_sq_norms = (layer_laplacian ** 2).sum(dim=0).detach().cpu().double().numpy()
        total = col_sq_norms.sum()
        if not total > 0:
            raise ValueError("cannot sample: the selected Laplacian rows are all zero")
        probs = col_sq_norms / total
        probs = probs / probs.sum()

        # never ask for more draws than there are candidate nodes
        s_l = min(n_sample, int(np.count_nonzero(probs)))

        # sample the nodes of the layer below (with replacement)
        sampled_nodes = np.random.choice(n_nodes, size=s_l, replace=True, p=probs)
        lower_nodes = torch.from_numpy(sampled_nodes).long()

        # Importance weights 1 / (s_l * p_i): dividing by the sampling
        # probability is what makes the sampled product an unbiased estimate of
        # the full one (multiplying by p_i, as before, shrinks it instead).
        weights = torch.from_numpy(1.0 / (s_l * probs[sampled_nodes])).to(layer_laplacian.dtype)

        # picking columns by index equals multiplying with the diagonal weight
        # matrix and the column-selection matrix, without building either
        modified_laplacians.append(layer_laplacian[:, lower_nodes] * weights)

        upper_nodes = lower_nodes

    # sampling ran from the output layer down; the model consumes input-first
    modified_laplacians.reverse()

    return modified_laplacians, sampled_nodes, batch_nodes


In [None]:
def fastgcn_sampler(laplacian, n_sample, batch_nodes, layers):
    """Layer-independent importance sampling of per-layer mini-batch Laplacians.

    Every layer samples from the same distribution, which is proportional to
    the squared column norms of the full Laplacian.

    Args:
        laplacian: (n_nodes, n_nodes) torch sparse COO tensor (a dense tensor
            is also accepted when computing the sampling distribution).
        n_sample: maximum number of nodes to sample per layer.
        batch_nodes: indices of the output (batch) nodes; a tensor, array or list.
        layers: number of layers to sample for.

    Returns:
        Tuple ``(modified_laplacians, sampled_nodes, batch_nodes)``:
        ``modified_laplacians`` is a list of dense tensors ordered from the
        input layer to the output layer, ``sampled_nodes`` is the numpy array
        of node ids sampled for the input layer (``batch_nodes`` itself when
        ``layers`` is 0), and ``batch_nodes`` is returned unchanged.

    Raises:
        ValueError: if the Laplacian has no non-zero entry.
    """
    n_nodes = laplacian.shape[0]

    # Squared column norms of the full Laplacian. For a sparse tensor they are
    # accumulated from the stored values, so no sparse reduction / comparison
    # operators are needed.
    if laplacian.is_sparse:
        coo = laplacian.coalesce()
        col_sq_norms = torch.zeros(n_nodes, dtype=coo.values().dtype, device=coo.values().device)
        col_sq_norms.index_add_(0, coo.indices()[1], coo.values() ** 2)
    else:
        col_sq_norms = (laplacian ** 2).sum(dim=0)
    col_sq_norms = col_sq_norms.detach().cpu().double().numpy()

    total = col_sq_norms.sum()
    if not total > 0:
        raise ValueError("cannot sample: the Laplacian has no non-zero entries")
    # float64 and renormalised because np.random.choice checks the sum
    p = col_sq_norms / total
    p = p / p.sum()

    # never ask for more draws than there are candidate nodes
    s_l = min(n_sample, int(np.count_nonzero(p)))

    # nodes whose outputs the current layer has to produce
    upper_nodes = torch.as_tensor(batch_nodes, dtype=torch.long)
    sampled_nodes = batch_nodes
    modified_laplacians = []

    # top-down sampling using probabilities
    for _ in range(layers):
        n_upper = len(upper_nodes)

        # row selection matrix Q: row k is the one-hot vector of upper_nodes[k]
        Q = torch.zeros(n_upper, n_nodes, dtype=laplacian.dtype)
        Q[torch.arange(n_upper), upper_nodes] = 1

        # (L Q^T)^T: one dense row per upper node
        layer_laplacian = torch.sparse.mm(laplacian, Q.t()).t()

        # sample the nodes of the layer below (with replacement)
        sampled_nodes = np.random.choice(n_nodes, size=s_l, replace=True, p=p)
        lower_nodes = torch.from_numpy(sampled_nodes).long()

        # Importance weights 1 / (s_l * p_i): dividing by the sampling
        # probability is what makes the sampled product an unbiased estimate of
        # the full one (multiplying by p_i, as before, shrinks it instead).
        weights = torch.from_numpy(1.0 / (s_l * p[sampled_nodes])).to(layer_laplacian.dtype)

        # column indexing replaces the products with the diagonal weight matrix
        # and the column-selection matrix
        modified_laplacians.append(layer_laplacian[:, lower_nodes] * weights)

        upper_nodes = lower_nodes

    # sampling ran from the output layer down; the model consumes input-first
    modified_laplacians.reverse()

    return modified_laplacians, sampled_nodes, batch_nodes
  

## GCN Model

In [None]:
class GraphConvolution(nn.Module):
    """One graph-convolution layer: ``adj @ Linear(input)``, optionally followed by ReLU.

    Args:
        in_features: width of the input node features.
        out_features: width of the output node features.
        bias: whether the linear map has a bias term.
        activation: apply ReLU to the result (default, as before). Pass
            ``False`` for a layer that must emit raw scores.
    """

    def __init__(self, in_features, out_features, bias=True, activation=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.activation = activation
        self.weight = nn.Linear(in_features, out_features, bias=bias)

    def forward(self, input, adj):
        """Apply the linear map to ``input`` and propagate the result with ``adj``.

        ``adj`` may be a sparse tensor (full-graph Laplacian) or a dense one
        (sampled mini-batch Laplacian).
        """
        output = self.weight(input)
        if adj.layout == torch.strided:
            # dense adjacency: plain matrix product
            output = torch.mm(adj, output)
        else:
            output = torch.sparse.mm(adj, output)
        if self.activation:
            output = F.relu(output)

        return output


class GCN(nn.Module):
    """Stack of graph-convolution layers that returns log-probabilities.

    Args:
        n_features: width of the input node features.
        n_hidden: width of the hidden layers.
        n_layers: number of graph-convolution layers (at least 1).
        dropout: dropout probability applied after every hidden layer.
        n_classes: width of the output layer. ``None`` (default) keeps the
            previous behaviour of an ``n_hidden``-wide output.

    Raises:
        ValueError: if ``n_layers`` is smaller than 1.
    """

    def __init__(self, n_features, n_hidden, n_layers, dropout, n_classes=None):
        super(GCN, self).__init__()
        if n_layers < 1:
            raise ValueError("n_layers must be at least 1")
        self.n_features = n_features
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.n_classes = n_hidden if n_classes is None else n_classes

        # layer widths: input -> hidden (n_layers - 1 times) -> output
        widths = [n_features] + [n_hidden] * (n_layers - 1) + [self.n_classes]
        # The last layer emits the scores fed to log_softmax, so it gets no
        # ReLU: clamping the scores at zero would restrict what it can express.
        self.gcn = nn.ModuleList([
            GraphConvolution(widths[i], widths[i + 1], activation=(i < n_layers - 1))
            for i in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, laplacians):
        """Run the network.

        Args:
            x: input feature matrix, one row per input node.
            laplacians: list with one (modified) Laplacian per layer, ordered
                from the input layer to the output layer.

        Returns:
            Row-wise log-softmax of the last layer's output.
        """
        last = self.n_layers - 1
        for i in range(self.n_layers):
            x = self.gcn[i](x, laplacians[i])
            # dropout regularises hidden activations only; applied to the final
            # scores it would randomly zero class scores during training
            if i < last:
                x = self.dropout(x)

        return torch.log_softmax(x, dim=1)

In [None]:
# Two-layer GCN with 256 hidden units and dropout 0.5; the input width is read
# from the feature matrix loaded above.
# NOTE(review): with these arguments the model's output width is n_hidden
# (256), not the number of classes -- confirm this is intended.
model = GCN(n_features=feature_matrix.shape[1], n_hidden=256, n_layers=2, dropout=0.5)

## Training

In [None]:
# Adam optimizer over all parameters of `model`, learning rate 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# accuracy function

def accuracy(output, labels):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        output: model scores, one row per node and one column per class.
        labels: integer class label of every node.

    Returns:
        0-dim float64 tensor: number of correct predictions / ``len(labels)``.
    """
    # index of the best class per row, converted to the labels' tensor type
    predicted = output.max(dim=1).indices.type_as(labels)
    n_correct = (predicted == labels).double().sum()
    return n_correct / len(labels)

In [None]:
# sequential sampling

def prepare_batches(num_batches, idx_train, n_sample, layers, batch_size):
    """Sample the mini-batches of one epoch one after another.

    The training indices are shuffled, cut into consecutive slices of
    ``batch_size`` nodes, and ``ladies_sampling`` is run for every slice on
    the notebook-level ``laplacian``.

    Args:
        num_batches: number of batches to build.
        idx_train: tensor of training node indices.
        n_sample: nodes sampled per layer (forwarded to the sampler).
        layers: number of layers to sample for (forwarded to the sampler).
        batch_size: number of batch nodes per slice.

    Returns:
        List with one ``(modified_laplacians, sampled_nodes, batch_nodes)``
        tuple per batch.
    """
    # shuffle training nodes
    shuffled = idx_train[torch.randperm(len(idx_train))]

    batches = []
    for i in range(num_batches):
        # the per-batch debug print ('working') was removed: it wrote one line
        # per batch on every epoch and the parallel variant never printed it
        batch_nodes = shuffled[i * batch_size:(i + 1) * batch_size]
        batches.append(ladies_sampling(laplacian, n_sample, batch_nodes, layers))
    return batches


In [None]:
# parallel sampling

from concurrent.futures import ThreadPoolExecutor

def prepare_batches_parallel(num_batches, idx_train, n_sample, layers, batch_size):
    """Sample the mini-batches of one epoch on a thread pool.

    The training indices are shuffled, cut into consecutive slices of
    ``batch_size`` nodes, and ``ladies_sampling`` is run for every slice on
    the notebook-level ``laplacian``.

    Returns:
        List with one sampler result per batch, in batch order.
    """
    shuffled = idx_train[torch.randperm(len(idx_train))]

    def sample_batch(batch_no):
        # slice of the shuffled indices that belongs to this batch
        first = batch_no * batch_size
        return ladies_sampling(laplacian, n_sample, shuffled[first:first + batch_size], layers)

    with ThreadPoolExecutor() as pool:
        # map() yields the results in submission order
        return list(pool.map(sample_batch, range(num_batches)))

In [None]:
# Show whether PyTorch can see a CUDA device; the training cell below selects
# its device with the same check.
torch.cuda.is_available()

In [None]:
## training with LADIES mini-batches (sampled in parallel by prepare_batches_parallel)
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

batch_size = 512
n_sample = 512               # nodes sampled per layer
n_layers = model.n_layers    # the sampler must produce one Laplacian per GCN layer
# ceiling division, so a final partial batch is included
num_batches = (len(idx_train) + batch_size - 1) // batch_size

num_epochs = 10

best_f1 = 0.0

# The full-graph inputs used for validation never change: move them to the
# device once. The notebook-level `laplacian` itself stays where it is because
# the sampler reads it.
feature_matrix = feature_matrix.to(device)
fullbatch_laplacians = [laplacian.to(device)] * n_layers

# range(1, num_epochs) would stop one epoch short
for epoch in range(1, num_epochs + 1):

    print('Epoch: {}'.format(epoch))

    start_time = time.time()

    batches = prepare_batches_parallel(num_batches, idx_train, n_sample=n_sample, layers=n_layers, batch_size=batch_size)

    # set model to training mode
    model.train()

    train_loss = []
    train_acc = []

    # iterate over training batches
    for sampled_laplacians, sampled_nodes, batch_nodes in batches:

        optimizer.zero_grad()

        # features of the nodes sampled for the input layer
        feature_matrix_sampled = feature_matrix[sampled_nodes].to(device)
        sampled_laplacians = [lap.to(device) for lap in sampled_laplacians]

        # forward pass
        output = model(feature_matrix_sampled, sampled_laplacians)

        batch_labels = labels[batch_nodes]
        # the model already returns log-probabilities; cross_entropy applies
        # log_softmax again, which leaves log-probabilities unchanged
        loss = F.cross_entropy(output, batch_labels.to(device))

        # compute gradients
        loss.backward()

        # update parameters
        optimizer.step()

        # compute accuracy (no gradient needed)
        acc = accuracy(output.detach(), batch_labels)

        # append loss and accuracy
        train_loss.append(loss.item())
        train_acc.append(acc.item())

    # the reported time covers sampling and training, not validation
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('Elapsed time: {:.2f} seconds'.format(elapsed_time))

    print('Train loss: {:.4f}'.format(np.mean(train_loss)))

    print('Train accuracy: {:.4f}'.format(np.mean(train_acc)))

    # set model to evaluation mode
    model.eval()

    # full batch evaluation; no_grad avoids building an autograd graph for the
    # whole graph
    with torch.no_grad():
        output = model(feature_matrix, fullbatch_laplacians)
        val_output = output[idx_val]
        val_labels = labels[idx_val]
        val_loss = F.cross_entropy(val_output, val_labels.to(device)).item()
        val_acc = accuracy(val_output, val_labels).item()

    # f1_score expects (y_true, y_pred)
    valid_f1 = f1_score(val_labels.cpu().numpy(), val_output.argmax(dim=1).cpu().numpy(), average='micro')
    print(valid_f1, 'f1-score')

    # save best model
    if valid_f1 > best_f1:
        best_f1 = valid_f1
        torch.save(model.state_dict(), 'best_model.pth')

    print('Validation loss: {:.4f}'.format(val_loss))
    print('Validation accuracy: {:.4f}'.format(val_acc))

In [None]:
# Test the best model on the idx_test set

# map_location lets a checkpoint saved on one device load on the current one
model.load_state_dict(torch.load('best_model.pth', map_location=device))

model.eval()

feature_matrix = feature_matrix.to(device)

# one full-graph Laplacian per GCN layer
fullbatch_laplacians = [laplacian.to(device)] * model.n_layers

# evaluation is done without sampling and without building an autograd graph
with torch.no_grad():
    output = model(feature_matrix, fullbatch_laplacians)

# keep the predictions as integer class ids (no float cast needed)
predicted_labels = torch.argmax(output, dim=1)

# compute f1 score; f1_score expects (y_true, y_pred)
test_f1 = f1_score(labels[idx_test].cpu().numpy(), predicted_labels[idx_test].cpu().numpy(), average='micro')

print('Test f1 score: {:.4f}'.format(test_f1))

