In [None]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [93]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.transforms import Compose
from torch_geometric.datasets import Amazon
from torch_geometric.transforms.random_node_split import RandomNodeSplit
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv
from sklearn.metrics import roc_auc_score

from torch_geometric.utils import negative_sampling
from torch_geometric.utils import train_test_split_edges

from copy import deepcopy
import torch.nn as nn
from IPython.display import Javascript  # Restrict height of output cell.

In [94]:
dataset_name='Flickr'
#dataset_name='Amazon'
#dataset_name='Cora'
#dataset_name='Citeseer'
#dataset_name='Pubmed'


seeds = [1234567, 12345]

In [95]:
from torch_geometric.datasets import Planetoid, Flickr, Amazon
from torch_geometric.transforms import NormalizeFeatures


if dataset_name=='Flickr':
    transform = Compose([
        #NormalizeFeatures(),
        RandomNodeSplit('train_rest',num_val = 2000, num_test = 10000)
    ])
    dataset = Flickr(root='data/Flickr', \
                     transform =transform)
elif dataset_name=='Amazon':
    transform = Compose([
        #NormalizeFeatures(),
        RandomNodeSplit('train_rest',num_val = 1000, num_test = 3000)
    ])
    dataset = Amazon(root='data/Amazon', name='Computers', \
                     transform =transform)

elif dataset_name in ['Cora', 'Citeseer', 'Pubmed']:
    # For Planetoid datasets, the standard split is already defined
    dataset = Planetoid(root=f'data/{dataset_name}', name=dataset_name)

else:
    raise ValueError(f"Unknown dataset: {dataset_name}")

print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Downloading https://docs.google.com/uc?export=download&id=1crmsTbd1-2sEXsGwa2IKnIB7Zd3TmUsy&confirm=t
Downloading https://docs.google.com/uc?export=download&id=1join-XdvX3anJU_MLVtick7MgeAQiWIZ&confirm=t
Downloading https://docs.google.com/uc?export=download&id=1uxIkbtg5drHTsKt-PAsZZ4_yJmgFmle9&confirm=t
Downloading https://docs.google.com/uc?export=download&id=1htXCtuktuCW8TR8KiKfrFDAxUgekQoV7&confirm=t
Processing...
Done!



Dataset: Flickr():
Number of graphs: 1
Number of features: 500
Number of classes: 7

Data(x=[89250, 500], edge_index=[2, 899756], y=[89250], train_mask=[89250], val_mask=[89250], test_mask=[89250])
Number of nodes: 89250
Number of edges: 899756
Average node degree: 10.08
Number of training nodes: 77250
Training node label rate: 0.87
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [96]:
from torch_geometric.nn import GATConv


class GAT(torch.nn.Module):
    def __init__(self, seed, hidden_channels, out_channels, num_layers, heads=8, dropout=0.3):
        super().__init__()
        torch.manual_seed(seed)
        self.feature_vals = {}
        self.layers = nn.ModuleList()
        self.num_layers = num_layers
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.dropout = dropout
        self.layers.append(GATConv(dataset.num_features, hidden_channels, heads=heads))
        for i in range(num_layers-2):
            self.layers.append(GATConv(hidden_channels*heads, hidden_channels, heads=heads))
        self.layers.append(GATConv(hidden_channels*heads, out_channels, heads=1))

    def forward(self, x, edge_index):
        for i,layer in enumerate(self.layers):
            x = layer(x, edge_index)
            if i!= (len(self.layers)-1):
                x = F.elu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)
            self.feature_vals['conv'+str(i)] = deepcopy(x.detach().cpu().numpy())
        return x

In [97]:
def train():
      model.train()
      optimizer.zero_grad()  # Clear gradients.
      out = model(data.x, data.edge_index)  # Perform a single forward pass.
      loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.
      loss.backward()  # Derive gradients.
      optimizer.step()  # Update parameters based on gradients.
      return loss

def test():
      model.eval()
      out = model(data.x, data.edge_index)
      pred = out.argmax(dim=1)  # Use the class with highest probability.
      test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
      test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
      return test_acc


In [100]:
run_ids=[1,2]
for i,run_id in enumerate(run_ids):
    config = {
        "model_name":"GAT",
        "task":"NC",
        "run_id":run_id,
        "dataset":dataset_name
    }
    path = 'model_data/'+config['dataset']+"/"+config['model_name']+"/"
    !mkdir -p $path
    path2 = 'model_data/'+config['dataset']+"/"+config['model_name']+"/"+config['task']+"_"+str(config['run_id'])+"*"
    !rm $path2
    model = GAT(seeds[i],hidden_channels=8, out_channels=dataset.num_classes,num_layers=4,heads=8,dropout=0.2)
    model, data = model.to(device), data.to(device)
    print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    loss_list = []
    test_acc_list = []
    for epoch in range(1, 201):
        loss = train()
        loss_list.append(loss)
        test_acc = test()
        test_acc_list.append(test_acc)
        feature_vals = deepcopy(model.feature_vals)
        feature_path =  path+config['task']+"_"+str(config['run_id'])+'_'+str(epoch)+'.npz'
        np.savez(feature_path, **feature_vals)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

GAT(
  (layers): ModuleList(
    (0): GATConv(500, 8, heads=8)
    (1): GATConv(64, 8, heads=8)
    (2): GATConv(64, 8, heads=8)
    (3): GATConv(64, 7, heads=1)
  )
)
Epoch: 001, Loss: 30.7490
Epoch: 002, Loss: 26.3083
Epoch: 003, Loss: 22.9751
Epoch: 004, Loss: 25.9306
Epoch: 005, Loss: 14.3872
Epoch: 006, Loss: 12.2486
Epoch: 007, Loss: 9.0220
Epoch: 008, Loss: 8.4570
Epoch: 009, Loss: 7.6122
Epoch: 010, Loss: 7.1342
Epoch: 011, Loss: 7.3789
Epoch: 012, Loss: 6.8383
Epoch: 013, Loss: 6.1767
Epoch: 014, Loss: 5.4838
Epoch: 015, Loss: 5.1888
Epoch: 016, Loss: 4.8787
Epoch: 017, Loss: 5.2052
Epoch: 018, Loss: 4.4151
Epoch: 019, Loss: 4.3354
Epoch: 020, Loss: 3.9293
Epoch: 021, Loss: 3.8187
Epoch: 022, Loss: 3.9186
Epoch: 023, Loss: 3.8463
Epoch: 024, Loss: 3.5450
Epoch: 025, Loss: 3.6047
Epoch: 026, Loss: 3.7084
Epoch: 027, Loss: 3.5196
Epoch: 028, Loss: 3.5735
Epoch: 029, Loss: 3.1124
Epoch: 030, Loss: 3.0824
Epoch: 031, Loss: 3.4738
Epoch: 032, Loss: 3.3428
Epoch: 033, Loss: 3.2048
E

In [102]:
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.4841


In [84]:
# #from matplotlib.pyplot import plt
# plt.figure(figsize=(20,8))
# plt.plot(test_acc_list)
# plt.title("Accuracy Over Epochs", fontsize=25)
# plt.xlabel("Epochs", fontsize=25)
# plt.ylabel("Test Accuracy", fontsize=25)
# output_filename = f'GAT_NC_test_accuracy.png'
# # Save the heatmap plot as an image
# plt.xticks(fontsize=25)
# plt.yticks(fontsize=25)
# plt.savefig(output_filename, bbox_inches='tight')
# #plt.show()
# plt.close()  # Close the plot to release resources

In [103]:
for k in model.feature_vals.keys():
    print(model.feature_vals[k].shape)

(89250, 64)
(89250, 64)
(89250, 64)
(89250, 7)


## Link Prediction

In [86]:
from torch_geometric.datasets import Planetoid, Flickr, Amazon
from torch_geometric.transforms import NormalizeFeatures


if dataset_name=='Flickr':
    transform = Compose([
        #NormalizeFeatures(),
        RandomNodeSplit('train_rest',num_val = 2000, num_test = 10000)
    ])
    dataset = Flickr(root='data/Flickr', \
                     transform =transform)
elif dataset_name=='Amazon':
    transform = Compose([
        #NormalizeFeatures(),
        RandomNodeSplit('train_rest',num_val = 1000, num_test = 3000)
    ])
    dataset = Amazon(root='data/Amazon', name='Computers', \
                     transform =transform)

elif dataset_name in ['Cora', 'Citeseer', 'Pubmed']:
    # For Planetoid datasets, the standard split is already defined
    dataset = Planetoid(root=f'data/{dataset_name}', name=dataset_name)

else:
    raise ValueError(f"Unknown dataset: {dataset_name}")


print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: Pubmed():
Number of graphs: 1
Number of features: 500
Number of classes: 3

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])
Number of nodes: 19717
Number of edges: 88648
Average node degree: 4.50
Number of training nodes: 60
Training node label rate: 0.00
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [87]:
data.train_mask = data.val_mask = data.test_mask = data.y = None
data = train_test_split_edges(data)
print(data)



Data(x=[19717, 500], val_pos_edge_index=[2, 2216], test_pos_edge_index=[2, 4432], train_pos_edge_index=[2, 75352], train_neg_adj_mask=[19717, 19717], val_neg_edge_index=[2, 2216], test_neg_edge_index=[2, 4432])


In [88]:
class Net(torch.nn.Module):
    def __init__(self, seed, hidden_channels, out_channels, num_layers, heads=8, dropout=0.3):
        super().__init__()
        torch.manual_seed(seed)
        self.feature_vals = {}
        self.layers = nn.ModuleList()
        self.num_layers = num_layers
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.dropout = dropout
        self.layers.append(GATConv(dataset.num_features, hidden_channels, heads=heads))
        for i in range(num_layers-2):
            self.layers.append(GATConv(hidden_channels*heads, hidden_channels, heads=heads))
        self.layers.append(GATConv(hidden_channels*heads, out_channels, heads=heads))

    def encode(self, x, edge_index):
        for i,layer in enumerate(self.layers):
            x = layer(x, edge_index)
            if i!= (len(self.layers)-1):
                x = F.elu(x, alpha=1)
                x = F.dropout(x, p=self.dropout, training=self.training)
            self.feature_vals['conv'+str(i)] = deepcopy(x.detach().cpu().numpy())
        return x


    def decode(self, z, pos_edge_index, neg_edge_index): # only pos and neg edges
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) # concatenate pos and neg edges
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)  # dot product 
        return logits

    def decode_all(self, z): 
        prob_adj = z @ z.t() # get adj NxN
        return (prob_adj > 0).nonzero(as_tuple=False).t() # get predicted edge_list 

In [89]:
def get_link_labels(pos_edge_index, neg_edge_index):
    # returns a tensor:
    # [1,1,1,1,...,0,0,0,0,0,..] with the number of ones is equel to the length of pos_edge_index
    # and the number of zeros is equal to the length of neg_edge_index
    E = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(E, dtype=torch.float, device=device)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels


def train():
    model.train()

    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index, #positive edges
        num_nodes=data.num_nodes, # number of nodes
        num_neg_samples=data.train_pos_edge_index.size(1)) # number of neg_sample equal to number of pos_edges

    optimizer.zero_grad()
    
    z = model.encode(data.x, data.train_pos_edge_index) #encode
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index) # decode
    
    link_labels = get_link_labels(data.train_pos_edge_index, neg_edge_index)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    return loss


@torch.no_grad()
def test():
    model.eval()
    perfs = []
    for prefix in ["val", "test"]:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']

        z = model.encode(data.x, data.train_pos_edge_index) # encode train
        link_logits = model.decode(z, pos_edge_index, neg_edge_index) # decode test or val
        link_probs = link_logits.sigmoid() # apply sigmoid
        
        link_labels = get_link_labels(pos_edge_index, neg_edge_index) # get link
        
        perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu())) #compute roc_auc score
    return perfs

In [90]:
run_ids=[1,2]
for i,run_id in enumerate(run_ids):
    config = {
        "model_name":"GAT",
        "task":"LP",
        "run_id":run_id,
        "dataset":dataset_name
    }
    path = 'model_data/'+config['dataset']+"/"+config['model_name']+"/"
    !mkdir -p $path
    path2 = 'model_data/'+config['dataset']+"/"+config['model_name']+"/"+config['task']+"_"+str(config['run_id'])+"*"
    model, data = Net(seeds[i],hidden_channels=8,out_channels=8,num_layers=4, heads=8, dropout=0).to(device), data.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
    test_acc_list=[]
    !rm $path2
    best_val_perf = test_perf = 0
    for epoch in range(1, 201):
        train_loss = train()
        val_perf, tmp_test_perf = test()
        test_acc_list.append(tmp_test_perf)
        if val_perf > best_val_perf:
            best_val_perf = val_perf
            test_perf = tmp_test_perf
        log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'
        feature_vals = deepcopy(model.feature_vals)
        feature_path =  path+config['task']+"_"+str(config['run_id'])+'_'+str(epoch)+'.npz'
        np.savez(feature_path, **feature_vals)
        if epoch % 10 == 0:
            print(log.format(epoch, train_loss, best_val_perf, test_perf))

rm: cannot remove ‘model_data/Pubmed/GAT/LP_1*’: No such file or directory
Epoch: 010, Loss: 0.6909, Val: 0.7337, Test: 0.7395
Epoch: 020, Loss: 0.6892, Val: 0.7337, Test: 0.7395
Epoch: 030, Loss: 0.6866, Val: 0.7337, Test: 0.7395
Epoch: 040, Loss: 0.6828, Val: 0.7337, Test: 0.7395
Epoch: 050, Loss: 0.6771, Val: 0.7337, Test: 0.7395
Epoch: 060, Loss: 0.6686, Val: 0.7337, Test: 0.7395
Epoch: 070, Loss: 0.6559, Val: 0.7479, Test: 0.7501
Epoch: 080, Loss: 0.6370, Val: 0.7684, Test: 0.7792
Epoch: 090, Loss: 0.6148, Val: 0.7684, Test: 0.7792
Epoch: 100, Loss: 0.5941, Val: 0.7684, Test: 0.7792
Epoch: 110, Loss: 0.5846, Val: 0.7684, Test: 0.7792
Epoch: 120, Loss: 0.5841, Val: 0.7684, Test: 0.7792
Epoch: 130, Loss: 0.5811, Val: 0.7684, Test: 0.7792
Epoch: 140, Loss: 0.5815, Val: 0.7684, Test: 0.7792
Epoch: 150, Loss: 0.5790, Val: 0.7684, Test: 0.7792
Epoch: 160, Loss: 0.5779, Val: 0.7684, Test: 0.7792
Epoch: 170, Loss: 0.5767, Val: 0.7684, Test: 0.7762
Epoch: 180, Loss: 0.5770, Val: 0.7700, Te

In [91]:
# z = model.encode(data.x, data.train_pos_edge_index)
# final_edge_index = model.decode_all(z)

In [92]:
for k in model.feature_vals.keys():
    print(model.feature_vals[k].shape)

(19717, 64)
(19717, 64)
(19717, 64)
(19717, 64)


In [77]:
# #from matplotlib.pyplot import plt
# plt.figure(figsize=(20,8))
# plt.plot(test_acc_list)
# plt.title("Accuracy Over Epochs", fontsize=25)
# plt.xlabel("Epochs", fontsize=25)
# plt.ylabel("Test Accuracy", fontsize=25)
# output_filename = f'GAT_LP_test_accuracy.png'
# # Save the heatmap plot as an image
# plt.xticks(fontsize=25)
# plt.yticks(fontsize=25)
# plt.savefig(output_filename, bbox_inches='tight')
# #plt.show()
# plt.close()  # Close the plot to release resources