In [None]:
import os
import torch
import numpy as np
# Publish the installed PyTorch version through the TORCH environment
# variable and echo it so the notebook output records which build was used.
os.environ.update(TORCH=torch.__version__)
print(os.environ['TORCH'])

# Compute on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.transforms import Compose
from torch_geometric.datasets import Amazon
from torch_geometric.transforms.random_node_split import RandomNodeSplit
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv
from sklearn.metrics import roc_auc_score

from torch_geometric.utils import negative_sampling
from torch_geometric.utils import train_test_split_edges

from copy import deepcopy
import torch.nn as nn
from IPython.display import Javascript  # Restrict height of output cell.

In [None]:
# Benchmark to run. The dataset-loading cells accept one of:
#   'Flickr', 'Amazon', 'Cora', 'Citeseer', 'Pubmed'
dataset_name = 'Flickr'

# Model-initialisation seeds; the run loops pick seeds[i] for the i-th run.
seeds = [
    1234567,
    12345,
]

In [None]:
from torch_geometric.datasets import Planetoid, Flickr, Amazon
from torch_geometric.transforms import NormalizeFeatures


# Build the dataset selected by `dataset_name`. Flickr and Amazon are wrapped
# in a RandomNodeSplit transform; the Planetoid datasets are loaded without a
# transform and therefore keep the masks they ship with.
if dataset_name=='Flickr':
    transform = Compose([
        #NormalizeFeatures(),
        RandomNodeSplit('train_rest',num_val = 2000, num_test = 10000)
    ])
    dataset = Flickr(root='data/Flickr', transform=transform)
elif dataset_name=='Amazon':
    transform = Compose([
        #NormalizeFeatures(),
        RandomNodeSplit('train_rest',num_val = 1000, num_test = 3000)
    ])
    dataset = Amazon(root='data/Amazon', name='Computers', transform=transform)
elif dataset_name in ['Cora', 'Citeseer', 'Pubmed']:
    # For Planetoid datasets, the standard split is already defined
    dataset = Planetoid(root=f'data/{dataset_name}', name=dataset_name)
else:
    raise ValueError(f"Unknown dataset: {dataset_name}")

print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# RandomNodeSplit is attached as a `transform`, i.e. it is applied when a
# graph is fetched from the dataset and draws from the global torch RNG.
# Seed immediately before the access so the train/val/test masks are the same
# on every Restart & Run All instead of changing between kernel sessions.
torch.manual_seed(seeds[0])
data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

In [None]:
from torch_geometric.loader import ClusterData, ClusterLoader

# Fix the global torch RNG before partitioning and before the shuffled loader
# below is iterated.
torch.manual_seed(123)
cluster_data = ClusterData(data, num_parts=128)  # 1. Create subgraphs (128 parts).
train_loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True)  # 2. Stochastic partitioning scheme (32 parts per batch).

# Sanity check: walk the loader once, print every mini-batch and count the
# nodes it contains.
print()
total_num_nodes = 0
labels=[]  # per-batch label tensors collected during the walk
for step, sub_data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of nodes in the current batch: {sub_data.num_nodes}')
    print(sub_data)
    print()
    total_num_nodes += sub_data.num_nodes
    labels.append(sub_data.y)

print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')

In [None]:
class CGCN(torch.nn.Module):
    """Stack of ``GCNConv`` layers used for node classification.

    The network has one input layer, ``num_layers - 2`` hidden layers and one
    output layer. Every layer except the last is followed by ReLU and dropout.

    Args:
        seed: Value passed to ``torch.manual_seed`` before the layers are
            created, so weight initialisation is repeatable.
        hidden_channels: Width of every intermediate representation.
        out_channels: Width of the final layer's output.
        num_layers: Requested depth; ``num_layers - 2`` hidden-to-hidden
            layers are inserted between the input and output layers.
        dropout: Dropout probability applied after each non-final layer.
        in_channels: Input feature width. When ``None`` (the default) it is
            read from the notebook-level ``dataset.num_features``, which is
            what the original implementation always did.

    Attributes:
        feature_vals: ``{'conv<i>': ndarray}`` holding a CPU copy of the output
            of layer ``i``; filled by :meth:`inference` only.
    """

    def __init__(self, seed, hidden_channels, out_channels, num_layers, dropout=0.3,
                 in_channels=None):
        super().__init__()
        torch.manual_seed(seed)
        if in_channels is None:
            # Resolved after seeding and before any layer is built, exactly
            # where the original evaluated `dataset.num_features`.
            in_channels = dataset.num_features
        self.feature_vals = {}
        self.num_layers = num_layers
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.dropout = dropout
        self.layers = nn.ModuleList()
        self.layers.append(GCNConv(in_channels, hidden_channels))
        for _ in range(num_layers-2):
            self.layers.append(GCNConv(hidden_channels, hidden_channels))
        self.layers.append(GCNConv(hidden_channels, out_channels))

    def _apply_layers(self, x, edge_index, record):
        """Run ``x`` through every layer; optionally snapshot each output."""
        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            x = layer(x, edge_index)
            if i != last:
                x = x.relu()
                x = F.dropout(x, p=self.dropout, training=self.training)
            if record:
                # Detached CPU copy so the snapshot never aliases live tensors.
                self.feature_vals['conv'+str(i)] = x.detach().cpu().numpy().copy()
        return x

    def forward(self, x, edge_index):
        """Return the final-layer output without recording activations."""
        return self._apply_layers(x, edge_index, record=False)

    def inference(self, x, edge_index):
        """Same computation as :meth:`forward`, but also stores every layer's
        output in ``self.feature_vals``."""
        return self._apply_layers(x, edge_index, record=True)

In [None]:

def train():
    """Run one epoch of mini-batch training over ``train_loader``.

    Uses the notebook-level ``model``, ``train_loader``, ``device``,
    ``criterion`` and ``optimizer``. For every batch the loss is computed on
    the nodes selected by the batch's ``train_mask``.

    Returns:
        float: Sum of the per-batch loss values for this epoch.
    """
    model.train()
    final_loss = 0.0
    for sub_data in train_loader:
        sub_data = sub_data.to(device)
        optimizer.zero_grad()
        out = model(sub_data.x, sub_data.edge_index)
        loss = criterion(out[sub_data.train_mask], sub_data.y[sub_data.train_mask])
        loss.backward()
        optimizer.step()
        # Accumulate a plain Python number: adding the tensor itself would
        # chain every batch's autograd history into the running total and
        # hand a grad-requiring tensor back to the caller.
        final_loss += loss.item()

    return final_loss

@torch.no_grad()
def test():
    """Evaluate the notebook-level ``model`` on the full graph ``data``.

    Runs ``model.inference`` (which also refreshes ``model.feature_vals``)
    with gradient tracking disabled, since nothing is back-propagated here.

    Returns:
        list[float]: Accuracy on ``data.train_mask``, ``data.val_mask`` and
        ``data.test_mask``, in that order. A mask that selects no nodes
        yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    model.eval()
    out = model.inference(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    accs = []
    for mask in [data.train_mask, data.val_mask, data.test_mask]:
        correct = pred[mask] == data.y[mask]  # Check against ground-truth labels.
        num_selected = int(mask.sum())
        # Derive ratio of correct predictions.
        accs.append(int(correct.sum()) / num_selected if num_selected else 0.0)
    return accs

In [None]:
# Train one CGCN per run id and dump the per-layer activations after every
# epoch to model_data/<dataset>/<model_name>/<task>_<run_id>_<epoch>.npz.
run_ids=[1,2]
for i,run_id in enumerate(run_ids):
    config = {
        "model_name":"CGCN",
        "task":"NC",
        "run_id":run_id,
        "dataset":dataset_name
    }
    path = 'model_data/'+config['dataset']+"/"+config['model_name']+"/"
    os.makedirs(path, exist_ok=True)

    # Remove dumps left by an earlier execution of this exact run. The
    # trailing "_" keeps run 1 from also matching files of runs 10, 11, ...
    # (the former shell glob "<task>_<run_id>*" did), and no error is shown
    # when there is nothing to delete.
    stale_prefix = config['task']+"_"+str(config['run_id'])+"_"
    for stale_name in os.listdir(path):
        stale_file = os.path.join(path, stale_name)
        if stale_name.startswith(stale_prefix) and os.path.isfile(stale_file):
            os.remove(stale_file)

    model = CGCN(seeds[i],hidden_channels=64, out_channels=dataset.num_classes,num_layers=4,dropout=0.2)
    model, data = model.to(device), data.to(device)
    print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    loss_list = []
    test_acc_list = []
    for epoch in range(1, 101):
        # float() accepts both a Python number and a 0-dim tensor, so the
        # history list never holds tensors.
        loss = float(train())
        loss_list.append(loss)
        train_acc, val_acc, test_acc = test()
        test_acc_list.append(test_acc)
        # test() just refreshed model.feature_vals; np.savez only reads the
        # arrays, so no extra copy is needed before writing.
        feature_path =  path+config['task']+"_"+str(config['run_id'])+'_'+str(epoch)+'.npz'
        np.savez(feature_path, **model.feature_vals)

        print(f'Epoch: {epoch:03d}, Train: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}')
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
# Re-evaluate the model left in `model` by the training loop above and report
# its accuracy on the test mask.
train_acc, val_acc, test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

In [None]:
# #from matplotlib.pyplot import plt
# plt.figure(figsize=(20,8))
# plt.plot(test_acc_list)
# plt.title("Accuracy Over Epochs", fontsize=25)
# plt.xlabel("Epochs", fontsize=25)
# plt.ylabel("Test Accuracy", fontsize=25)
# output_filename = f'CGCN_NC_test_accuracy.png'
# # Save the heatmap plot as an image
# plt.xticks(fontsize=25)
# plt.yticks(fontsize=25)
# plt.savefig(output_filename, bbox_inches='tight')
# #plt.show()
# plt.close()  # Close the plot to release resources

In [None]:
# Show the shape of every per-layer activation array stored on the model.
for k, layer_output in model.feature_vals.items():
    print(layer_output.shape)

## Link Prediction

In [None]:
# Compute on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

import warnings

# Blanket filter: hide warnings of every category from here on.
# (Ignoring all warnings is not recommended for production code.)
warnings.filterwarnings("ignore")

In [None]:
from torch_geometric.datasets import Planetoid, Flickr, Amazon
from torch_geometric.transforms import NormalizeFeatures


# Reload the dataset chosen by `dataset_name` for the link-prediction part.
# Planetoid datasets are loaded as shipped; Flickr and Amazon are wrapped in a
# RandomNodeSplit transform.
if dataset_name in ['Cora', 'Citeseer', 'Pubmed']:
    dataset = Planetoid(root=f'data/{dataset_name}', name=dataset_name)
elif dataset_name=='Flickr':
    transform = Compose([RandomNodeSplit('train_rest', num_val=2000, num_test=10000)])
    dataset = Flickr(root='data/Flickr', transform=transform)
elif dataset_name=='Amazon':
    transform = Compose([RandomNodeSplit('train_rest', num_val=1000, num_test=3000)])
    dataset = Amazon(root='data/Amazon', name='Computers', transform=transform)
else:
    raise ValueError(f"Unknown dataset: {dataset_name}")

# Dataset-level summary (leading empty entry reproduces the blank line).
dataset_summary = [
    '',
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
]
print('\n'.join(dataset_summary))

data = dataset[0]  # Get the first graph object.

# Graph-level statistics for the single graph in the dataset.
graph_summary = [
    '',
    str(data),
    '===========================================================================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Number of training nodes: {data.train_mask.sum()}',
    f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
]
print('\n'.join(graph_summary))

In [None]:
from torch_geometric.loader import ClusterData, ClusterLoader

# Fix the global torch RNG before partitioning and before the shuffled loader
# below is iterated.
# NOTE(review): the clusters are built from `data` before the edge split in
# the next cell, so the mini-batches appear to still contain the edges that
# are later held out for validation/testing - confirm this is intended.
torch.manual_seed(123)
cluster_data = ClusterData(data, num_parts=128)  # 1. Create subgraphs (128 parts).
train_loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True)  # 2. Stochastic partitioning scheme (32 parts per batch).

# Sanity check: walk the loader once, print every mini-batch and count the
# nodes it contains.
print()
total_num_nodes = 0
labels=[]  # per-batch label tensors collected during the walk
for step, sub_data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of nodes in the current batch: {sub_data.num_nodes}')
    print(sub_data)
    print()
    total_num_nodes += sub_data.num_nodes
    labels.append(sub_data.y)

print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')

In [None]:
# Clear the node-level masks and labels (set to None), then split the graph's
# edges into train/val/test sets for link prediction.
for attr_name in ('train_mask', 'val_mask', 'test_mask', 'y'):
    setattr(data, attr_name, None)
data = train_test_split_edges(data)
print(data)

In [None]:
class Net(torch.nn.Module):
    """GCN encoder with a dot-product decoder for link prediction.

    The encoder has one input layer, ``num_layers - 2`` hidden layers and one
    output layer of ``GCNConv``. Every layer except the last is followed by
    ReLU and dropout.

    Args:
        seed: Value passed to ``torch.manual_seed`` before the layers are
            created, so weight initialisation is repeatable.
        hidden_channels: Width of every intermediate representation.
        out_channels: Width of the node embeddings produced by the encoder.
        num_layers: Requested depth; ``num_layers - 2`` hidden-to-hidden
            layers are inserted between the input and output layers.
        dropout: Dropout probability applied after each non-final layer.
        in_channels: Input feature width. When ``None`` (the default) it is
            read from the notebook-level ``dataset.num_features``, which is
            what the original implementation always did.

    Attributes:
        feature_vals: ``{'conv<i>': ndarray}`` holding a CPU copy of the output
            of layer ``i``; filled by :meth:`encode_infer` only.
    """

    def __init__(self, seed, hidden_channels, out_channels, num_layers, dropout=0.3,
                 in_channels=None):
        # The base initialiser used to be invoked twice; once is sufficient.
        super().__init__()
        torch.manual_seed(seed)
        if in_channels is None:
            # Resolved after seeding and before any layer is built, exactly
            # where the original evaluated `dataset.num_features`.
            in_channels = dataset.num_features
        self.feature_vals = {}
        self.num_layers = num_layers
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.dropout = dropout
        self.layers = nn.ModuleList()
        self.layers.append(GCNConv(in_channels, hidden_channels))
        for _ in range(num_layers-2):
            self.layers.append(GCNConv(hidden_channels, hidden_channels))
        self.layers.append(GCNConv(hidden_channels, out_channels))

    def _propagate(self, x, edge_index, record):
        """Run ``x`` through every layer; optionally snapshot each output."""
        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            x = layer(x, edge_index)
            if i != last:
                x = x.relu()
                x = F.dropout(x, p=self.dropout, training=self.training)
            if record:
                # Detached CPU copy so the snapshot never aliases live tensors.
                self.feature_vals['conv'+str(i)] = x.detach().cpu().numpy().copy()
        return x

    def encode(self, x, edge_index):
        """Return node embeddings for training.

        Activations are not recorded here: copying every layer output to the
        CPU on each training batch is pure overhead, and the snapshots that
        get saved come from :meth:`encode_infer`.
        """
        return self._propagate(x, edge_index, record=False)

    def encode_infer(self, x, edge_index):
        """Return node embeddings and store every layer's output in
        ``self.feature_vals``."""
        return self._propagate(x, edge_index, record=True)

    def decode(self, z, pos_edge_index, neg_edge_index): # only pos and neg edges
        """Score candidate edges: one dot product of the two endpoint
        embeddings per column of ``[pos_edge_index | neg_edge_index]``."""
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) # concatenate pos and neg edges
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)  # dot product
        return logits

    def decode_all(self, z):
        """Return, as a ``2 x K`` index tensor, every node pair whose
        embedding dot product is positive."""
        prob_adj = z @ z.t() # get adj NxN
        return (prob_adj > 0).nonzero(as_tuple=False).t() # get predicted edge_list

In [None]:
def get_link_labels(pos_edge_index, neg_edge_index):
    """Build the binary target vector for a batch of candidate edges.

    Args:
        pos_edge_index: Index tensor whose second dimension counts the
            positive edges.
        neg_edge_index: Index tensor whose second dimension counts the
            negative edges.

    Returns:
        A float tensor ``[1, ..., 1, 0, ..., 0]`` with one ``1`` per positive
        edge followed by one ``0`` per negative edge. It is created on the
        device of ``pos_edge_index`` rather than on a notebook-level
        ``device`` global, so the labels always sit next to the edges they
        describe.
    """
    num_pos = pos_edge_index.size(1)
    num_neg = neg_edge_index.size(1)
    link_labels = torch.zeros(num_pos + num_neg, dtype=torch.float,
                              device=pos_edge_index.device)
    link_labels[:num_pos] = 1.
    return link_labels


def train():
    """Run one epoch of mini-batch link-prediction training.

    Uses the notebook-level ``model``, ``train_loader``, ``device`` and
    ``optimizer``. For every mini-batch the edges are split again, as many
    negative edges as training positives are sampled, and the encoder is
    optimised with binary cross-entropy on the dot-product scores.

    Returns:
        float: Sum of the per-batch loss values for this epoch.
    """
    # NOTE(review): `train_loader` is created before the global edge split,
    # so these batches may contain edges that `data` holds out for
    # validation/testing; the per-batch re-split below is also redrawn on
    # every call. Confirm both are intended.
    model.train()
    final_loss = 0.0
    for sub_data in train_loader:  # Iterate over each mini-batch.
        sub_data = sub_data.to(device)
        sub_data.train_mask = sub_data.val_mask = sub_data.test_mask = sub_data.y = None
        sub_data = train_test_split_edges(sub_data)

        neg_edge_index = negative_sampling(
            edge_index=sub_data.train_pos_edge_index, #positive edges
            num_nodes=sub_data.num_nodes, # number of nodes
            num_neg_samples=sub_data.train_pos_edge_index.size(1)) # number of neg_sample equal to number of pos_edges

        optimizer.zero_grad()

        z = model.encode(sub_data.x, sub_data.train_pos_edge_index) #encode
        link_logits = model.decode(z, sub_data.train_pos_edge_index, neg_edge_index) # decode

        link_labels = get_link_labels(sub_data.train_pos_edge_index, neg_edge_index)
        loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
        loss.backward()
        optimizer.step()
        # Accumulate a plain Python number: adding the tensor itself would
        # chain every batch's autograd history into the running total and
        # hand a grad-requiring tensor back to the caller.
        final_loss += loss.item()
    return final_loss


@torch.no_grad()
def test():
    """Score the notebook-level ``model`` on the held-out edges of ``data``.

    Node embeddings are computed once from the training edges via
    ``model.encode_infer``; each split's positive and negative edges are then
    decoded, passed through a sigmoid and compared with their 1/0 labels.

    Returns:
        list: ROC-AUC for the ``val`` split followed by the ``test`` split.
    """
    model.eval()
    z = model.encode_infer(data.x, data.train_pos_edge_index)

    def _split_auc(prefix):
        # ROC-AUC of the dot-product scores on one held-out split.
        pos_edges = data[f'{prefix}_pos_edge_index']
        neg_edges = data[f'{prefix}_neg_edge_index']
        scores = model.decode(z, pos_edges, neg_edges).sigmoid()
        targets = get_link_labels(pos_edges, neg_edges)
        return roc_auc_score(targets.cpu(), scores.cpu())

    return [_split_auc(prefix) for prefix in ("val", "test")]

In [None]:
# Train one link-prediction model per run id and dump the per-layer
# activations after every epoch to
# model_data/<dataset>/<model_name>/<task>_<run_id>_<epoch>.npz.
run_ids=[1,2]
for i,run_id in enumerate(run_ids):
    config = {
        "model_name":"CGCN",
        "task":"LP",
        "run_id":run_id,
        "dataset":dataset_name
    }
    path = 'model_data/'+config['dataset']+"/"+config['model_name']+"/"
    os.makedirs(path, exist_ok=True)

    # Remove dumps left by an earlier execution of this exact run. The
    # trailing "_" keeps run 1 from also matching files of runs 10, 11, ...
    # (the former shell glob "<task>_<run_id>*" did), and no error is shown
    # when there is nothing to delete.
    stale_prefix = config['task']+"_"+str(config['run_id'])+"_"
    for stale_name in os.listdir(path):
        stale_file = os.path.join(path, stale_name)
        if stale_name.startswith(stale_prefix) and os.path.isfile(stale_file):
            os.remove(stale_file)

    model, data = Net(seeds[i],hidden_channels=64,out_channels=64,num_layers=4,dropout=0).to(device), data.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001, weight_decay=5e-4)
    # test_perf tracks the test score at the epoch with the best validation
    # score seen so far.
    best_val_perf = test_perf = 0
    test_acc_list=[]
    for epoch in range(1, 201):
        # float() accepts both a Python number and a 0-dim tensor.
        train_loss = float(train())
        val_perf, tmp_test_perf = test()
        test_acc_list.append(tmp_test_perf)
        if val_perf > best_val_perf:
            best_val_perf = val_perf
            test_perf = tmp_test_perf
        log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'
        # test() just refreshed model.feature_vals; np.savez only reads the
        # arrays, so no extra copy is needed before writing.
        feature_path =  path+config['task']+"_"+str(config['run_id'])+'_'+str(epoch)+'.npz'
        np.savez(feature_path, **model.feature_vals)
        if epoch % 10 == 0:
            print(log.format(epoch, train_loss, best_val_perf, tmp_test_perf))

In [None]:
# z = model.encode(data.x, data.train_pos_edge_index)
# final_edge_index = model.decode_all(z)

In [None]:
# Show the shape of every per-layer activation array stored on the model.
for k, layer_output in model.feature_vals.items():
    print(layer_output.shape)

In [None]:
# #from matplotlib.pyplot import plt
# plt.figure(figsize=(20,8))
# plt.plot(test_acc_list)
# plt.title("Accuracy Over Epochs", fontsize=25)
# plt.xlabel("Epochs", fontsize=25)
# plt.ylabel("Test Accuracy", fontsize=25)
# output_filename = f'CGCN_LP_test_accuracy.png'
# # Save the heatmap plot as an image
# plt.xticks(fontsize=25)
# plt.yticks(fontsize=25)
# plt.savefig(output_filename, bbox_inches='tight')
# #plt.show()
# plt.close()  # Close the plot to release resources