## MLNS Project Protein Protein Interaction

Currently I have downloaded the Protein and Enzyme data set
Following lab 4 of MLNS for Enzymes

In [1]:
! pip install dgl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dgl
  Downloading dgl-1.0.1-cp39-cp39-manylinux1_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-1.0.1


In [2]:
# Import packages
import dgl
import torch
import torch.nn.functional as F
import numpy as np
from dgl.dataloading import GraphDataLoader
from dgl.nn import GraphConv
from IPython.display import Latex
from sklearn.model_selection import train_test_split

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [3]:
dataset_enzymes = dgl.data.TUDataset(name='ENZYMES')

# Add self loop to each graph
dataset_enzymes.graph_lists = [dgl.add_self_loop(graph) for graph in dataset_enzymes.graph_lists]

Downloading /root/.dgl/ENZYMES.zip from https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip...
Extracting file to /root/.dgl/ENZYMES


In [4]:
dataset_enzymes[0]

(Graph(num_nodes=37, num_edges=205,
       ndata_schemes={'node_labels': Scheme(shape=(1,), dtype=torch.int64), 'node_attr': Scheme(shape=(18,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
 tensor([5]))

In [5]:
print('Number of graph categories:', dataset_enzymes.num_labels)
print('Dimension of nodes features', dataset_enzymes[0][0].ndata['node_attr'].shape[1])

Number of graph categories: 6
Dimension of nodes features 18


In [6]:
# Split dataset into train, validation and test sets
train_sampler, val_sampler, test_sampler = dgl.data.utils.split_dataset(
        dataset_enzymes, frac_list=[0.6, 0.2, 0.2], shuffle=True)

In [7]:
# batch graphs with GraphDataLoader
train_dataloader = GraphDataLoader(
        train_sampler, batch_size=5, drop_last=False)
val_dataloader = GraphDataLoader(
    val_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    test_sampler, batch_size=5, drop_last=False)

In [8]:
'''
class BasicGraphModel(torch.nn.Module):

    def __init__(self, n_layers, input_size, hidden_size, output_size):
        super(BasicGraphModel, self).__init__()

        # Define GNN components
        self.convs = torch.nn.ModuleList()
        self.convs.append(GraphConv(input_size, hidden_size))
        for i in range(n_layers-1):
            self.convs.append(GraphConv(hidden_size, hidden_size))
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, g, x):
        # Message Passing -- Learn node representations via GCN
        for conv in self.convs[:-1]:
            x = conv(g, x)
            x = F.elu(x)
        x = self.convs[-1](g, x)
        # Readout -- average all node representations to get graph embedding
        g.ndata['h'] = x
        x = dgl.mean_nodes(g, 'h')
        # Apply linear layer to classify graph representation
        x = self.linear(x)
        return x
'''
class BasicGraphModel(torch.nn.Module):

    def __init__(self, n_layers, input_size, hidden_size, output_size):
        super(BasicGraphModel, self).__init__()

        # Define GNN components
        self.convs = torch.nn.ModuleList()
        self.convs.append(GraphConv(input_size, hidden_size))
        for i in range(n_layers-1):
            self.convs.append(GraphConv(hidden_size, hidden_size))
        self.linear = torch.nn.Linear(hidden_size, output_size)

    def forward(self, g, x):
        # Message Passing -- Learn node representations via GCN
        for conv in self.convs[:-1]:
            x = conv(g, x)
            x = F.leaky_relu(x)
        x = self.convs[-1](g, x)
        # Readout -- average all node representations to get graph embedding
        g.ndata['h'] = x
        x = dgl.mean_nodes(g, 'h')
        # Apply linear layer to classify graph representation
        x = self.linear(x)
        return x


### 2.1 Training and evaluation

In [9]:
def train(model, loss_fcn, optimizer, train_dataloader, val_dataloader, num_epochs):
    model = model.double()
    model.train()

    for epoch in range(num_epochs):
        losses = []
        for batch, batched_graph in enumerate(train_dataloader):
            batched_graph, labels = batched_graph
            logits = model(batched_graph, batched_graph.ndata['node_attr'].double())
            loss = loss_fcn(logits, labels.T[0])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        loss_data = np.mean(losses)

        if epoch % 5 == 0:
            print("Epoch {} | Loss: {:.4f}".format(epoch, loss_data))
            test(model, loss_fcn, val_dataloader)

In [10]:
def test(model, loss_fcn, dataloader):
    scores = []
    for batch, batched_graph in enumerate(dataloader):
        batched_graph, labels = batched_graph
        scores.append(
            evaluate(model, batched_graph, labels, loss_fcn))
    mean_scores = np.mean(scores)
    print("Accuracy score: {:.4f}".format(mean_scores))

In [11]:
def evaluate(model, batched_graph, labels, loss_fcn):
    model = model.double()
    model.eval()
    with torch.no_grad():
        output = model(batched_graph, batched_graph.ndata['node_attr'].double())

    labels = labels.T[0]
    loss = loss_fcn(output, labels)
    predict = output.argmax(dim=1)
    score = (labels == predict).sum().item() / len(labels)

    return score

In [12]:
# Store features
n_features, n_classes = dataset_enzymes[0][0].ndata['node_attr'].shape[1], \
    dataset_enzymes.num_labels
hidden_size = 128

# Define model, loss function and optimizer
model = BasicGraphModel(n_layers=3, input_size=n_features,
                        hidden_size=hidden_size, output_size=n_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fcn = torch.nn.CrossEntropyLoss()

# Train and test
train(model, loss_fcn, optimizer,
        train_dataloader, val_dataloader, num_epochs=150)
test(model, loss_fcn, test_dataloader)


  assert input.numel() == input.storage().size(), (


Epoch 0 | Loss: 2.1919
Accuracy score: 0.2250
Epoch 5 | Loss: 1.7149
Accuracy score: 0.2000
Epoch 10 | Loss: 1.6954
Accuracy score: 0.2000
Epoch 15 | Loss: 1.6791
Accuracy score: 0.2083
Epoch 20 | Loss: 1.6469
Accuracy score: 0.2417
Epoch 25 | Loss: 1.5984
Accuracy score: 0.2750
Epoch 30 | Loss: 1.5538
Accuracy score: 0.2667
Epoch 35 | Loss: 1.5222
Accuracy score: 0.3333
Epoch 40 | Loss: 1.4760
Accuracy score: 0.3417
Epoch 45 | Loss: 1.4204
Accuracy score: 0.3583
Epoch 50 | Loss: 1.3656
Accuracy score: 0.3833
Epoch 55 | Loss: 1.3074
Accuracy score: 0.3667
Epoch 60 | Loss: 1.2531
Accuracy score: 0.3583
Epoch 65 | Loss: 1.1992
Accuracy score: 0.3583
Epoch 70 | Loss: 1.1548
Accuracy score: 0.3167
Epoch 75 | Loss: 1.1191
Accuracy score: 0.3167
Epoch 80 | Loss: 1.0748
Accuracy score: 0.3750
Epoch 85 | Loss: 0.9889
Accuracy score: 0.3833
Epoch 90 | Loss: 0.9679
Accuracy score: 0.3833
Epoch 95 | Loss: 0.9234
Accuracy score: 0.3833
Epoch 100 | Loss: 0.8408
Accuracy score: 0.4167
Epoch 105 | Lo

## Doing everything with Proteins

In [None]:
dataset_proteins = dgl.data.TUDataset(name='PROTEINS')

# Add self loop to each graph
dataset_proteins.graph_lists = [dgl.add_self_loop(graph) for graph in dataset_proteins.graph_lists]

Downloading /root/.dgl/PROTEINS.zip from https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip...
Extracting file to /root/.dgl/PROTEINS


In [None]:
dataset_proteins[0]

(Graph(num_nodes=42, num_edges=204,
       ndata_schemes={'node_labels': Scheme(shape=(1,), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
 tensor([0]))

In [None]:
print('Number of graph categories:', dataset_proteins.num_labels)
print('Dimension of nodes features', dataset_proteins[0][0].ndata['node_attr'].shape[1])

Number of graph categories: 2
Dimension of nodes features 1


In [None]:
# Split dataset into train, validation and test sets
train_sampler, val_sampler, test_sampler = dgl.data.utils.split_dataset(
        dataset_proteins, frac_list=[0.6, 0.2, 0.2], shuffle=True)

In [None]:
# batch graphs with GraphDataLoader
train_dataloader = GraphDataLoader(
        train_sampler, batch_size=5, drop_last=False)
val_dataloader = GraphDataLoader(
    val_sampler, batch_size=5, drop_last=False)
test_dataloader = GraphDataLoader(
    test_sampler, batch_size=5, drop_last=False)

In [None]:
# Store features
n_features, n_classes = dataset_proteins[0][0].ndata['node_attr'].shape[1], \
    dataset_proteins.num_labels
hidden_size = 64

# Define model, loss function and optimizer
model = BasicGraphModel(n_layers=5, input_size=n_features,
                        hidden_size=hidden_size, output_size=n_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fcn = torch.nn.CrossEntropyLoss()

# Train and test
train(model, loss_fcn, optimizer,
        train_dataloader, val_dataloader, num_epochs=150)
test(model, loss_fcn, test_dataloader)


Epoch 0 | Loss: 0.6883
Accuracy score: 0.5844
Epoch 5 | Loss: 0.6759
Accuracy score: 0.5889
Epoch 10 | Loss: 0.6718
Accuracy score: 0.6156
Epoch 15 | Loss: 0.6620
Accuracy score: 0.6267
Epoch 20 | Loss: 0.6631
Accuracy score: 0.6222
Epoch 25 | Loss: 0.6539
Accuracy score: 0.6311
Epoch 30 | Loss: 0.6514
Accuracy score: 0.6267
Epoch 35 | Loss: 0.6499
Accuracy score: 0.6133
Epoch 40 | Loss: 0.6490
Accuracy score: 0.6133
Epoch 45 | Loss: 0.6515
Accuracy score: 0.6178
Epoch 50 | Loss: 0.6480
Accuracy score: 0.6133
Epoch 55 | Loss: 0.6475
Accuracy score: 0.6133
Epoch 60 | Loss: 0.6472
Accuracy score: 0.6178
Epoch 65 | Loss: 0.6456
Accuracy score: 0.6089
Epoch 70 | Loss: 0.6450
Accuracy score: 0.6133
Epoch 75 | Loss: 0.6446
Accuracy score: 0.6044
Epoch 80 | Loss: 0.6448
Accuracy score: 0.6089
Epoch 85 | Loss: 0.6447
Accuracy score: 0.6089
Epoch 90 | Loss: 0.6441
Accuracy score: 0.6044
Epoch 95 | Loss: 0.6441
Accuracy score: 0.6044
Epoch 100 | Loss: 0.6444
Accuracy score: 0.6089
Epoch 105 | Lo

## Trying state of the art model GCN based on github paper using a pooling layer


In [None]:
! pip install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.3.0.tar.gz (616 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m616.2/616.2 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.3.0-py3-none-any.whl size=909897 sha256=a9c8ffed22212cad75d93c32e30e2720a0f4b75a431dccbaa7f9b1ab41c3e920
  Stored in directory: /root/.cache/pip/wheels/cd/7d/6b/17150450b80b4a3656a84330e22709ccd8dc0f8f4773ba4133
Successfully built torch_geometric
Installing collected packages: torch_geometric
Successfully installed torch_geomet

In [None]:
! pip install layers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! pip install torch_scatter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_scatter
  Downloading torch_scatter-2.1.1.tar.gz (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch_scatter
  Building wheel for torch_scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch_scatter: filename=torch_scatter-2.1.1-cp39-cp39-linux_x86_64.whl size=492132 sha256=934c89a2f79c2ad245039aba1618d4ea49ce7f14455fcf5b361970816d800799
  Stored in directory: /root/.cache/pip/wheels/d5/0c/18/11b4cf31446c5d460543b0fff930fcac3a3f8a785e5c73fb15
Successfully built torch_scatter
Installing collected packages: torch_scatter
Successfully installed torch_scatter-2.1.1


In [None]:
! pip install torch_sparse

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_sparse
  Downloading torch_sparse-0.6.17.tar.gz (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch_sparse
  Building wheel for torch_sparse (setup.py) ... [?25l[?25hdone
  Created wheel for torch_sparse: filename=torch_sparse-0.6.17-cp39-cp39-linux_x86_64.whl size=1082944 sha256=0db7882b5c3ef1beb7f76cd43c167a7c1e636bfd3baf7874556d94769272d279
  Stored in directory: /root/.cache/pip/wheels/f8/43/54/bcb8acdd1109bd1e4c71106747af298d0315cdf3f090b2ae43
Successfully built torch_sparse
Installing collected packages: torch_sparse
Successfully installed torch_sparse-0.6.17


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
from torch_geometric.nn import GCNConv

from layers import GCN, HGPSLPool


class Model(torch.nn.Module):
    def __init__(self, args):
        super(Model, self).__init__()
        self.args = args
        self.num_features = args.num_features
        self.nhid = args.nhid
        self.num_classes = args.num_classes
        self.pooling_ratio = args.pooling_ratio
        self.dropout_ratio = args.dropout_ratio
        self.sample = args.sample_neighbor
        self.sparse = args.sparse_attention
        self.sl = args.structure_learning
        self.lamb = args.lamb

        self.conv1 = GCNConv(self.num_features, self.nhid)
        self.conv2 = GCN(self.nhid, self.nhid)
        self.conv3 = GCN(self.nhid, self.nhid)

        self.pool1 = HGPSLPool(self.nhid, self.pooling_ratio, self.sample, self.sparse, self.sl, self.lamb)
        self.pool2 = HGPSLPool(self.nhid, self.pooling_ratio, self.sample, self.sparse, self.sl, self.lamb)

        self.lin1 = torch.nn.Linear(self.nhid * 2, self.nhid)
        self.lin2 = torch.nn.Linear(self.nhid, self.nhid // 2)
        self.lin3 = torch.nn.Linear(self.nhid // 2, self.num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        edge_attr = None

        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x, edge_index, edge_attr, batch = self.pool1(x, edge_index, edge_attr, batch)
        x1 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv2(x, edge_index, edge_attr))
        x, edge_index, edge_attr, batch = self.pool2(x, edge_index, edge_attr, batch)
        x2 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv3(x, edge_index, edge_attr))
        x3 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(x1) + F.relu(x2) + F.relu(x3)

        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=self.dropout_ratio, training=self.training)
        x = F.relu(self.lin2(x))
        x = F.dropout(x, p=self.dropout_ratio, training=self.training)
        x = F.log_softmax(self.lin3(x), dim=-1)

        return x

ModuleNotFoundError: ignored

In [None]:
!pip install torch-geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-geometric
  Downloading torch_geometric-2.3.0.tar.gz (616 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/616.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/616.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m616.2/616.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.3.0-py3-none-any.whl size=909897 sha256=386da170ff7a0956ae05d56d3f717ea0a92

In [None]:
dataset_proteins[0]

(Graph(num_nodes=42, num_edges=204,
       ndata_schemes={'node_labels': Scheme(shape=(1,), dtype=torch.int64), 'node_attr': Scheme(shape=(1,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}),
 tensor([0]))

## Generating Protein Sequences using GAT

In [None]:
import torch
import torch.nn.functional as F
#from torch_geometric.datasets import TUDataset
#from torch_geometric.data import DataLoader
from torch_geometric.nn import GATConv
import random

class GAT(torch.nn.Module):
    def __init__(self, num_node_features, hidden_channels, num_classes):
        super(GAT, self).__init__()
        self.conv1 = GATConv(num_node_features, hidden_channels)
        self.conv2 = GATConv(hidden_channels, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Load the TUDataset
path = 'path/to/your/data'
#dataset = TUDataset(path, name='Your_Dataset_Name')
dataset = dataset_proteins
# Split the dataset into train, val, and test sets
# Shuffle the dataset
train_sampler, val_sampler, test_sampler = dgl.data.utils.split_dataset(
        dataset, frac_list=[0.6, 0.2, 0.2], shuffle=True)
# Create DataLoaders for each dataset
train_loader = GraphDataLoader(train_sampler, batch_size=32, shuffle=True)
val_loader = GraphDataLoader(val_sampler, batch_size=32, shuffle=False)
test_loader = GraphDataLoader(test_sampler, batch_size=32, shuffle=False)

# Instantiate the GAT model, loss function, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
graph, _ = dataset[0]
num_node_features = graph.ndata['node_attr'].shape[1]
model = GAT(num_node_features=num_node_features, hidden_channels=64, num_classes=2).to(device)
loss_fn = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training function
def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        loss = loss_fn(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation function
def evaluate(loader):
    model.eval()
    correct = 0
    for data in loader:
        data = data.to(device)
        with torch.no_grad():
            out = model(data)
        pred = out.argmax(dim=1)
        correct += (pred == data.y).sum().item()
    return correct / len(loader.dataset)

# Training loop
num_epochs = 100
for epoch in range(1, num_epochs + 1):
    train_loss = train()
    val_acc = evaluate(val_loader)
    print(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}, Val Acc: {val_acc:.4f}')

# Test the model
test_acc = evaluate(test_loader)
print(f'Test Acc: {test_acc:.4f}')


AttributeError: ignored

In [None]:
import torch
import torch.nn.functional as F
import dgl.data
import dgl
from dgl.nn import GATConv
from dgl.dataloading import GraphDataLoader
from torch_geometric.nn import global_add_pool

from torch_geometric.nn import global_add_pool

class GAT(torch.nn.Module):
    def __init__(self, num_node_features, hidden_channels, num_classes):
        super(GAT, self).__init__()
        self.conv1 = GATConv(num_node_features, hidden_channels, num_heads=1)
        self.conv2 = GATConv(hidden_channels, num_classes, num_heads=1)

    def forward(self, g, x):
      x = self.conv1(g, x).squeeze(-1)
      x = F.relu(x)
      x = F.dropout(x, p=0.5, training=self.training)
      x = self.conv2(g, x).squeeze(-1)
      x = x.view(x.shape[0], -1, x.shape[-1])  # Reshape the tensor to (batch_size, num_nodes, num_classes)
      x = x.sum(dim=1)  # Sum the node features for each graph in the batch

      # Divide the summed node features by the number of nodes in each graph
      batch_num_nodes = g.batch_num_nodes().float().to(x.device)
      x = x / batch_num_nodes.view(-1, 1)

      return F.log_softmax(x, dim=1)



# Load the TUDataset
# dataset = dgl.data.TUDataset(name='Your_Dataset_Name')
dataset = dataset_proteins

# Split the dataset into train, val, and test sets
train_sampler, val_sampler, test_sampler = dgl.data.utils.split_dataset(
        dataset, frac_list=[0.6, 0.2, 0.2], shuffle=True)

# Create DataLoaders for each dataset
train_loader = GraphDataLoader(train_sampler, batch_size=32, shuffle=True)
val_loader = GraphDataLoader(val_sampler, batch_size=32, shuffle=False)
test_loader = GraphDataLoader(test_sampler, batch_size=32, shuffle=False)

# Instantiate the GAT model, loss function, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
graph, _ = dataset[0]
num_node_features = graph.ndata['node_attr'].shape[1]
model = GAT(num_node_features=num_node_features, hidden_channels=64, num_classes=2).to(device)
loss_fn = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training function
# Training function
def train():
    model.train()
    total_loss = 0
    for batched_graph, labels in train_loader:
        batched_graph = batched_graph.to(device)
        labels = labels.to(device).view(-1) # Reshape the labels
        x = batched_graph.ndata['node_attr'].float()
        optimizer.zero_grad()
        out = model(batched_graph, x)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


# Evaluation function
def evaluate(loader):
    model.eval()
    correct = 0
    for batched_graph, labels in loader:
        batched_graph = batched_graph.to(device)
        labels = labels.to(device).view(-1) # Reshape the labels
        x = batched_graph.ndata['node_attr'].float()
        with torch.no_grad():
            out = model(batched_graph, x)
        pred = out.argmax(dim=1)
        correct += (pred == labels).sum().item()
    return correct / len(loader.dataset)


# Training loop
num_epochs = 100
for epoch in range(1, num_epochs + 1):
    train_loss = train()
    val_acc = evaluate(val_loader)
    print(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}, Val Acc: {val_acc:.4f}')

# Test the model
test_acc = evaluate(test_loader)
print(f'Test Acc: {test_acc:.4f}')


RuntimeError: ignored

## GAE Implementation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import dgl
from dgl.nn import GraphConv
from sklearn.model_selection import train_test_split
import numpy as np
from dgl.data import TUDataset

# Define GCN layer
class GCNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GCNLayer, self).__init__()
        self.gcn = GraphConv(in_feats, out_feats)

    def forward(self, g, h):
        return self.gcn(g, h)

# Build the GAE encoder model
class Encoder(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super(Encoder, self).__init__()
        self.gcn1 = GCNLayer(in_feats, hidden_feats)
        self.gcn2 = GCNLayer(hidden_feats, out_feats)
        self.relu = nn.ReLU()

    def forward(self, g, h):
        h = self.gcn1(g, h)
        h = self.relu(h)
        h = self.gcn2(g, h)
        return h

# Build the GAE decoder model
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()

    def forward(self, z):
        z_transpose = z.t()
        adj_pred = torch.sigmoid(torch.matmul(z, z_transpose))
        return adj_pred

# Define the training and evaluation functions
# Define the training and evaluation functions
def train(encoder, decoder, g, features, adj_orig, optimizer, criterion):
    encoder.train()
    decoder.train()
    optimizer.zero_grad()

    z = encoder(g, features)
    adj_pred = decoder(z)

    loss = criterion(adj_pred, adj_orig)
    loss.backward()
    optimizer.step()

    return loss.item()

def evaluate(encoder, decoder, g, features, adj_orig, criterion):
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        z = encoder(g, features)
        adj_pred = decoder(z)
        loss = criterion(adj_pred, adj_orig)

    return loss.item()

# Prepare the dataset and device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset_proteins = TUDataset(name='PROTEINS')
dataset_proteins.graph_lists = [dgl.add_self_loop(graph) for graph in dataset_proteins.graph_lists]

# Set parameters
graph, _ = dataset_proteins[0]
num_node_features = graph.ndata['node_attr'].shape[1]
in_feats = num_node_features
hidden_feats = 64
out_feats = 32
lr = 0.01
epochs = 100

# Prepare the dataset
train_dataset, test_dataset = train_test_split(dataset_proteins.graph_lists, test_size=0.1, random_state=42)
train_graphs = [graph.to(device) for graph in train_dataset]
test_graphs = [graph.to(device) for graph in test_dataset]

# Initialize the models and optimizer
encoder = Encoder(in_feats, hidden_feats, out_feats).to(device)
decoder = Decoder().to(device)
model = nn.Sequential(encoder, decoder).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss()

# Train and evaluate the GAE
for epoch in range(epochs):
    train_losses = []
    for g in train_graphs:
        features = g.ndata['node_attr'].float().to(device)
        adj_orig = torch.Tensor(g.adjacency_matrix().to_dense()).to(device)
        train_loss = train(encoder, decoder, g, features, adj_orig, optimizer, criterion)
        train_losses.append(train_loss)

    train_loss_mean = np.mean(train_losses)

    test_losses = []
    for g in test_graphs:
        features = g.ndata['node_attr'].float().to(device)
        adj_orig = torch.Tensor(g.adjacency_matrix().to_dense()).to(device)
        test_loss = evaluate(encoder, decoder, g, features, adj_orig, criterion)
        test_losses.append(test_loss)

    test_loss_mean = np.mean(test_losses)

    print(f'Epoch: {epoch + 1}, Train Loss: {train_loss_mean:.4f}, Test Loss: {test_loss_mean:.4f}')


Epoch: 1, Train Loss: 0.7906, Test Loss: 0.7596
Epoch: 2, Train Loss: 0.7302, Test Loss: 0.7337
Epoch: 3, Train Loss: 0.7474, Test Loss: 0.7289
Epoch: 4, Train Loss: 0.7353, Test Loss: 0.7437
Epoch: 5, Train Loss: 0.7246, Test Loss: 0.7656
Epoch: 6, Train Loss: 0.7494, Test Loss: 0.7335
Epoch: 7, Train Loss: 0.7349, Test Loss: 0.7270
Epoch: 8, Train Loss: 0.7084, Test Loss: 0.7264
Epoch: 9, Train Loss: 0.7103, Test Loss: 0.7273
Epoch: 10, Train Loss: 0.7105, Test Loss: 0.7288
Epoch: 11, Train Loss: 0.7085, Test Loss: 0.7375
Epoch: 12, Train Loss: 0.7048, Test Loss: 0.7695
Epoch: 13, Train Loss: 0.7046, Test Loss: 0.8531
Epoch: 14, Train Loss: 0.7150, Test Loss: 0.9180
Epoch: 15, Train Loss: 0.7085, Test Loss: 0.9153
Epoch: 16, Train Loss: 0.7092, Test Loss: 0.8933
Epoch: 17, Train Loss: 0.7085, Test Loss: 0.8942
Epoch: 18, Train Loss: 0.7085, Test Loss: 0.8869
Epoch: 19, Train Loss: 0.7081, Test Loss: 0.9018
Epoch: 20, Train Loss: 0.7085, Test Loss: 0.8500
Epoch: 21, Train Loss: 0.7066