# Homework 04 Graph Convolutional Neural Networks


In [7]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch.nn import Linear, ReLU, Sequential
from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader

## Part I. The training and testing dataset are provided

- The train and test datasets were pre-processed graphs. The train dataset contains 20,000 graphs, while the test dataset contains 2,000 graphs.
- Each graph contains the following components:

  - `x`, the matrix containing node features, `[num_of_nodes, num_node_features=11]`
  - `edge_index`, the matrix containing connection information about different nodes, `[2, num_of_edges]`
  - `y`, the label for the graph, `scaler`. The value is set to `0` in the test dataset
  - `pos`, the matrix containing the node positions, `[num_of_nodes, 3]`
  - `edge_attr`, the matrix containing the edge information, `[num_edges, 4]`
  - `names`, index for the graph. For example, `gdb_59377`

- Depending on the graph convolutional layer that is used, different components are needed. For the most basic application, `x`, `edge_index` and `y` will be used.


In [8]:
class QM_Dataset(Dataset):
    def __init__(self, path):
        super().__init__(root=".")
        self.data = torch.load(path, weights_only=False)

    def len(self):
        return len(self.data)

    def get(self, idx):
        return self.data[idx]


train_path = "data/train.pt"
test_path = "data/test.pt"

train_data_ = QM_Dataset(train_path)

# train dataset can be split for validation purposes
train_data, validate_data = torch.utils.data.random_split(train_data_, [
                                                          19000, 1000])
test_data = QM_Dataset(test_path)

## Part II. Example solution


In [9]:
# define the network
# many convolutional layers are available in torch_geometric.nn
# here NNConv is just used as an example

from torch_geometric.nn import NNConv, Set2Set


class Net(torch.nn.Module):
    def __init__(self, num_features=11, dim=64):
        super().__init__()
        self.lin0 = torch.nn.Linear(num_features, dim)
        nn = Sequential(Linear(4, 128), ReLU(), Linear(128, dim * dim))
        # replace with your own convolutional layers here
        self.conv = NNConv(dim, dim, nn, aggr='mean')
        # set2set is used to map from nodes to graphs
        self.set2set = Set2Set(dim, processing_steps=3)
        self.lin1 = torch.nn.Linear(2 * dim, dim)
        self.lin2 = torch.nn.Linear(dim, 1)

    def forward(self, data):
        # data.x size [batch_num_nodes, num_node_features]
        out = F.relu(self.lin0(data.x))
        for _ in range(3):
            out = F.relu(self.conv(out, data.edge_index, data.edge_attr))
        # [batch_num_nodes, dim] ==> [batch_num_graphs, dim*2]
        out = self.set2set(out, data.batch)
        out = F.relu(self.lin1(out))
        out = self.lin2(out)
        return out.view(-1)

In [10]:
# load the datasets
train_loader = DataLoader(train_data, batch_size=128)
validate_loader = DataLoader(validate_data, batch_size=128)
test_loader = DataLoader(test_data, batch_size=8)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)

In [13]:
criterion = torch.nn.L1Loss()

# define training and evaluation functions


def train(loader):
    """Takes in training dataset loader,
    train the model one step,
    update the parameters,
    return the current loss"""

    model.train()
    for data in loader:
        # move to device
        data = data.to(device)
        # train model single forward pass
        out = model(data)
        # calculate loss
        loss = criterion(out, data.y)
        # gradients
        loss.backward()
        # optimizer step
        optimizer.step()
        optimizer.zero_grad()

    return loss


def eval(loader, model):
    """Takes the validation dataset loader,
    return the validation MAE"""

    model.eval()
    for data in loader:
        # move to device also
        data = data.to(device)
        out = model(data)
        loss = criterion(out, data.y)
        return loss

In [None]:
# training
num_epochs = 5
for epoch in range(1, num_epochs):
    """Calculate loss and
    validation MAE"""
    train(train_loader)

    train_loss = eval(train_loader, model=model)
    val_loss = eval(validate_loader, model=model)
    print(
        f"epoch: {epoch} -- train_loss: {train_loss} -- val_loss: {val_loss}")

epoch: $1 -- train_loss: $0.9774715900421143 -- val_loss: $0.8649500608444214
epoch: $2 -- train_loss: $0.9523863792419434 -- val_loss: $0.8469842672348022
epoch: $3 -- train_loss: $0.9337181448936462 -- val_loss: $0.8378925323486328
epoch: $4 -- train_loss: $0.9118258357048035 -- val_loss: $0.8226449489593506


In [18]:
# predict
model.eval()
y_pred = []
Idx = []

with torch.no_grad():
    for data in test_loader:
        """Predict and save graph index and
        predicted y value"""

        # send data to device
        data = data.to(device)

        # get preds
        pred = model(data)

        # move pred back to cpu for numpy/storage
        pred = pred.detach().cpu().numpy()

        # add pred to y_pred
        y_pred.extend(pred)

        # get graph indecies
        Idx.extend([data_item.name for data_item in data.to_data_list()])

assert (len(Idx) == len(y_pred))

df = pd.DataFrame({"Idx": Idx, "labels": y_pred})

In [19]:
# upload solution
df.columns = ['Idx', 'labels']
df.to_csv("data/submission1.csv", index=False)