In [None]:
from collections.abc import Iterable

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
from sklearn.utils import shuffle

import torch
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.data import DataLoader

from torch_geometric.nn import MessagePassing, global_mean_pool
from torch_geometric.utils import add_self_loops, degree

from src.data_utils import load_dataset

# Scope of the notebook

In this notebook, we will use a Graph Convolutional Neural Network to solve the ESOL regression task.

First, you need to transform the molecular graph data into a format that can be fed to PyTorch Geometric layers.

Then, you will have to implement a "vanilla" graph convolutional layer, define the network module that uses this layer and train it, using predefined hyperparameters.

At the end, you will have to perform a random search over the hyperparameters in order to obtain the best possible prediction score.

As the training loss and scoring function we will use mean squared error (MSE).

# Load the dataset

In [None]:
# Dataset location and experiment configuration
path = './data/'
target_name = 'ESOL'
task = 'regression'
batch_size = 32

# Load the three dataset splits in order: train, validation, test
train_dataset, valid_dataset, test_dataset = (
    load_dataset(path, target_name, split) for split in ('train', 'val', 'test')
)

# Create data loaders for pytorch geometric

Every molecule in our dataset is now represented by a 3-tuple which consists of:
 - Adjacency matrix $\in \mathbb{M}$(n_atoms, n_atoms)
 - Atom features matrix $\in \mathbb{M}$(n_atoms, n_features)
 - Label (ESOL value)
 
[PyTorch Geometric requires a different form of the data](https://pytorch-geometric.readthedocs.io/en/latest/modules/data.html#module-torch_geometric.data). 

Please write the function that transforms the data into the correct format:
 - Atom features matrix $\in \mathbb{M}$(n_atoms, n_features) - denoted by *x*
 - Label (ESOL value) - denoted by *y*
 - Edge indices matrix $\in \mathbb{M}$(2, n_edges) - denoted by *edge_index*

In [None]:
def transform_dataset_pg(dataset):
    """Convert a loaded dataset into the PyTorch Geometric format (exercise stub).

    This is a placeholder: as written it performs no work and returns ``None``.

    Intended contract, as described in the notebook text:
        dataset: molecules, each given as a 3-tuple of (adjacency matrix of
            shape (n_atoms, n_atoms), atom features matrix of shape
            (n_atoms, n_features), label).
        returns: an indexable collection with one graph object per molecule,
            exposing ``x`` (atom features), ``y`` (label) and ``edge_index``
            (shape (2, n_edges)). Later cells read ``dataset[0].x.shape[1]``
            and ``dataset[0].y.shape[0]``, so ``x`` must be 2-D and ``y`` must
            have at least one dimension.
    """
    # TODO
    pass

In [None]:
# Convert every split to the PyTorch Geometric representation.
# The *_dataset names are rebound here, so re-running this cell would feed
# already-transformed data back into transform_dataset_pg.
train_dataset, valid_dataset, test_dataset = map(
    transform_dataset_pg, (train_dataset, valid_dataset, test_dataset)
)

In [None]:
# Use the `batch_size` configured together with the dataset settings instead of
# a repeated literal, so that changing the configuration actually takes effect.
# Only the training data is shuffled; validation and test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define the vanilla GCN layer

Please write a vanilla graph convolutional layer that inherits from [MessagePassing layer](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.message_passing.MessagePassing).

For every atom it should take all its neighbours and then:
1. Apply a linear layer on their feature vectors
2. Apply a ReLU nonlinearity
3. Aggregate information by taking the mean value of the feature vectors of all neighbours.

i.e. $x^{k}_{i} = \frac{1}{|N(i)|} \sum_{j \in N(i)} \text{ReLU}(Wx^{k-1}_{j} + b)$, where $x^{k-1}_{j}$ is the feature vector of atom $j$ from the previous layer and $N(i) = \{ j\ |\ (i, j) \in E \}$

Reminder: Make sure that self loops are included in the adjacency matrix.

In [None]:
class Vanilla_GC_Layer(MessagePassing):
    """Vanilla graph convolutional layer (exercise stub).

    This is a placeholder: the class body is empty, so it adds nothing on top
    of ``MessagePassing``.

    Intended behaviour, as described in the notebook text: for every atom,
    take all its neighbours (self loops included), apply a linear layer to
    their feature vectors, apply ReLU, and aggregate by taking the mean over
    the neighbours.
    """
    # TODO
    pass

# Create and train GC

Please define a class for the Graph Convolutional Network that uses our predefined `Vanilla_GC_Layer`. For the input graph, the network should:
1. Pass it through some number of GC layers
2. Aggregate information from the whole molecule, by applying global mean pooling
3. Pass the graph embedding into the linear layer that returns the predicted value

In the class definition you should include the following network hyperparameters:
 - *layers_num* - number of Graph Convolutional layers
 - *model_dim* - dimensionality of the model inner representation
 - *input_dim* - dimensionality of the input atom representation
 - *output_dim*  - dimensionality of the output vector

In [None]:
class GraphConvNetwork(torch.nn.Module):
    """Graph convolutional network built from ``Vanilla_GC_Layer`` (exercise stub).

    This is a placeholder: the class body is empty. A later cell constructs it
    with the keyword arguments ``layers_num``, ``model_dim``, ``input_dim`` and
    ``output_dim``, so the finished constructor must accept them.

    Intended behaviour, as described in the notebook text: pass the input
    graph through ``layers_num`` graph convolutional layers, aggregate over
    the whole molecule with global mean pooling, and map the graph embedding
    to the prediction with a linear layer.
    """
    # TODO
    pass

Define the following functions for training and validating our network:

1. Function **train** that trains the model for a given number of epochs. The function takes the *model* and *optimizer* as parameters. For every epoch step it runs the *run_epoch* function and then it calculates the MSE loss for the valid data. After the last epoch it also reports the MSE loss for the test data.

2. Function **run_epoch** that trains the model for a single epoch step.  The function takes the *model*, *optimizer* and *data_loader* as parameters. It should return the train loss for the given epoch.

3. Function **valid** that calculates the scoring function - mean squared error for the given dataset. The function takes the *model* and *data_loader* as parameters. It should return the calculated MSE score.

With these function definitions, parts of the code can be reused in the next sections of the notebook.

In [None]:
def train(model, optimizer):
    """Train ``model`` and print the loss after every epoch.

    The number of epochs and the data loaders are read from the notebook-level
    names ``epochs_num``, ``train_loader``, ``valid_loader`` and
    ``test_loader``.

    Args:
        model: network to optimise; forwarded to ``run_epoch`` and ``valid``.
        optimizer: optimizer forwarded to ``run_epoch``.

    Returns:
        None. Losses are reported through ``print`` only.
    """
    for epoch in range(epochs_num):
        # One pass over the training data, then score the current weights
        # on the validation data.
        train_loss = run_epoch(model, optimizer, train_loader)
        valid_loss = valid(model, valid_loader)
        print(f'Epoch: {epoch}, train loss: {train_loss}, valid loss: {valid_loss}')

    # The test data is scored once, after the last epoch.
    final_test_loss = valid(model, test_loader)
    print(f'End of training, test loss: {final_test_loss}')

        
def run_epoch(model, optimizer, data_loader):
    """Train ``model`` for a single epoch (exercise stub).

    This is a placeholder: as written it performs no work and returns ``None``.

    Intended contract, as described in the notebook text: train the model for
    one epoch on ``data_loader`` using ``optimizer`` and return the train loss
    for that epoch. ``train`` prints the returned value.
    """
    # TODO
    pass


def valid(model, data_loader):
    """Score ``model`` on a dataset (exercise stub).

    This is a placeholder: as written it performs no work and returns ``None``.

    Intended contract, as described in the notebook text: compute the mean
    squared error of the model's predictions on ``data_loader`` and return it.
    ``train`` calls this for both the validation and the test loader.
    """
    # TODO
    pass

In [None]:
# Default network hyperparameters.
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Architecture
layers_num = 3
model_dim = 32
# Input and output sizes are read off the first training graph
input_dim = train_dataset[0].x.shape[1]
output_dim = train_dataset[0].y.shape[0]

# Optimisation
lr = 1e-4
epochs_num = 100

In [None]:
# Build the network from the default hyperparameters, then move it to `device`
model = GraphConvNetwork(
    layers_num=layers_num,
    model_dim=model_dim,
    input_dim=input_dim,
    output_dim=output_dim,
)
model = model.to(device)

# Adam over all parameters of the network
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train the network with the default hyperparameters
train(model=model, optimizer=optimizer)

# Hyperparameter search to obtain better results

In [None]:
def make_params_grid(params, max_parameter_sets, randomize=True, random_state=None):
    """Build the hyperparameter settings to try, optionally as a random sample.

    Args:
        params: dict mapping a hyperparameter name to either a single value or
            an iterable of candidate values. Single values, including strings
            and bytes, are wrapped in a one-element list.
        max_parameter_sets: maximum number of settings returned when
            ``randomize`` is true. Ignored when ``randomize`` is false.
        randomize: when true, shuffle the full grid and keep the first
            ``max_parameter_sets`` settings; when false, return the full grid.
        random_state: forwarded to ``sklearn.utils.shuffle`` so the sample can
            be made reproducible. ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        With ``randomize=True``, a sequence of at most ``max_parameter_sets``
        parameter dicts in random order. Otherwise the full ``ParameterGrid``.
    """
    def to_list(value):
        # Strings and bytes are iterable but denote one setting; ParameterGrid
        # rejects a bare string as a list of candidates, so wrap them too.
        if isinstance(value, (str, bytes)) or not isinstance(value, Iterable):
            return [value]
        return value

    params = {name: to_list(value) for name, value in params.items()}
    grid = ParameterGrid(params)
    if not randomize:
        return grid
    # Shuffle the whole grid, then truncate. Slicing also copes with a grid
    # that holds fewer than max_parameter_sets settings.
    return shuffle(grid, random_state=random_state)[:max_parameter_sets]

In our experiment we will look for the best setting of the following hyperparameters:
 - learning rate
 - epochs_num
 - batch_size
 - layers_num
 - model_dim


You should define a function that searches for the best set of hyperparameters from a predefined distribution. 

1. The function **train_for_params** creates and trains the model for a single hyperparameters setting. The function takes *params* as parameters. It should:
    1. Define the data loaders (as they need the *batch_size* parameter that we just sampled).
    2. Define the network (as it needs the *layers_num* and *model_dim* parameters that we just sampled).
    3. Define the optimizer (as it needs the learning rate, *lr*, that we just sampled).
    4. Train the network with a given optimizer and data loaders, for a given number of epochs (*epochs_num* parameter that we just sampled) - here you can reuse the *run_epoch* function.
    5. Test the trained model on valid data (you can also calculate the MSE on test data there, to make your work easier).
    6. Return the valid and test loss for the given hyperparameters setting.

2. The function **grid_search** looks for the best hyperparameters setting for a given number of trials. The function takes *param_grid* and *max_parameter_sets* as arguments. In each step, it should take the sampled hyperparameters, call the function *train_for_params* and save the validation score of the resulting model. At the end, it should select the best model, i.e. the one with the lowest validation score, and return the test score of that model.

In [None]:
def train_for_params(params):
    """Create and train a model for one hyperparameter setting (exercise stub).

    This is a placeholder: as written it performs no work and returns ``None``.

    Intended contract, as described in the notebook text: using the sampled
    ``params``, define the data loaders, the network and the optimizer, train
    for the sampled number of epochs (``run_epoch`` can be reused), and return
    the valid and test loss for this setting.
    """
    # TODO
    pass


def grid_search(param_grid, max_parameter_sets):
    """Search for the best hyperparameter setting (exercise stub).

    This is a placeholder: as written it performs no work and returns ``None``.

    Intended contract, as described in the notebook text: for up to
    ``max_parameter_sets`` settings sampled from ``param_grid``, call
    ``train_for_params`` and record the validation score; finally pick the
    setting with the lowest validation score and return its test score.
    """
    # TODO
    pass

In [None]:
# Candidate values for every hyperparameter explored by the random search
param_grid = dict(
    lr=[0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001],
    epochs_num=[20, 50, 100, 200],
    batch_size=[16, 32, 64, 128],
    layers_num=[1, 2, 4, 6, 8],
    model_dim=[16, 32, 64, 128, 256, 512],
)

# Maximum number of trials
max_parameter_sets = 50

In [None]:
# Search for the best model!
# Pass the configured trial budget rather than repeating its value as a literal,
# so that editing `max_parameter_sets` actually changes the search.
grid_search(param_grid=param_grid, max_parameter_sets=max_parameter_sets)

# Test other network settings

In order to obtain the best possible score, you could experiment with other network settings, such as:
 - Adding dropout
 - Adding residual connections
 - Using different types of graph convolutional layer
 - Using different aggregation types
 - Adding more layers after the aggregation
 - Extending the grid that we used for the hyperparameter search