# Molecular set representation learning - Molecular property prediction

## Imports

In [1]:
from multiprocessing import cpu_count

import torch

import pandas as pd
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint

# Use PyG's DataLoader rather than torch.utils.data.DataLoader: it collates graph `Data` objects into mini-batches.
from torch_geometric.loader import DataLoader

from torch_geometric.datasets import LRGBDataset

from molsetrep.models import LightningSRGNNClassifier

## Prepare the data

### Get LRGB Data

In [2]:
# Load the train / val / test splits of the LRGB "Peptides-func" dataset.
# All three share the same root directory ("./tmp"); the splits are built in
# the same order as before (train, val, test).
dataset_train, dataset_val, dataset_test = (
    LRGBDataset("./tmp", "Peptides-func", split)
    for split in ("train", "val", "test")
)

In [3]:
# Feature widths read from the training split, ordered as
# [node feature count, edge feature count]; used below to size the model inputs.
dims_graph = [dataset_train.num_node_features, dataset_train.num_edge_features]
dims_graph

[9, 3]

### Get torch data loaders

#### Set-enhanced GNN (SR-GNN)

In [4]:
# Settings shared by all three loaders. The number of worker processes is the
# machine's CPU count, capped at 8.
loader_kwargs = dict(batch_size=64, num_workers=min(cpu_count(), 8))

# Training: reshuffle every epoch and drop the final partial batch so every
# optimisation step sees a full batch of 64 graphs.
train_loader = DataLoader(
    dataset_train,
    shuffle=True,
    drop_last=True,
    **loader_kwargs,
)

# Validation / test: evaluate on *every* sample in a fixed order.
# Shuffling brings no benefit at evaluation time, and drop_last=True would
# silently exclude the trailing partial batch from the reported metrics.
val_loader = DataLoader(
    dataset_val,
    shuffle=False,
    drop_last=False,
    **loader_kwargs,
)

test_loader = DataLoader(
    dataset_test,
    shuffle=False,
    drop_last=False,
    **loader_kwargs,
)

## Train

### Initialise the model

#### Set-enhanced GNN (SR-GNN)

In [7]:
# Build the set-enhanced GNN classifier.
# NOTE(review): the meaning of the two leading positional lists is defined by
# molsetrep's LightningSRGNNClassifier signature, which is not visible here —
# confirm against the installed version.
model_graph = LightningSRGNNClassifier(
    [128, 128],
    [64, 64],
    n_hidden_channels=[128, 64],
    n_in_channels=dims_graph[0],  # node feature count
    n_edge_channels=dims_graph[1],  # edge feature count
    n_layers=8,
    n_classes=10,  # presumably the number of Peptides-func target labels — TODO confirm
)

ValueError: Unexpected keyword arguments: `task`

### Initialise the trainer and fit

#### Set-enhanced GNN (SR-GNN)

In [43]:
# Keep a single checkpoint on disk under ./tmp.
checkpoint_callback = ModelCheckpoint(save_top_k=1, dirpath="./tmp")

# Train for 10 epochs with checkpointing enabled; every other Trainer option
# is left at its default.
trainer_graph = pl.Trainer(callbacks=[checkpoint_callback], max_epochs=10)

trainer_graph.fit(
    model_graph,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

# No model is passed to test(), so Lightning decides which weights to evaluate.
# NOTE(review): presumably the checkpoint saved during the fit above — verify
# for the installed Lightning version.
trainer_graph.test(dataloaders=test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name          | Type              | Params | Mode 
-------------------------------------------------------------
0  | gnn_regressor | SRGNNRegressor    | 1.5 M  | train
1  | train_r2      | R2Score           | 0      | train
2  | train_pearson | PearsonCorrCoef   | 0      | train
3  | train_rmse    | MeanSquaredError  | 0      | train
4  | train_mae     | MeanAbsoluteError | 0      | train
5  | val_r2        | R2Score           | 0      | train
6  | val_pearson   | PearsonCorrCoef   | 0      | train
7  | val_rmse      | MeanSquaredError  | 0      | train
8  | val_mae       | MeanAbsoluteError | 0      | train
9  | test_r2       | R2Score           | 0      | train
10 | test_pearson  | PearsonCorrCoef   | 0      | train
11 | test_rmse     | MeanSquaredError  | 0      | train
12 | test_mae      | MeanAbsoluteError | 0      | t

Sanity Checking DataLoader 0:   0%|                                                                                    | 0/2 [00:00<?, ?it/s]

  loss = F.mse_loss(out, y)


RuntimeError: The size of tensor a (64) must match the size of tensor b (10) at non-singleton dimension 1