# Molecular set representation learning - Molecular property prediction

## Imports

In [1]:
from multiprocessing import cpu_count

import torch

import pandas as pd
import lightning.pytorch as pl

from torch.utils.data import DataLoader

from molsetrep.models import (
    LightningSRRegressor,
    LightningDualSRRegressor,
    LightningSRGNNRegressor,
)
from molsetrep.encoders import SingleSetEncoder, DualSetEncoder, GraphEncoder

## Prepare the data

### Load from file

In [2]:
df_train = pd.read_csv("../data/adme/ADME_MDR1_ER_train.csv")
df_test = pd.read_csv("../data/adme/ADME_MDR1_ER_test.csv")

### Encode the data

#### Single-set (MSR1)

In [3]:
enc_single = SingleSetEncoder()
dataset_train_single = enc_single.encode(df_train["smiles"], df_train["activity"], torch.float32)
dataset_test_single = enc_single.encode(df_test["smiles"], df_test["activity"], torch.float32)

# Get the dimensions of the encoding
dims_single = [dataset_train_single[0][0].shape[0]]

#### Dual-set (MSR2)

In [4]:
enc_dual = DualSetEncoder()
dataset_train_dual = enc_dual.encode(df_train["smiles"], df_train["activity"], torch.float32)
dataset_test_dual = enc_dual.encode(df_test["smiles"], df_test["activity"], torch.float32)

# Get the dimensions of the encoding
dims_dual = [
    len(dataset_train_dual[0][0][0]),
    len(dataset_train_dual[0][1][0])
]

#### Set-enhanced GNN (SR-GNN)

In [5]:
enc_graph = GraphEncoder()
dataset_train_graph = enc_graph.encode(df_train["smiles"], df_train["activity"], label_dtype=torch.float32)
dataset_test_graph = enc_graph.encode(df_test["smiles"], df_test["activity"], label_dtype=torch.float32)

# Get the dimensions of the encoding
dims_graph = [
    dataset_train_graph.dataset[0].num_node_features,
    dataset_train_graph.dataset[0].num_edge_features,
]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2113/2113 [00:06<00:00, 309.57it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 529/529 [00:01<00:00, 308.58it/s]


### Get torch data loaders

#### Single-set (MSR1)

In [6]:
train_loader_single = DataLoader(
    dataset_train_single,
    batch_size=64,
    shuffle=True,
    num_workers=cpu_count() if cpu_count() < 8 else 8,
    drop_last=True
)

test_loader_single = DataLoader(
    dataset_test_single,
    batch_size=64,
    shuffle=True,
    num_workers=cpu_count() if cpu_count() < 8 else 8,
    drop_last=True
)

#### Dual-set (MSR2)

In [7]:
train_loader_dual = DataLoader(
    dataset_train_dual,
    batch_size=64,
    shuffle=True,
    num_workers=cpu_count() if cpu_count() < 8 else 8,
    drop_last=True
)

test_loader_dual = DataLoader(
    dataset_test_dual,
    batch_size=64,
    shuffle=True,
    num_workers=cpu_count() if cpu_count() < 8 else 8,
    drop_last=True
)

#### Set-enhanced GNN (SR-GNN)

In [8]:
train_loader_graph = dataset_train_graph
test_loader_graph = dataset_test_graph

## Train

### Initialise the model

#### Single-set (MSR1)

In [9]:
model_single = LightningSRRegressor([64], [8], dims_single)

#### Dual-set (MSR2)

In [10]:
model_dual = LightningDualSRRegressor([64, 64], [8, 8], dims_dual)

#### Set-enhanced GNN (SR-GNN)

In [11]:
model_graph = LightningSRGNNRegressor(
    [128, 128], [64, 64],
    n_hidden_channels=[128, 64],
    n_in_channels=dims_graph[0], 
    n_edge_channels=dims_graph[1],
    n_layers=8
)

### Initialise the trainer and fit

#### Single-set (MSR1)

In [12]:
trainer_single = pl.Trainer(
    max_epochs=250,
)

trainer_single.fit(model_dual, train_dataloaders=train_loader_single)
trainer_single.test(dataloaders=test_loader_single)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/daenu/micromamba/envs/molsetrep/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/home/daenu/micromamba/envs/molsetrep/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti') that has Tensor Cores. To properly utilize them, you should set `torc

Epoch 0:   0%|                                                                                                        | 0/33 [00:00<?, ?it/s]

ValueError: not enough values to unpack (expected 3, got 2)

#### Dual-set (MSR2)

In [13]:
trainer_dual = pl.Trainer(
    max_epochs=250,
)

trainer_dual.fit(model_dual, train_dataloaders=train_loader_dual)
trainer_dual.test(dataloaders=test_loader_dual)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name          | Type              | Params | Mode 
-------------------------------------------------------------
0  | sr_regressor  | DualSRRegressor   | 191 K  | train
1  | train_r2      | R2Score           | 0      | train
2  | train_pearson | PearsonCorrCoef   | 0      | train
3  | train_rmse    | MeanSquaredError  | 0      | train
4  | train_mae     | MeanAbsoluteError | 0      | train
5  | val_r2        | R2Score           | 0      | train
6  | val_pearson   | PearsonCorrCoef   | 0      | train
7  | val_rmse      | MeanSquaredError  | 0      | train
8  | val_mae       | MeanAbsoluteError | 0      | train
9  | test_r2       | R2Score           | 0      | train
10 | test_pearson  | PearsonCorrCoef   | 0      | train
11 | test_rmse     | MeanSquaredError  | 0      | train
12 | test_mae      | MeanAbsoluteError | 0      | t

Epoch 76:   0%|                                                                                             | 0/33 [00:00<?, ?it/s, v_num=10]

Exception ignored in: <function _releaseLock at 0x7f37a295d120>
Traceback (most recent call last):
  File "/home/daenu/micromamba/envs/molsetrep/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 


RuntimeError: DataLoader worker (pid(s) 43375, 43400, 43424, 43448, 43472) exited unexpectedly

#### Set-enhanced GNN (SR-GNN)

In [14]:
trainer_graph = pl.Trainer(
    max_epochs=900,
)

trainer_graph.fit(model_graph, train_dataloaders=train_loader_graph)
trainer_graph.test(dataloaders=test_loader_graph)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name          | Type              | Params | Mode 
-------------------------------------------------------------
0  | gnn_regressor | SRGNNRegressor    | 1.5 M  | train
1  | train_r2      | R2Score           | 0      | train
2  | train_pearson | PearsonCorrCoef   | 0      | train
3  | train_rmse    | MeanSquaredError  | 0      | train
4  | train_mae     | MeanAbsoluteError | 0      | train
5  | val_r2        | R2Score           | 0      | train
6  | val_pearson   | PearsonCorrCoef   | 0      | train
7  | val_rmse      | MeanSquaredError  | 0      | train
8  | val_mae       | MeanAbsoluteError | 0      | train
9  | test_r2       | R2Score           | 0      | train
10 | test_pearson  | PearsonCorrCoef   | 0      | train
11 | test_rmse     | MeanSquaredError  | 0      | train
12 | test_mae      | MeanAbsoluteError | 0      | t

Epoch 46:   0%|                                                                                             | 0/33 [00:00<?, ?it/s, v_num=11]

Exception ignored in: <function _releaseLock at 0x7f37a295d120>
Traceback (most recent call last):
  File "/home/daenu/micromamba/envs/molsetrep/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 


RuntimeError: DataLoader worker (pid(s) 53566, 53590, 53614, 53638, 53662) exited unexpectedly