# Molecular set representation learning - Molecular property prediction

## Imports

In [None]:
from multiprocessing import cpu_count

import torch

import pandas as pd
import lightning.pytorch as pl

from torch.utils.data import DataLoader

from molsetrep.models import (
    LightningSRRegressor,
    LightningDualSRRegressor,
    LightningSRGNNRegressor,
)
from molsetrep.encoders import SingleSetEncoder, DualSetEncoder, GraphEncoder

## Prepare the data

### Load from file

In [2]:
# Read the pre-split ADME MDR1-ER tables (train first, then test).
df_train, df_test = map(
    pd.read_csv,
    (
        "../data/adme/ADME_MDR1_ER_train.csv",
        "../data/adme/ADME_MDR1_ER_test.csv",
    ),
)

### Encode the data

#### Single-set (MSR1)

In [3]:
# One encoder instance is shared by both splits so they are featurised
# identically; labels are requested as float32.
enc_single = SingleSetEncoder()
dataset_train_single, dataset_test_single = (
    enc_single.encode(split["smiles"], split["activity"], torch.float32)
    for split in (df_train, df_test)
)

# Input size handed to the model, read off the first training sample.
# NOTE(review): this uses `.shape[0]` of the sample's set, whereas the
# dual-set cell uses `len(...[0])` of the first set element - confirm that
# both pick the intended axis.
dims_single = [dataset_train_single[0][0].shape[0]]

#### Dual-set (MSR2)

In [4]:
# One encoder instance is shared by both splits so they are featurised
# identically; labels are requested as float32.
enc_dual = DualSetEncoder()
dataset_train_dual, dataset_test_dual = (
    enc_dual.encode(split["smiles"], split["activity"], torch.float32)
    for split in (df_train, df_test)
)

# One entry per set: the length of the first element of set 0 and of set 1,
# both taken from the first training sample.
dims_dual = [len(dataset_train_dual[0][set_idx][0]) for set_idx in (0, 1)]

#### Set-enhanced GNN (SR-GNN)

In [5]:
# One encoder instance is shared by both splits; note that the label dtype is
# passed by keyword here (`label_dtype=`), unlike the set encoders above.
enc_graph = GraphEncoder()
dataset_train_graph, dataset_test_graph = (
    enc_graph.encode(split["smiles"], split["activity"], label_dtype=torch.float32)
    for split in (df_train, df_test)
)

# Node / edge feature counts, read from the first graph of the training split.
dims_graph = [
    dataset_train_graph.dataset[0].num_node_features,
    dataset_train_graph.dataset[0].num_edge_features,
]

100%|██████████| 2113/2113 [00:06<00:00, 312.08it/s]
100%|██████████| 529/529 [00:01<00:00, 295.98it/s]


### Get torch data loaders

#### Single-set (MSR1)

In [None]:
# Training loader: samples are reshuffled every epoch and the trailing partial
# batch is dropped so each optimisation step sees exactly 64 samples.
# The worker count is capped at 8 regardless of how many cores are available.
train_loader_single = DataLoader(
    dataset_train_single,
    batch_size=64,
    shuffle=True,
    num_workers=min(cpu_count(), 8),
    drop_last=True,
)

# Test loader: evaluation has to cover every test sample in a fixed order.
# Shuffling together with `drop_last=True` would silently discard a random
# remainder of the test set on each run, so both are disabled here.
test_loader_single = DataLoader(
    dataset_test_single,
    batch_size=64,
    shuffle=False,
    num_workers=min(cpu_count(), 8),
    drop_last=False,
)

#### Dual-set (MSR2)

In [None]:
# Training loader: samples are reshuffled every epoch and the trailing partial
# batch is dropped so each optimisation step sees exactly 64 samples.
# The worker count is capped at 8 regardless of how many cores are available.
train_loader_dual = DataLoader(
    dataset_train_dual,
    batch_size=64,
    shuffle=True,
    num_workers=min(cpu_count(), 8),
    drop_last=True,
)

# Test loader: evaluation has to cover every test sample in a fixed order.
# Shuffling together with `drop_last=True` would silently discard a random
# remainder of the test set on each run, so both are disabled here.
test_loader_dual = DataLoader(
    dataset_test_dual,
    batch_size=64,
    shuffle=False,
    num_workers=min(cpu_count(), 8),
    drop_last=False,
)

#### Set-enhanced GNN (SR-GNN)

In [6]:
# The graph encoder's outputs are passed to the trainer unchanged, so these
# names are plain aliases kept for symmetry with the set-based sections.
# NOTE(review): presumably `GraphEncoder.encode` already returns a batched
# loader (it exposes `.dataset` above) - confirm against molsetrep.
train_loader_graph, test_loader_graph = dataset_train_graph, dataset_test_graph

## Train

### Initialise the model

#### Single-set (MSR1)

In [None]:
# Single-set regressor; the last argument is the encoding size measured in the
# "Encode the data" section.
# NOTE(review): the meaning of the first two positional lists is defined by
# molsetrep's constructor signature - confirm there before tuning them.
model_single = LightningSRRegressor(
    [64],
    [8],
    dims_single,
)

#### Dual-set (MSR2)

In [None]:
# Dual-set regressor; each list carries one entry per set, and the last
# argument holds the two encoding sizes measured in the "Encode the data"
# section.
# NOTE(review): the meaning of the first two positional lists is defined by
# molsetrep's constructor signature - confirm there before tuning them.
model_dual = LightningDualSRRegressor(
    [64, 64],
    [8, 8],
    dims_dual,
)

#### Set-enhanced GNN (SR-GNN)

In [7]:
# Set-enhanced GNN regressor. The node / edge channel counts come from
# `dims_graph`, measured on the first encoded training graph.
# NOTE(review): the meaning of the two leading positional lists is defined by
# molsetrep's constructor signature - confirm there before tuning them.
model_graph = LightningSRGNNRegressor(
    [128, 128],
    [64, 64],
    n_in_channels=dims_graph[0],
    n_edge_channels=dims_graph[1],
    n_hidden_channels=[128, 64],
    n_layers=8,
)

### Initialise the trainer and fit

#### Single-set (MSR1)

In [None]:
# Train for a fixed number of epochs with default Trainer settings.
trainer_single = pl.Trainer(
    max_epochs=250,
)

# Fit the single-set model on the single-set batches. (This cell previously
# passed `model_dual`, which left `model_single` untrained and fed single-set
# batches to the dual-set model.)
trainer_single.fit(model_single, train_dataloaders=train_loader_single)

# Evaluate the model that was just fitted; as the last expression of the cell,
# the returned metrics are displayed as its output.
trainer_single.test(dataloaders=test_loader_single)

#### Dual-set (MSR2)

In [None]:
# Train for a fixed number of epochs with default Trainer settings.
trainer_dual = pl.Trainer(max_epochs=250)

# Fit the dual-set model on the dual-set batches.
trainer_dual.fit(
    model_dual,
    train_dataloaders=train_loader_dual,
)

# Evaluate the model that was just fitted; as the last expression of the cell,
# the returned metrics are displayed as its output.
trainer_dual.test(dataloaders=test_loader_dual)

#### Set-enhanced GNN (SR-GNN)

In [8]:
# The GNN variant is trained for 900 epochs, versus 250 for the set-only
# models above.
trainer_graph = pl.Trainer(max_epochs=900)

# Fit the set-enhanced GNN on the graph batches.
trainer_graph.fit(
    model_graph,
    train_dataloaders=train_loader_graph,
)

# Evaluate the model that was just fitted; as the last expression of the cell,
# the returned metrics are displayed as its output.
trainer_graph.test(dataloaders=test_loader_graph)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name          | Type              | Params
-----------------------------------------------------
0  | gnn_regressor | SRGNNRegressor    | 1.5 M 
1  | train_r2      | R2Score           | 0     
2  | train_pearson | PearsonCorrCoef   | 0     
3  | train_rmse    | MeanSquaredError  | 0     
4  | train_mae     | MeanAbsoluteError | 0     
5  | val_r2        | R2Score           | 0     
6  | val_pearson   | PearsonCo

Epoch 899: 100%|██████████| 33/33 [00:00<00:00, 68.16it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=900` reached.


Epoch 899: 100%|██████████| 33/33 [00:00<00:00, 63.03it/s, v_num=0]


  rank_zero_warn(
Restoring states from the checkpoint path at /home/daenu/Code/molsetrep/example/lightning_logs/version_0/checkpoints/epoch=899-step=29700.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/daenu/Code/molsetrep/example/lightning_logs/version_0/checkpoints/epoch=899-step=29700.ckpt


Testing DataLoader 0: 100%|██████████| 8/8 [00:00<00:00, 166.48it/s]


[{'test/loss': 0.2152581512928009,
  'test/r2': 0.5697703957557678,
  'test/pearson': 0.7639975547790527,
  'test/rmse': 0.4639592170715332,
  'test/mae': 0.34112948179244995}]