In [1]:
import torch

from torch.utils.data import DataLoader
from torcheval.metrics import BinaryAccuracy, BinaryAUROC
from torchmetrics.regression import R2Score, MeanSquaredError, MeanAbsoluteError

from molsetrep.utils.torch_trainer import TorchTrainer
from molsetrep.utils.datasets import molnet_loader
from molsetrep.utils.converters import molnet_to_pyg
from molsetrep.utils.root_mean_squared_error import RootMeanSquaredError
from molsetrep.utils.imbalanced_sampler import ImbalancedSampler
from molsetrep.models import SetRepClassifier, SetRepRegressor
from molsetrep.encoders import SECMQNFPEncoder, SECFPEncoder, Mol2VecEncoder, Mol2SetEncoder

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


# SECMQNFP

## Classification

In [None]:
# Load the "bace_classification" dataset as train / validation / test splits.
train, valid, test = molnet_loader("bace_classification")
enc = SECMQNFPEncoder()

# Encode the splits in order (train, valid, test). For every row of `.y` only
# the first entry is used as the label, requested as torch.long.
train_dataset, valid_dataset, test_dataset = (
    enc.encode(split.ids, [labels[0] for labels in split.y], label_dtype=torch.long)
    for split in (train, valid, test)
)

# Training batches are drawn through ImbalancedSampler; the two evaluation
# loaders use DataLoader's defaults apart from the batch size.
train_loader = DataLoader(
    train_dataset,
    sampler=ImbalancedSampler(train_dataset),
    batch_size=64,
)
valid_loader, test_loader = (
    DataLoader(dataset, batch_size=64) for dataset in (valid_dataset, test_dataset)
)

In [None]:
# Seed PyTorch's global RNG so that weight initialisation (and any sampling
# that draws from the global generator during training) repeats across
# "Restart Kernel -> Run All" runs.
torch.manual_seed(42)

d = 44
# NOTE(review): positional arguments kept exactly as before; presumably `d` is
# the per-element feature width and the trailing 2 the number of classes --
# confirm against SetRepClassifier's signature.
model = SetRepClassifier(100, 16, d, 2)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# The scheduler is constructed but not passed to the trainer below (the
# keyword is left commented out there).
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
# NLLLoss expects log-probabilities as input; assumes the classifier's output
# is already log-softmaxed -- TODO confirm.
criterion = torch.nn.NLLLoss()

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    200,  # presumably the number of epochs -- confirm against TorchTrainer
    # Three independent metric lists (presumably train / valid / test), each
    # with its own metric instances so state is not shared between them.
    [BinaryAccuracy(), BinaryAUROC()],
    [BinaryAccuracy(), BinaryAUROC()],
    [BinaryAccuracy(), BinaryAUROC()],
    # scheduler=scheduler,
    # presumably an index into the metric list (1 -> BinaryAUROC), where a
    # higher value is better -- confirm against TorchTrainer.
    monitor_metric=1,
    monitor_lower_is_better=False
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)

## Regression

In [None]:
# Load the "lipo" dataset as train / validation / test splits.
train, valid, test = molnet_loader("lipo")
enc = SECMQNFPEncoder()

# For every row of `.y` only the first entry is used as the regression target,
# requested as torch.float.
train_dataset = enc.encode(train.ids, [y[0] for y in train.y], label_dtype=torch.float)
valid_dataset = enc.encode(valid.ids, [y[0] for y in valid.y], label_dtype=torch.float)
test_dataset = enc.encode(test.ids, [y[0] for y in test.y], label_dtype=torch.float)

# Reshuffle the training set every epoch. Without a sampler or shuffle=True the
# loader walks the dataset in storage order, so every epoch would see exactly
# the same batches in the same sequence. Evaluation loaders stay sequential.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

In [None]:
# Seed PyTorch's global RNG so that weight initialisation (and any sampling
# that draws from the global generator during training) repeats across
# "Restart Kernel -> Run All" runs.
torch.manual_seed(42)

d = 44
# NOTE(review): positional arguments kept exactly as before; presumably `d` is
# the per-element feature width -- confirm against SetRepRegressor's signature.
model = SetRepRegressor(8, 32, d)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# ExponentialLR multiplies the learning rate by `gamma` on each scheduler step;
# how often the trainer steps it is decided inside TorchTrainer.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
criterion = torch.nn.MSELoss()

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    200,  # presumably the number of epochs -- confirm against TorchTrainer
    # Three independent metric lists (presumably train / valid / test), each
    # with its own metric instances so state is not shared between them.
    [R2Score(), RootMeanSquaredError()],
    [R2Score(), RootMeanSquaredError()],
    [R2Score(), RootMeanSquaredError()],
    scheduler=scheduler,
    # presumably an index into the metric list (1 -> RootMeanSquaredError),
    # where a lower value is better -- confirm against TorchTrainer.
    monitor_metric=1,
    monitor_lower_is_better=True
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)

# SECFP

## Classification

In [2]:
# Load the "bace_classification" dataset as train / validation / test splits.
train, valid, test = molnet_loader("bace_classification")

# The graph embedder is fitted on the training identifiers only and then
# reused unchanged for all three splits below.
graph_embedder = SECFPEncoder.fit(train.ids)

enc = SECFPEncoder()

# Encode the splits in order (train, valid, test). For every row of `.y` only
# the first entry is used as the label, requested as torch.long.
train_dataset, valid_dataset, test_dataset = (
    enc.encode(
        split.ids,
        [labels[0] for labels in split.y],
        label_dtype=torch.long,
        graph_embedder=graph_embedder,
        pretrained_graph_embedder=True,
    )
    for split in (train, valid, test)
)

# Training batches are drawn through ImbalancedSampler; the two evaluation
# loaders use DataLoader's defaults apart from the batch size.
train_loader = DataLoader(
    train_dataset,
    sampler=ImbalancedSampler(train_dataset),
    batch_size=64,
)
valid_loader, test_loader = (
    DataLoader(dataset, batch_size=64) for dataset in (valid_dataset, test_dataset)
)

                                                                                        

In [None]:
# Seed PyTorch's global RNG so that weight initialisation (and any sampling
# that draws from the global generator during training) repeats across
# "Restart Kernel -> Run All" runs.
torch.manual_seed(42)

d = 500
# NOTE(review): positional arguments kept exactly as before; presumably `d` is
# the per-element feature width and the trailing 2 the number of classes --
# confirm against SetRepClassifier's signature.
model = SetRepClassifier(2, 16, d, 2)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# The scheduler is constructed but not passed to the trainer below (the
# keyword is left commented out there).
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
# NLLLoss expects log-probabilities as input; assumes the classifier's output
# is already log-softmaxed -- TODO confirm.
criterion = torch.nn.NLLLoss()

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    200,  # presumably the number of epochs -- confirm against TorchTrainer
    # Three independent metric lists (presumably train / valid / test), each
    # with its own metric instances so state is not shared between them.
    [BinaryAccuracy(), BinaryAUROC()],
    [BinaryAccuracy(), BinaryAUROC()],
    [BinaryAccuracy(), BinaryAUROC()],
    # scheduler=scheduler,
    # presumably an index into the metric list (1 -> BinaryAUROC), where a
    # higher value is better -- confirm against TorchTrainer.
    monitor_metric=1,
    monitor_lower_is_better=False
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)

## Regression

*TODO: no SECFP regression experiment has been added to this notebook yet.*

# Mol2Vec

## Classification

In [5]:
# Load the "bace_classification" dataset as train / validation / test splits.
train, valid, test = molnet_loader("bace_classification")
enc = Mol2SetEncoder()

# Encode the splits in order (train, valid, test). For every row of `.y` only
# the first entry is used as the label, requested as torch.long.
train_dataset, valid_dataset, test_dataset = (
    enc.encode(split.ids, [labels[0] for labels in split.y], label_dtype=torch.long)
    for split in (train, valid, test)
)

# Training batches are drawn through ImbalancedSampler; the two evaluation
# loaders use DataLoader's defaults apart from the batch size.
train_loader = DataLoader(
    train_dataset,
    sampler=ImbalancedSampler(train_dataset),
    batch_size=64,
)
valid_loader, test_loader = (
    DataLoader(dataset, batch_size=64) for dataset in (valid_dataset, test_dataset)
)

In [None]:
# Seed PyTorch's global RNG so that weight initialisation (and any sampling
# that draws from the global generator during training) repeats across
# "Restart Kernel -> Run All" runs.
torch.manual_seed(42)

d = 300
# NOTE(review): positional arguments kept exactly as before; presumably `d` is
# the per-element feature width and the trailing 2 the number of classes --
# confirm against SetRepClassifier's signature.
model = SetRepClassifier(100, 16, d, 2)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# The scheduler is constructed but not passed to the trainer below (the
# keyword is left commented out there).
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
# NLLLoss expects log-probabilities as input; assumes the classifier's output
# is already log-softmaxed -- TODO confirm.
criterion = torch.nn.NLLLoss()

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    60,  # presumably the number of epochs -- confirm against TorchTrainer
    # Three independent metric lists (presumably train / valid / test), each
    # with its own metric instances so state is not shared between them.
    [BinaryAccuracy(), BinaryAUROC()],
    [BinaryAccuracy(), BinaryAUROC()],
    [BinaryAccuracy(), BinaryAUROC()],
    # scheduler=scheduler,
    # presumably an index into the metric list (1 -> BinaryAUROC), where a
    # higher value is better -- confirm against TorchTrainer.
    monitor_metric=1,
    monitor_lower_is_better=False
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)

## Regression

In [26]:
# Load the "delaney" dataset with the "random" splitter as train / validation /
# test splits.
train, valid, test = molnet_loader("delaney", splitter="random")
enc = Mol2SetEncoder()

# For every row of `.y` only the first entry is used as the regression target,
# requested as torch.float.
train_dataset = enc.encode(train.ids, [y[0] for y in train.y], label_dtype=torch.float)
valid_dataset = enc.encode(valid.ids, [y[0] for y in valid.y], label_dtype=torch.float)
test_dataset = enc.encode(test.ids, [y[0] for y in test.y], label_dtype=torch.float)

# Reshuffle the training set every epoch. Without a sampler or shuffle=True the
# loader walks the dataset in storage order, so every epoch would see exactly
# the same batches in the same sequence. Evaluation loaders stay sequential.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

In [27]:
# Seed PyTorch's global RNG so that weight initialisation (and any sampling
# that draws from the global generator during training) repeats across
# "Restart Kernel -> Run All" runs.
torch.manual_seed(42)

d = 300
# NOTE(review): positional arguments kept exactly as before; presumably `d` is
# the per-element feature width -- confirm against SetRepRegressor's signature.
model = SetRepRegressor(100, 16, d)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# ExponentialLR multiplies the learning rate by `gamma` on each scheduler step;
# how often the trainer steps it is decided inside TorchTrainer.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
criterion = torch.nn.MSELoss()

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    200,  # presumably the number of epochs -- confirm against TorchTrainer
    # Three independent metric lists (presumably train / valid / test), each
    # with its own metric instances so state is not shared between them.
    # MeanSquaredError(squared=False) reports the root of the mean squared error.
    [R2Score(), MeanSquaredError(squared=False), MeanAbsoluteError()],
    [R2Score(), MeanSquaredError(squared=False), MeanAbsoluteError()],
    [R2Score(), MeanSquaredError(squared=False), MeanAbsoluteError()],
    scheduler=scheduler,
    # presumably an index into the metric list (1 -> the RMSE metric), where a
    # lower value is better -- confirm against TorchTrainer.
    monitor_metric=1,
    monitor_lower_is_better=True
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)

*  Epoch 1: Train loss: 0.675 (R2Score: 0.302, MeanSquaredError: 0.836, MeanAbsoluteError: 0.662)  Valid loss: 0.881 (R2Score: -0.012, MeanSquaredError: 0.93, MeanAbsoluteError: 0.807)
*  Epoch 2: Train loss: 0.341 (R2Score: 0.644, MeanSquaredError: 0.597, MeanAbsoluteError: 0.479)  Valid loss: 0.288 (R2Score: 0.663, MeanSquaredError: 0.537, MeanAbsoluteError: 0.424)
*  Epoch 3: Train loss: 0.27 (R2Score: 0.719, MeanSquaredError: 0.53, MeanAbsoluteError: 0.419)  Valid loss: 0.241 (R2Score: 0.718, MeanSquaredError: 0.491, MeanAbsoluteError: 0.386)
|  Epoch 4: Train loss: 0.233 (R2Score: 0.759, MeanSquaredError: 0.491, MeanAbsoluteError: 0.385)  Valid loss: 0.249 (R2Score: 0.706, MeanSquaredError: 0.501, MeanAbsoluteError: 0.41)
*  Epoch 5: Train loss: 0.195 (R2Score: 0.799, MeanSquaredError: 0.449, MeanAbsoluteError: 0.352)  Valid loss: 0.179 (R2Score: 0.792, MeanSquaredError: 0.421, MeanAbsoluteError: 0.329)
*  Epoch 6: Train loss: 0.164 (R2Score: 0.831, MeanSquaredError: 0.411, MeanAb