In [1]:
import torch
import numpy as np

from torch.utils.data import DataLoader
from torcheval.metrics import BinaryAccuracy, BinaryAUROC
from torchmetrics.regression import R2Score, MeanSquaredError, MeanAbsoluteError

from molsetrep.utils.torch_trainer import TorchTrainer
from molsetrep.utils.datasets import molnet_loader
from molsetrep.utils.converters import molnet_to_pyg
from molsetrep.utils.root_mean_squared_error import RootMeanSquaredError
from molsetrep.utils.imbalanced_sampler import ImbalancedSampler
from molsetrep.models import SetRepClassifier, SetRepRegressor
from molsetrep.encoders import SECMQNFPEncoder, SECFPEncoder, Mol2VecEncoder, Mol2SetEncoder

from sklearn.preprocessing import StandardScaler


Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


# SECMQNFP

## Classification

In [None]:
# Load BBBP with the "scaffold" splitter and build the SECMQNFP datasets/loaders.
train, valid, test = molnet_loader("bbbp", splitter="scaffold", reload=False)
enc = SECMQNFPEncoder()

# One entry per split, always in (train, valid, test) order. Only the first
# element of every label row is kept.
splits = (train, valid, test)
ids_per_split = tuple(split.ids for split in splits)
labels_per_split = tuple([row[0] for row in split.y] for split in splits)

train_dataset, valid_dataset, test_dataset = enc.encode(
    ids_per_split,
    labels_per_split,
    label_dtype=torch.long,
    standardize=False,
)

# All three loaders share the batch size; only the training loader draws its
# samples through ImbalancedSampler.
loader_kwargs = {"batch_size": 64}
train_loader = DataLoader(
    train_dataset,
    sampler=ImbalancedSampler(train_dataset),
    **loader_kwargs,
)
valid_loader = DataLoader(valid_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

In [None]:
# Repeat the SECMQNFP classification experiment five times and keep every
# test-set result so the runs can be compared or aggregated afterwards.
d = 42

# Test results of each run, in run order.
results = []
for _ in range(5):
    # Everything is rebuilt inside the loop so no state carries over between runs.
    model = SetRepClassifier(8, 2, d, 2)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # Constructed but not passed to the trainer: the `scheduler=` argument
    # below is commented out.
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    criterion = torch.nn.NLLLoss()

    trainer = TorchTrainer(
        model,
        optimizer,
        criterion,
        100,
        # Three separate metric lists with fresh instances each.
        # NOTE(review): presumably train / valid / test metrics - confirm
        # against TorchTrainer's signature.
        [BinaryAccuracy(), BinaryAUROC()],
        [BinaryAccuracy(), BinaryAUROC()],
        [BinaryAccuracy(), BinaryAUROC()],
        # scheduler=scheduler,
        # NOTE(review): index 1 presumably selects BinaryAUROC from the lists
        # above (higher is better) - confirm.
        monitor_metric=1,
        monitor_lower_is_better=False,
        silent=True
    )

    trainer.train(train_loader, valid_loader)

    # Store the result before printing it; previously `results` stayed empty
    # and the per-run numbers were only available in the printed output.
    run_result = trainer.test(test_loader)
    results.append(run_result)
    print(run_result)

# [{'best_epoch': 60, 'loss': 1.8466728925704956, 'BinaryAccuracy': 0.6078431606292725, 'BinaryAUROC': 0.6049715772232392}]
# [{'best_epoch': 60, 'loss': 1.361539602279663, 'BinaryAccuracy': 0.6078431606292725, 'BinaryAUROC': 0.6015993833702669}]
# [{'best_epoch': 60, 'loss': 1.2250711917877197, 'BinaryAccuracy': 0.5833333134651184, 'BinaryAUROC': 0.5791983813469506}]
# [{'best_epoch': 60, 'loss': 1.1215720176696777, 'BinaryAccuracy': 0.5735294222831726, 'BinaryAUROC': 0.5664803931014548}]
# [{'best_epoch': 55, 'loss': 1.2314069271087646, 'BinaryAccuracy': 0.593137264251709, 'BinaryAUROC': 0.5914346276134502}]

## Regression

In [4]:
# Load the "lipo" regression data set and build the SECMQNFP datasets/loaders.
train, valid, test = molnet_loader("lipo")
enc = SECMQNFPEncoder()

# Only the first element of every label row is used as the regression target.
train_dataset, valid_dataset, test_dataset = enc.encode(
    (train.ids, valid.ids, test.ids), 
    ([y[0] for y in train.y], [y[0] for y in valid.y], [y[0] for y in test.y]), 
    label_dtype=torch.float, standardize=False
)

# shuffle=True: DataLoader iterates sequentially by default, which would feed
# the optimizer the same batches in the same order every epoch. Validation and
# test loaders keep their fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

In [None]:
# Train a SetRepRegressor on the loaders built in the previous cell and
# display its test-set result.
d = 42
model = SetRepRegressor(8, 32, d)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

# Three independent metric lists, each with its own fresh metric objects.
# NOTE(review): presumably consumed as train / valid / test metrics - confirm
# against TorchTrainer's signature.
metric_sets = [[R2Score(), RootMeanSquaredError()] for _ in range(3)]

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    200,
    *metric_sets,
    scheduler=scheduler,
    # NOTE(review): index 1 presumably selects RootMeanSquaredError (lower is
    # better) - confirm.
    monitor_metric=1,
    monitor_lower_is_better=True,
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)

# SECFP

## Classification

In [4]:
# Load "bace_classification" and encode every split with SECFP, reusing one
# graph embedder obtained from the training ids.
train, valid, test = molnet_loader("bace_classification")
graph_embedder = SECFPEncoder.fit(train.ids)

enc = SECFPEncoder()

# Keyword arguments shared by the three encode calls below.
shared_encode_kwargs = dict(
    label_dtype=torch.long,
    graph_embedder=graph_embedder,
    pretrained_graph_embedder=True,
)

# Only the first element of every label row is used.
train_dataset = enc.encode(train.ids, [row[0] for row in train.y], **shared_encode_kwargs)
valid_dataset = enc.encode(valid.ids, [row[0] for row in valid.y], **shared_encode_kwargs)
test_dataset = enc.encode(test.ids, [row[0] for row in test.y], **shared_encode_kwargs)

# Only the training loader draws its samples through ImbalancedSampler.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    sampler=ImbalancedSampler(train_dataset),
)
valid_loader = DataLoader(valid_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

                                                                                        

In [None]:
# Train a SetRepClassifier on the SECFP loaders and display its test-set result.
d = 500
model = SetRepClassifier(2, 8, d, 2)

criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# Constructed but not passed to the trainer: the `scheduler=` argument below
# is commented out.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

# Three independent metric lists, each with its own fresh metric objects.
# NOTE(review): presumably consumed as train / valid / test metrics - confirm
# against TorchTrainer's signature.
metric_sets = [[BinaryAccuracy(), BinaryAUROC()] for _ in range(3)]

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    60,
    *metric_sets,
    # scheduler=scheduler,
    # NOTE(review): index 1 presumably selects BinaryAUROC (higher is
    # better) - confirm.
    monitor_metric=1,
    monitor_lower_is_better=False,
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)

## Regression

# Mol2Vec

## Classification

In [2]:
# Load BBBP and encode every split with Mol2SetEncoder.
train, valid, test = molnet_loader("bbbp")
enc = Mol2SetEncoder()


def _encode_split(split):
    """Encode one split with `enc`, using the first element of each label row."""
    first_labels = [row[0] for row in split.y]
    return enc.encode(split.ids, first_labels, label_dtype=torch.long)


# Splits are encoded in train, valid, test order.
train_dataset, valid_dataset, test_dataset = (
    _encode_split(split) for split in (train, valid, test)
)

# All three loaders share the batch size; only the training loader draws its
# samples through ImbalancedSampler.
loader_kwargs = {"batch_size": 128}
train_loader = DataLoader(
    train_dataset,
    sampler=ImbalancedSampler(train_dataset),
    **loader_kwargs,
)
valid_loader = DataLoader(valid_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

In [4]:
# Train a SetRepClassifier on the Mol2Set loaders and display its test-set result.
d = 300
model = SetRepClassifier(8, 16, d, 2)

criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Constructed but not passed to the trainer: the `scheduler=` argument below
# is commented out.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

# Three independent metric lists, each with its own fresh metric objects.
# NOTE(review): presumably consumed as train / valid / test metrics - confirm
# against TorchTrainer's signature.
metric_sets = [[BinaryAccuracy(), BinaryAUROC()] for _ in range(3)]

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    60,
    *metric_sets,
    # scheduler=scheduler,
    # NOTE(review): index 1 presumably selects BinaryAUROC (higher is
    # better) - confirm.
    monitor_metric=1,
    monitor_lower_is_better=False,
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)

*  Epoch 1: Train loss: 0.706 (BinaryAccuracy: 0.494, BinaryAUROC: 0.5)  Valid loss: 0.685 (BinaryAccuracy: 0.451, BinaryAUROC: 0.5)
|  Epoch 2: Train loss: 0.711 (BinaryAccuracy: 0.479, BinaryAUROC: 0.5)  Valid loss: 0.685 (BinaryAccuracy: 0.451, BinaryAUROC: 0.5)
|  Epoch 3: Train loss: 0.706 (BinaryAccuracy: 0.491, BinaryAUROC: 0.5)  Valid loss: 0.685 (BinaryAccuracy: 0.451, BinaryAUROC: 0.5)
|  Epoch 4: Train loss: 0.704 (BinaryAccuracy: 0.498, BinaryAUROC: 0.5)  Valid loss: 0.685 (BinaryAccuracy: 0.451, BinaryAUROC: 0.5)
|  Epoch 5: Train loss: 0.698 (BinaryAccuracy: 0.516, BinaryAUROC: 0.5)  Valid loss: 0.685 (BinaryAccuracy: 0.451, BinaryAUROC: 0.5)
|  Epoch 6: Train loss: 0.705 (BinaryAccuracy: 0.494, BinaryAUROC: 0.5)  Valid loss: 0.685 (BinaryAccuracy: 0.451, BinaryAUROC: 0.5)
|  Epoch 7: Train loss: 0.705 (BinaryAccuracy: 0.49, BinaryAUROC: 0.5)  Valid loss: 0.685 (BinaryAccuracy: 0.451, BinaryAUROC: 0.5)
|  Epoch 8: Train loss: 0.697 (BinaryAccuracy: 0.519, BinaryAUROC: 0.5

[]

## Regression

In [26]:
# Load "delaney" with the "random" splitter and encode every split with
# Mol2SetEncoder for regression.
train, valid, test = molnet_loader("delaney", splitter="random")
enc = Mol2SetEncoder()

# Only the first element of every label row is used as the regression target.
train_dataset = enc.encode(train.ids, [y[0] for y in train.y], label_dtype=torch.float)
valid_dataset = enc.encode(valid.ids, [y[0] for y in valid.y], label_dtype=torch.float)
test_dataset = enc.encode(test.ids, [y[0] for y in test.y], label_dtype=torch.float)

# shuffle=True: DataLoader iterates sequentially by default, which would feed
# the optimizer the same batches in the same order every epoch. Validation and
# test loaders keep their fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

In [None]:
# Train a SetRepRegressor on the Mol2Set loaders and display its test-set result.
d = 300
model = SetRepRegressor(100, 16, d)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

# Three independent metric lists, each with its own fresh metric objects.
# MeanSquaredError(squared=False) is the root-mean-squared-error variant.
# NOTE(review): the lists are presumably consumed as train / valid / test
# metrics - confirm against TorchTrainer's signature.
metric_sets = [
    [R2Score(), MeanSquaredError(squared=False), MeanAbsoluteError()]
    for _ in range(3)
]

trainer = TorchTrainer(
    model,
    optimizer,
    criterion,
    200,
    *metric_sets,
    scheduler=scheduler,
    # NOTE(review): index 1 presumably selects the RMSE metric (lower is
    # better) - confirm.
    monitor_metric=1,
    monitor_lower_is_better=True,
)

trainer.train(train_loader, valid_loader)
trainer.test(test_loader)