In [1]:
%%capture
# Install the third-party packages imported below (PyTorch Geometric, RDKit, Optuna).
# The %%capture cell magic above swallows the installers' console output.
# NOTE(review): versions are unpinned, so results may drift between environments.
!pip install torch_geometric
!pip install rdkit
!pip install --quiet optuna

In [2]:
import torch
import optuna
from optuna.trial import TrialState
import pickle

import numpy as np
import pandas as pd
import random


from math import sqrt

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split

from torch_geometric.data import Data
from torch_geometric.nn import AttentiveFP

import os
from torch_geometric.utils import from_smiles
from torch_geometric.loader import DataLoader

In [3]:
def seed_set(seed=50):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global generator, and PyTorch (CPU and every CUDA device).
    """
    random.seed(seed)
    # Exported for child processes; Python reads PYTHONHASHSEED at interpreter
    # start-up, so this does not alter string hashing in the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs rather than only the current device; this call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

seed_set()

In [4]:
df_final = pd.read_csv('Lipophilicity_final.csv')

# Turn every (SMILES, measured value) row into one molecular graph:
# node features are cast to float and the regression target is stored
# on the graph as a (1, 1) tensor.
graph_list = []
for smiles_str, target in zip(df_final['smiles'], df_final['exp']):
    mol_graph = from_smiles(smiles_str)
    mol_graph.x = mol_graph.x.float()
    mol_graph.y = torch.tensor(target, dtype=torch.float).view(1, -1)
    graph_list.append(mol_graph)


In [5]:
# Sizes for an 80/20 train/test partition of the graph dataset.
train_ratio = 0.80
dataset_size = len(graph_list)
# Whatever the truncated training share leaves over goes to the test split.
test_size = dataset_size - int(train_ratio * dataset_size)
train_size = dataset_size - test_size

In [6]:
def objective(trial):
    """Optuna objective: train an AttentiveFP regressor and score it by hold-out RMSE.

    For the hyperparameters suggested by ``trial`` the model is trained for a
    fixed number of epochs on a seeded 80/20 split of ``graph_list``. After each
    epoch the hold-out RMSE is reported to Optuna so the pruner can stop
    unpromising trials early.

    Args:
        trial: The ``optuna.trial.Trial`` supplying hyperparameter suggestions.

    Returns:
        The hold-out RMSE measured after the final epoch.

    Raises:
        optuna.TrialPruned: If the pruner decides to stop the trial early.
    """
    # Search space (parameter names and distributions are kept stable so that
    # trials remain comparable with an already-stored study).
    # NOTE(review): lr and weight_decay span several orders of magnitude but are
    # sampled on a linear scale; consider log=True when starting a fresh study.
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=False)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=False)
    dropout = trial.suggest_float('dropout', 0.0, 0.5, step = 0.1)
    num_layers = trial.suggest_int('num_layers', 2, 6)
    hidden_channels = trial.suggest_int('hidden_channels', 32, 192, step=32)
    batch_size = trial.suggest_int('batch_size', 16, 128, step=16)

    num_epochs = 75

    # The fixed generator seed gives every trial the same train/test partition.
    # NOTE(review): the "test" split is also what drives pruning and model
    # selection, so the best value reported by the study is optimistically
    # biased; a separate validation split would avoid that.
    generator1 = torch.Generator().manual_seed(42)
    train_dataset, test_dataset = random_split(graph_list, [train_size, test_size], generator=generator1)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # in_channels / edge_dim presumably match the node and edge feature widths
    # produced by from_smiles -- verify if the featurisation changes.
    model = AttentiveFP(in_channels=9, hidden_channels=hidden_channels, out_channels=1,
                        edge_dim=3, num_layers=num_layers, num_timesteps=2,
                        dropout=dropout).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=weight_decay)

    def train():
        """Run one optimisation epoch and return the training RMSE."""
        # test() switches the model to eval mode, so re-enable dropout here.
        model.train()
        total_loss = total_examples = 0
        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.edge_attr, data.batch)
            loss = F.mse_loss(out, data.y)
            loss.backward()
            optimizer.step()
            # Weight each batch's mean loss by its graph count so the epoch
            # average is per-example even when the last batch is smaller.
            total_loss += float(loss) * data.num_graphs
            total_examples += data.num_graphs
        return sqrt(total_loss / total_examples)

    @torch.no_grad()
    def test(loader):
        """Return the RMSE over every example served by ``loader``."""
        # Disable dropout so the score reflects the deterministic model rather
        # than a randomly thinned one.
        model.eval()
        mse = []
        for data in loader:
            data = data.to(device)
            out = model(data.x, data.edge_index, data.edge_attr, data.batch)
            l = F.mse_loss(out, data.y, reduction='none').cpu()
            mse.append(l)
        rmse = float(torch.cat(mse, dim=0).mean().sqrt())
        return rmse

    for epoch in range(num_epochs):
        train()
        test_rmse = test(test_loader)

        trial.report(test_rmse, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.TrialPruned()

    return test_rmse

In [None]:
# Create the study, or reopen it when it already exists in the SQLite file,
# and run only as many trials as are still missing from the target budget.
# Without load_if_exists=True a re-run of this cell fails because the study
# name is already present in the storage.
# To start a completely fresh search, delete htune_afp.db first.
N_TRIALS = 100

study = optuna.create_study(direction='minimize',
                            study_name = 'hyperparameter-tune-afp',
                            storage = 'sqlite:///htune_afp.db',
                            load_if_exists=True,
                            )

# Count trials that already produced a result (completed or pruned).
finished_trials = study.get_trials(deepcopy=False,
                                   states=[TrialState.COMPLETE, TrialState.PRUNED])
remaining_trials = max(0, N_TRIALS - len(finished_trials))
if remaining_trials:
    study.optimize(objective, n_trials=remaining_trials)


In [18]:
# Reopen the persisted study from its SQLite storage so the results can be
# inspected without re-running the optimisation.
study = optuna.load_study(
    storage="sqlite:///htune_afp.db",
    study_name='hyperparameter-tune-afp',
)

In [19]:
# Headline result: the parameter set of the best-scoring trial.
best_params = study.best_params
print("Best hyperparameters:", best_params)

# Split the trial history by outcome to show how aggressively the pruner acted.
pruned_trials = study.get_trials(states=[TrialState.PRUNED], deepcopy=False)
complete_trials = study.get_trials(states=[TrialState.COMPLETE], deepcopy=False)

print("Study statistics: ")
for label, count in (
    ("  Number of finished trials: ", len(study.trials)),
    ("  Number of pruned trials: ", len(pruned_trials)),
    ("  Number of complete trials: ", len(complete_trials)),
):
    print(label, count)

# Details of the winning trial: its objective value and each parameter.
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for param_name, param_value in trial.params.items():
    print("    {}: {}".format(param_name, param_value))

Best hyperparameters: {'lr': 0.001867322759986135, 'weight_decay': 0.0003126662000605776, 'dropout': 0.0, 'num_layers': 6, 'hidden_channels': 64, 'batch_size': 96}
Study statistics: 
  Number of finished trials:  100
  Number of pruned trials:  75
  Number of complete trials:  25
Best trial:
  Value:  0.6075412631034851
  Params: 
    lr: 0.001867322759986135
    weight_decay: 0.0003126662000605776
    dropout: 0.0
    num_layers: 6
    hidden_channels: 64
    batch_size: 96
