In [1]:
import pickle
# from model import PositionalEncoding, SELFIES_Transformer
from model import SelfiesTransformer

In [2]:
from custom_selfies_dataset import CustomSELFIESDataset

In [3]:
from torch.utils.data import DataLoader

In [4]:
import torch
import os

In [5]:
import torch.nn as nn

In [6]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [7]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [8]:
import numpy as np

## Train model

In [9]:
import copy
import time

In [10]:
from sklearn.model_selection import train_test_split

## BBBP

In [23]:
with open("bbbp_data.pickle", "rb") as f:
    X = pickle.load(f)

In [24]:
with open("bbbp_label.pickle", "rb") as f:
    y = pickle.load(f)

In [25]:
print(len(X), len(y))

1996 1996


In [26]:
print(y.count(1)) # number of positive samples
print(y.count(0)) # number of negative samples

1553
443


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
train_dataset = CustomSELFIESDataset(X_train, y_train)
test_dataset = CustomSELFIESDataset(X_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=True)

In [30]:
train_dataset.__getitem__(0)

(tensor([36, 15, 29, 65, 36, 36, 29, 64, 18, 36, 36, 55, 29, 64, 13, 36, 28, 15,
         36, 36, 28, 14, 36, 15, 36, 15, 36, 15, 64, 13, 20, 36, 36, 36, 64,  4,
         36, 61, 36, 15, 36, 15, 65, 64, 14, 79, 79, 79, 79, 79, 79, 79, 79, 79,
         79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
         79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79,
         79, 79, 79, 79, 79, 79, 79, 79, 79, 79]),
 tensor(1.))

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
with open("symbol2idx_bbbp.pickle", "rb") as f:
    symbol2idx_bbbp = pickle.load(f)

In [20]:
len(X_train[0])

100

In [21]:
config = {
    "vocab_dict": symbol2idx_bbbp,
    "max_length": len(X_train[0]),
    "dim": 32,
    "n_classes": 1, # binary classification
    "heads": 2,
    "mlp_dim": 16,
    "depth": 2,
    "dim_head": 32,
    "dropout": 0.1,
    "emb_dropout": 0.1
}

In [22]:
model = SelfiesTransformer(**config)

In [23]:
# out = model(X_train[:10])

In [24]:
criterion = nn.BCEWithLogitsLoss().to(device)

In [25]:
lr = 0.0001
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [26]:
softmax = nn.Softmax(dim=-1)

In [27]:
model.to(device)
for epoch in range(30):
    train_loss = []
    model.train()
    train_correct = 0
    # temp = 0
    for i, data in enumerate(train_dataloader, 0):
        inputs, labels = data
        # temp += len(inputs)
        inputs = inputs.to(device)
        labels = labels.unsqueeze(-1).to(device)
        #print(inputs.get_device())
        #print(labels.get_device())
        optimizer.zero_grad()
        outputs = model(inputs)
        
        labels_pred = softmax(outputs).argmax(1)
        
        # print(labels)
        # print(outputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        train_correct += (labels == labels_pred.unsqueeze(-1)).float().sum() # ???
        train_loss += [loss.item()]
    
    accuracy_train = train_correct / len(X_train)
    print("epoch: %04d | train loss: %.5f | train_accuracy: %.4f" %
         (epoch + 1, np.mean(train_loss), accuracy_train))
print("Finished Training")

epoch: 0001 | train loss: 0.60085 | train_accuracy: 0.2212
epoch: 0002 | train loss: 0.58838 | train_accuracy: 0.2212
epoch: 0003 | train loss: 0.57700 | train_accuracy: 0.2212
epoch: 0004 | train loss: 0.56928 | train_accuracy: 0.2212
epoch: 0005 | train loss: 0.55883 | train_accuracy: 0.2212
epoch: 0006 | train loss: 0.55324 | train_accuracy: 0.2212
epoch: 0007 | train loss: 0.54874 | train_accuracy: 0.2212
epoch: 0008 | train loss: 0.54200 | train_accuracy: 0.2212
epoch: 0009 | train loss: 0.54154 | train_accuracy: 0.2212
epoch: 0010 | train loss: 0.53797 | train_accuracy: 0.2212
epoch: 0011 | train loss: 0.53791 | train_accuracy: 0.2212
epoch: 0012 | train loss: 0.53934 | train_accuracy: 0.2212
epoch: 0013 | train loss: 0.53387 | train_accuracy: 0.2212
epoch: 0014 | train loss: 0.53239 | train_accuracy: 0.2212
epoch: 0015 | train loss: 0.53592 | train_accuracy: 0.2212
epoch: 0016 | train loss: 0.53137 | train_accuracy: 0.2212
epoch: 0017 | train loss: 0.53444 | train_accuracy: 0.22

## Lipophilicity

In [11]:
with open("lipo_data.pickle", "rb") as f:
    X = pickle.load(f)
with open("lipo_label.pickle", "rb") as f:
    y = pickle.load(f)

In [12]:
print(len(X), len(y))

4194 4194


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [15]:
print(len(X_train), len(X_val), len(X_test))

2516 839 839


In [16]:
train_dataset = CustomSELFIESDataset(X_train, y_train)
val_dataset = CustomSELFIESDataset(X_val, y_val)
test_dataset = CustomSELFIESDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=True)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
with open("symbol2idx_lipo.pickle", "rb") as f:
    symbol2idx_lipo = pickle.load(f)

In [19]:
config = {
    "vocab_dict": symbol2idx_lipo,
    "max_length": len(X_train[0]),
    "dim": 32,
    "n_classes": 1, # regression
    "heads": 2,
    "mlp_dim": 16,
    "depth": 2,
    "dim_head": 32,
    "dropout": 0.1,
    "emb_dropout": 0.1
}

In [20]:
model = SelfiesTransformer(**config)

In [21]:
criterion = nn.MSELoss()
lr = 0.001 # 0.0001
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [20]:
model.to(device)
for epoch in range(30):
    train_loss = []
    model.train()
    for i, data in enumerate(train_dataloader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.unsqueeze(-1).to(device)
        #print(inputs.get_device())
        #print(labels.get_device())
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        # labels_pred = softmax(outputs).argmax(1)
        
        # print(labels)
        # print(outputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # train_correct += (labels == labels_pred.unsqueeze(-1)).float().sum() # ???
        train_loss += [loss.item()]
    
    # accuracy_train = train_correct / len(X_train)
    print("epoch: %04d | train loss: %.5f" %
         (epoch + 1, np.mean(train_loss)))
print("Finished Training")

epoch: 0001 | train loss: 1.65021
epoch: 0002 | train loss: 1.28181
epoch: 0003 | train loss: 1.23099
epoch: 0004 | train loss: 1.18525
epoch: 0005 | train loss: 1.15601
epoch: 0006 | train loss: 1.14051
epoch: 0007 | train loss: 1.13146
epoch: 0008 | train loss: 1.10298
epoch: 0009 | train loss: 1.07579
epoch: 0010 | train loss: 1.04971
epoch: 0011 | train loss: 1.05730
epoch: 0012 | train loss: 1.04330
epoch: 0013 | train loss: 1.01596
epoch: 0014 | train loss: 1.02465
epoch: 0015 | train loss: 1.01211
epoch: 0016 | train loss: 0.99032
epoch: 0017 | train loss: 0.99361
epoch: 0018 | train loss: 0.97532
epoch: 0019 | train loss: 0.96208
epoch: 0020 | train loss: 0.96136
epoch: 0021 | train loss: 0.95580
epoch: 0022 | train loss: 0.93535
epoch: 0023 | train loss: 0.92839
epoch: 0024 | train loss: 0.94336
epoch: 0025 | train loss: 0.90548
epoch: 0026 | train loss: 0.89001
epoch: 0027 | train loss: 0.90453
epoch: 0028 | train loss: 0.90156
epoch: 0029 | train loss: 0.86714
epoch: 0030 | 

In [22]:
model.to(device)
for epoch in range(30):
    train_loss = []
    model.train()
    for i, data in enumerate(train_dataloader, 0):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.unsqueeze(-1).to(device)
        #print(inputs.get_device())
        #print(labels.get_device())
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        # labels_pred = softmax(outputs).argmax(1)
        
        # print(labels)
        # print(outputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # train_correct += (labels == labels_pred.unsqueeze(-1)).float().sum() # ???
        train_loss += [loss.item()]
    
    with torch.no_grad():
        model.eval()
        val_loss = []
        for _, val_data in enumerate(val_dataloader, 0):
            inputs, labels = val_data
            inputs = inputs.to(device)
            labels = labels.unsqueeze(-1).to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += [loss.item()]
    # accuracy_train = train_correct / len(X_train)
    print("epoch: %04d | train loss: %.5f | valid loss: %.5f" %
         (epoch + 1, np.mean(train_loss), np.mean(val_loss)))
print("Finished Training")

epoch: 0001 | train loss: 1.70656 | valid loss: 1.45706
epoch: 0002 | train loss: 1.40156 | valid loss: 1.42241
epoch: 0003 | train loss: 1.30607 | valid loss: 1.19506
epoch: 0004 | train loss: 1.20928 | valid loss: 1.16370
epoch: 0005 | train loss: 1.17250 | valid loss: 1.20958
epoch: 0006 | train loss: 1.14189 | valid loss: 1.09602
epoch: 0007 | train loss: 1.15141 | valid loss: 1.13243
epoch: 0008 | train loss: 1.10272 | valid loss: 1.18558
epoch: 0009 | train loss: 1.13354 | valid loss: 1.11572
epoch: 0010 | train loss: 1.08082 | valid loss: 1.13081
epoch: 0011 | train loss: 1.06556 | valid loss: 1.10491
epoch: 0012 | train loss: 1.06744 | valid loss: 1.20369
epoch: 0013 | train loss: 1.05664 | valid loss: 1.11915
epoch: 0014 | train loss: 1.03099 | valid loss: 1.06311
epoch: 0015 | train loss: 1.01823 | valid loss: 1.06339
epoch: 0016 | train loss: 0.97943 | valid loss: 1.08308
epoch: 0017 | train loss: 0.98594 | valid loss: 1.14906
epoch: 0018 | train loss: 0.97017 | valid loss: 