In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import chython
import chytorch
from torch.utils.data import DataLoader, Dataset
from chytorch.nn import MoleculeEncoder
from chython import smiles
from chytorch.utils import data
from torch.optim.lr_scheduler import ExponentialLR

# from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score, recall_score, precision_score
# from sklearn.model_selection import train_test_split

In [2]:
def mols2containers(path_to_smi):
    mol_containers = []
    for r in chython.SMILESRead(path_to_smi):
        r.canonicalize()  # fix aromaticity and functional groups
        mol_containers.append(r)
    return mol_containers

In [3]:
MoleculeEncoder()

MoleculeEncoder(
  (atoms_encoder): Embedding(121, 1024, padding_idx=0)
  (centrality_encoder): Embedding(17, 1024, padding_idx=0)
  (spatial_encoder): Embedding(13, 16, padding_idx=0)
  (layer): EncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
    )
    (linear1): Linear(in_features=1024, out_features=3072, bias=True)
    (linear2): Linear(in_features=3072, out_features=1024, bias=True)
    (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
    (dropout3): Dropout(p=0.1, inplace=False)
    (activation): GELU(approximate=none)
  )
)

In [4]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.input_enсoder = MoleculeEncoder()
        сумма
        self.linear1 = nn.Linear(in_features=1024, out_features=512)
        self.dropout1 = nn.Dropout(p=0.1)
        self.linear2 = nn.Linear(in_features=512, out_features=1) #https://stats.stackexchange.com/questions/207049/neural-network-for-binary-classification-use-1-or-2-output-neurons

        
    def forward(self, X):
        E = self.input_enсoder(X)
        
        X = self.linear1(E) # pass data through linear layer №1
        X = torch.relu(X) # use the activation function over X
        
        X = self.dropout1(X)
        
        X = self.linear2(X)
        X = torch.relu(X)
        
        return X

In [5]:
df = pd.read_csv("")
df = df[['std_smiles', 'activity', 'dataset']]
mol_containers_train = [smiles(i) for i in df[df.dataset == 'train'].std_smiles[:100]]
y_train = df[df.dataset == 'train'].activity[:100]

In [6]:
print(len(mol_containers_train), len(y_train))

100 100


In [5]:
# train_set = Dataset(mol_containers_train, y_train.to_list())

# class Data(Dataset):
#     def __init__(self, X_train, y_train):
#         self.X = X_train
#         self.y = torch.from_numpy(y_train.to_numpy().astype(np.float32))
# #         self.len = self.X.shape[0]

#     def __getitem__(self, index):
#         return self.X[index], self.y[index]

#     def __len__(self):
#         return self.len

In [7]:
BATCH_SIZE = 10 #64
LEARNING_RATE = 0.01
EPOCHS = 3

# mol_containers = mols2containers('../docs/input-files/mols-for-prediction.smi')
train_set_X = data.MoleculeDataset(mol_containers_train, add_cls=True)
dataset_loader = DataLoader(train_set_X, collate_fn=data.collate_molecules, batch_size=BATCH_SIZE)

network = Network()
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)
lr_scheduler = ExponentialLR(optimizer, gamma=0.9) #gamma=0.9, just from example https://pytorch.org/docs/stable/optim.html

In [12]:
train_set_X = data.MoleculeDataset(mol_containers_train, add_cls=True)
data_loader = torch.utils.data.DataLoader(train_set_X, collate_fn=data.collate_molecules, batch_size=10)

In [17]:
network

Network(
  (input_enсoder): MoleculeEncoder(
    (atoms_encoder): Embedding(121, 1024, padding_idx=0)
    (centrality_encoder): Embedding(17, 1024, padding_idx=0)
    (spatial_encoder): Embedding(13, 16, padding_idx=0)
    (layer): EncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
      )
      (linear1): Linear(in_features=1024, out_features=3072, bias=True)
      (linear2): Linear(in_features=3072, out_features=1024, bias=True)
      (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (dropout3): Dropout(p=0.1, inplace=False)
      (activation): GELU(approximate=none)
    )
  )
  (linear1): Linear(in_features=1024, out_features=512, bias=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (linear2): Li

In [19]:
epoch_loss = []
epoch_roc = []
epoch_internal_roc = []

for epoch in tqdm(range(EPOCHS)):
    running_loss = []
    
    for inputs, labels in zip(dataset_loader, torch.from_numpy(y_train.to_numpy().astype(np.float32))):
        
         # sets the gradients to zero.
        optimizer.zero_grad()
        
        # calculate output
        print(len(inputs))
        print(inputs)
        outputs = network(inputs)
        print(len(outputs), len(outputs[0]))
        
        # calculate loss
        loss = loss_func(outputs.squeeze(-1), labels)
        
        # backprop
        loss.backward()
        optimizer.step()
        
        # count statistics
        running_loss.append(loss.item())
        
    # because learning rate is exponential
    lr_scheduler.step()
    
    y_test_internal_pred = network(X_test_internal)
    
    # just for memory reducing 
    with torch.no_grad():
        epoch_loss.append(round((sum(running_loss) / len(running_loss)), 3))
        epoch_roc.append(roc_auc_score(y_true=train_set.y.detach().numpy(), 
                                       y_score=network(train_set.X).detach().numpy()))
        epoch_internal_roc.append(roc_auc_score(y_true=y_test_internal, y_score=y_test_internal_pred.detach().numpy()))
        

  0%|                                                                                                                                                                                 | 0/3 [00:00<?, ?it/s]

3
(tensor([[ 1, 37,  8,  8,  9,  9,  8,  8,  9,  8,  8,  8,  8,  8,  8,  8,  8,  8,
          8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0],
        [ 1, 37,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  8,  8,  9,  8,  8,
          8,  8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0],
        [ 1, 37,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  8,  8,  9,  8,  8,
          8,  8,  8,  8,  8,  8,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 37,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  8,  8,  9,  8,  8,
          8,  8,  8,  9,  8,  8,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 37,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  8,  8,  8,  8,  9,
          9,  8,  9,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 37,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  8,  9,  8,  9,  8,
          8, 10,  8,  8,  9,  8,  9,  8,  8, 10,  8,  8,  9],
        [ 1, 37,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  8,  9,  8,  8,  8,
          8,  8,  8,  9,  0,  0,  0,  0,  0,  0,  0,  0,  

  0%|                                                                                                                                                                                 | 0/3 [00:00<?, ?it/s]

10 31





ValueError: Using a target size (torch.Size([])) that is different to the input size (torch.Size([10, 31])) is deprecated. Please ensure they have the same size.

In [34]:
data1 = []
for r1 in chython.SMILESRead('/Users/khakimova/Desktop/bi-code/chem-predictions-herg/docs/input-files/mols-for-prediction.smi'):
    r1.canonicalize()  # fix aromaticity and functional groups
    data1.append(r1)

    
ds1 = chytorch.utils.data.MoleculeDataset(data1)
dl1 = torch.utils.data.DataLoader(ds1, collate_fn=chytorch.utils.data.collate_molecules, batch_size=2)

In [35]:
encoder = chytorch.nn.MoleculeEncoder()

for i, b in enumerate(dl1):
    print(encoder(b))
    print(i)

tensor([[[-0.3719, -0.8104, -0.0421,  ..., -0.3444, -1.5525, -0.6976],
         [ 0.1037, -0.9255,  0.7589,  ..., -1.1748, -1.2558, -0.9303],
         [ 0.3078, -1.3107,  0.6972,  ..., -0.6501, -1.5743, -0.8690],
         ...,
         [ 0.0336, -0.9558,  0.5205,  ..., -0.6627, -1.1989, -0.6504],
         [ 0.0565, -1.1160,  0.7482,  ..., -1.1586, -1.0908, -0.6362],
         [-0.0206, -1.0806,  0.6765,  ..., -0.9838, -1.2143, -1.1039]],

        [[-0.5408, -0.2863,  1.0533,  ..., -1.0741, -0.6845, -0.6899],
         [-0.5157, -0.6347,  1.0107,  ..., -1.7792, -0.6318, -0.2591],
         [-0.0145, -0.5943,  0.5960,  ..., -1.5441,  0.0184, -0.0134],
         ...,
         [-0.1430, -0.5330,  0.8573,  ..., -2.3832, -0.4230, -0.2200],
         [-0.4149, -0.6341,  2.3221,  ..., -1.7311, -1.0508,  0.0051],
         [-0.3550,  0.1649, -0.4519,  ..., -0.8924, -0.4228, -0.8496]]],
       grad_fn=<NativeLayerNormBackward0>)
0


In [50]:
[i for i in dl1]

[(tensor([[1, 8, 8, 8, 8, 8, 8],
          [1, 8, 8, 8, 8, 9, 0]], dtype=torch.int32),
  tensor([[0, 5, 5, 5, 5, 5, 5],
          [0, 6, 6, 6, 6, 5, 0]], dtype=torch.int32),
  tensor([[[1, 1, 1, 1, 1, 1, 1],
           [1, 2, 3, 4, 5, 4, 3],
           [1, 3, 2, 3, 4, 5, 4],
           [1, 4, 3, 2, 3, 4, 5],
           [1, 5, 4, 3, 2, 3, 4],
           [1, 4, 5, 4, 3, 2, 3],
           [1, 3, 4, 5, 4, 3, 2]],
  
          [[1, 1, 1, 1, 1, 1, 0],
           [1, 2, 3, 4, 5, 6, 0],
           [1, 3, 2, 3, 4, 5, 0],
           [1, 4, 3, 2, 3, 4, 0],
           [1, 5, 4, 3, 2, 3, 0],
           [1, 6, 5, 4, 3, 2, 0],
           [1, 0, 0, 0, 0, 0, 0]]], dtype=torch.int32))]