In [1]:
!pip list

Package                   Version
------------------------- ---------------
anyio                     4.2.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
async-lru                 2.0.4
attrs                     23.2.0
Babel                     2.14.0
beautifulsoup4            4.12.3
bleach                    6.1.0
certifi                   2023.11.17
cffi                      1.16.0
charset-normalizer        3.3.2
comm                      0.2.1
contourpy                 1.2.0
cycler                    0.12.1
debugpy                   1.8.0
decorator                 5.1.1
defusedxml                0.7.1
exceptiongroup            1.2.0
executing                 2.0.1
fair-esm                  1.0.3
fastjsonschema            2.19.1
filelock                  3.13.1
fonttools                 4.47.2
fqdn                      1.5.1
fsspec                    2024.2.0
huggingface-hub           0.20.3
idna     

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel, BertTokenizer
from tqdm import tqdm
import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda')

In [4]:
# Define a dual-input model for protein and compound sequences
class DualInputBERTModel(nn.Module):
    def __init__(self, protein_model, compound_model, protein_dim, compound_dim, output_dim):
        super(DualInputBERTModel, self).__init__()
        self.protein_model = protein_model
        self.compound_model = compound_model
        self.fc = nn.Linear(protein_dim + compound_dim, output_dim)

    def forward(self, protein_seq, compound_seq):
        protein_features = self.protein_model(protein_seq)[1]  # Using the pooled output
        compound_features = self.compound_model(compound_seq)[1]  # Using the pooled output
        combined_features = torch.cat((protein_features, compound_features), dim=1)
        output = self.fc(combined_features)
        return output

In [9]:
# Initialize models
protein_model = BertModel.from_pretrained('bert-base-uncased')
compound_model = BertModel.from_pretrained('bert-base-uncased')
model = DualInputBERTModel(protein_model, compound_model, protein_dim=768, compound_dim=768, output_dim=1).to(device)
model.train()


DualInputBERTModel(
  (protein_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [6]:
train_sps = torch.load("data/train_sps.ids76.pt")
train_smile = torch.load("data/train_smile.ids68.pt")
train_log_ic50 = torch.load("data/train_ic50_log.pt")

test_sps = torch.load("data/test_sps.ids76.pt")
test_smile = torch.load("data/test_smile.ids68.pt")
test_log_ic50 = torch.load("data/test_ic50_log.pt")

In [7]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)

batch_size = 16
shuffle = True

# Assuming train_sps, train_smile, train_log_ic50 are already tensor datasets prepared for training
dataset = TensorDataset(train_sps, train_smile, train_log_ic50)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=True)

num_epochs = 10

In [8]:
for epoch in range(num_epochs):
    model.train()  # Ensure the model is in training mode
    total_loss = 0
    
    progress_bar = tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch_idx, (sps, smile, log_ic50) in progress_bar:
        sps, smile, log_ic50 = sps.to(device), smile.to(device), log_ic50.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(sps, smile)
        
        loss = criterion(outputs.squeeze(), log_ic50.float())
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({'avg_loss': total_loss / (batch_idx + 1)})
    
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {total_loss / len(data_loader)}")


Epoch 1/10:   0%|          | 0/16474 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 1/10:  18%|█▊        | 2916/16474 [13:42<1:03:44,  3.54it/s, avg_loss=nan] 


KeyboardInterrupt: 

In [None]:
test_dataset = TensorDataset(test_sps, test_smile, test_log_ic50)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model.eval()

total_loss = 0.0
total_samples = 0

with torch.no_grad():
    for sps, smile, log_ic50 in test_loader:
        sps = sps.to(device)
        smile = smile.to(device)
        log_ic50 = log_ic50.to(device)

        output = model(sps, smile)
        loss = criterion(output.squeeze(), log_ic50.float())

        total_loss += loss.item() * sps.size(0)
        total_samples += sps.size(0)

avg_loss = total_loss / total_samples

print(f"Test Average Loss: {avg_loss**0.5}")

