In [1]:
import torch
import torch.nn as nn
import pandas as pd
from conplex_dti.featurizer import (
    MorganFeaturizer,
    ProtBertFeaturizer,
)

from conplex_dti.model.architectures import SimpleCoembeddingNoSigmoid
from conplex_dti.model.architectures import SimpleCoembedding

In [2]:
device = "cpu"
model = SimpleCoembeddingNoSigmoid(2048, 1024, 1024)  # TODO: drug_featurizer.shape, target_featurizer.shape 2048, 1024, 1024
model.load_state_dict(torch.load("../pre_trained_model/models/ConPLex_v1_BindingDB.pt", map_location=device))

<All keys matched successfully>

In [3]:
print(model)

SimpleCoembedding(
  (drug_projector): Sequential(
    (0): Linear(in_features=2048, out_features=1024, bias=True)
    (1): ReLU()
  )
  (target_projector): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): ReLU()
  )
  (activator): Cosine()
)


In [4]:
print(model.parameters())

<generator object Module.parameters at 0x1378e0350>


In [5]:
for param in model.parameters():
  # print(param.data)
  print(param)


Parameter containing:
tensor([[ 0.0047, -0.0079,  0.0238,  ...,  0.0111,  0.0210,  0.0207],
        [ 0.0153,  0.0114, -0.0300,  ...,  0.0068, -0.0462, -0.0392],
        [ 0.0016,  0.0115, -0.0180,  ..., -0.0070, -0.0049, -0.0085],
        ...,
        [ 0.0366, -0.0195,  0.0033,  ...,  0.0002, -0.0111, -0.0228],
        [ 0.0404,  0.0027,  0.0098,  ..., -0.0241, -0.0017, -0.0129],
        [-0.0063,  0.0096, -0.0328,  ...,  0.0028,  0.0066, -0.0292]],
       requires_grad=True)
Parameter containing:
tensor([-0.0171, -0.0019, -0.0151,  ..., -0.0408, -0.0276, -0.0064],
       requires_grad=True)
Parameter containing:
tensor([[-0.0145, -0.0040,  0.0165,  ...,  0.0232,  0.0239, -0.0087],
        [ 0.0004,  0.0061, -0.0367,  ..., -0.0063, -0.0146, -0.0555],
        [-0.0189, -0.0549,  0.0328,  ..., -0.0027, -0.0060,  0.0128],
        ...,
        [ 0.0331, -0.0179, -0.0203,  ...,  0.0261, -0.0122, -0.0455],
        [ 0.0083, -0.0030,  0.0434,  ...,  0.0112,  0.0150, -0.0119],
        [-0.01

In [6]:
# TODO: eventually move TransferCoembedding class to architectures.py
class Cosine(nn.Module):
    def forward(self, x1, x2):
        return nn.CosineSimilarity()(x1, x2)


DISTANCE_METRICS = {
    "Cosine": Cosine,
}
ACTIVATIONS = {"ReLU": nn.ReLU }


In [7]:
import copy
# Create a new model that has 2 layers of tensors instead of 1.
class TransferCoembedding(nn.Module):
    def __init__(
        self,
        pre_trained_model:SimpleCoembedding,
        drug_shape=2048,
        target_shape=1024,
        latent_dimension=1024,
        latent_activation="ReLU",
        latent_distance="Cosine",
        classify=True,
    ):
        super().__init__()
        # TODO: initialize these all baased on the pre-trained model
        self.drug_shape = drug_shape
        self.target_shape = target_shape
        self.latent_dimension = latent_dimension
        self.do_classify = classify
        self.latent_activation = ACTIVATIONS[latent_activation]

        self.drug_projector = nn.Sequential(
            nn.Linear(self.drug_shape, latent_dimension),     # [0]
            self.latent_activation(),                         # [1]
            # ADD AN ADDITIONAL LAYER AND ACTIVATION FUNCTION
            nn.Linear(latent_dimension, latent_dimension),    # [2]
            self.latent_activation()                          # [3]
        )

        # initialize layer 0 from pre-trained model:
        self.drug_projector[0] = copy.deepcopy(pre_trained_model.drug_projector[0])
        # initialize layer 2 randomly
        nn.init.xavier_normal_(self.drug_projector[2].weight)

        self.target_projector = nn.Sequential(
            nn.Linear(self.target_shape, latent_dimension),
            self.latent_activation(),
            # ADD AN ADDITIONAL LAYER AND ACTIVATION FUNCTION
            nn.Linear(latent_dimension, latent_dimension),
            self.latent_activation()
        )

        # initialize layer 0 from pre-trained model:
        self.target_projector[0] = copy.deepcopy(pre_trained_model.target_projector[0])
        # initialize layer 2 randomly
        nn.init.xavier_normal_(self.target_projector[2].weight)

        # freeze the first layers of the target and drug projectors
        for idx, param in enumerate(self.parameters()):
            if idx == 0 or idx == 4:
                param.requires_grad = False


        if self.do_classify: # if True:
            self.distance_metric = latent_distance # "Cosine"
            self.activator = DISTANCE_METRICS[self.distance_metric]() # gives it the Cosine activator function that was written

    def forward(self, drug, target):
        if self.do_classify: # if True:
            return self.classify(drug, target)
        else:
            return self.regress(drug, target)

    def classify(self, drug, target):
        drug_projection = self.drug_projector(drug)
        target_projection = self.target_projector(target)

        distance = self.activator(drug_projection, target_projection)
        return distance.squeeze()

    def regress(self, drug, target):
        drug_projection = self.drug_projector(drug)
        target_projection = self.target_projector(target)

        inner_prod = torch.bmm(
            drug_projection.view(-1, 1, self.latent_dimension),
            target_projection.view(-1, self.latent_dimension, 1),
        ).squeeze()
        return inner_prod.squeeze()


In [8]:
new_model = TransferCoembedding(model)


In [9]:
print(new_model)

TransferCoembedding(
  (drug_projector): Sequential(
    (0): Linear(in_features=2048, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=1024, bias=True)
    (3): ReLU()
  )
  (target_projector): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=1024, bias=True)
    (3): ReLU()
  )
  (activator): Cosine()
)


In [10]:
for param in new_model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.0047, -0.0079,  0.0238,  ...,  0.0111,  0.0210,  0.0207],
        [ 0.0153,  0.0114, -0.0300,  ...,  0.0068, -0.0462, -0.0392],
        [ 0.0016,  0.0115, -0.0180,  ..., -0.0070, -0.0049, -0.0085],
        ...,
        [ 0.0366, -0.0195,  0.0033,  ...,  0.0002, -0.0111, -0.0228],
        [ 0.0404,  0.0027,  0.0098,  ..., -0.0241, -0.0017, -0.0129],
        [-0.0063,  0.0096, -0.0328,  ...,  0.0028,  0.0066, -0.0292]])
Parameter containing:
tensor([-0.0171, -0.0019, -0.0151,  ..., -0.0408, -0.0276, -0.0064],
       requires_grad=True)
Parameter containing:
tensor([[-0.0318, -0.0216, -0.0370,  ...,  0.0173,  0.0011,  0.0305],
        [-0.0140, -0.0610, -0.0161,  ...,  0.0248,  0.0046, -0.0195],
        [-0.0015, -0.0444,  0.0276,  ..., -0.0216,  0.0181,  0.0128],
        ...,
        [ 0.0413,  0.0315,  0.0131,  ...,  0.0262, -0.0524,  0.0279],
        [-0.0644,  0.0080,  0.0644,  ...,  0.0082, -0.0271, -0.0255],
        [-0.0073,  0.0205, -0.0093,  ...,

there are 2 ways to apply transfer learning.

1. Give the model the new images and allow it to train with a lower learning rate
2. take the given model and freeze all layers except for the last, output layer, then retrain

we are going to adopt method 2., since our model has only one layer, we are adding a 2nd layer that is randomly initialized

NOTES:
- will get bitter data from paper and give binding vs not binding for bitter molecules and receptors so we can do binary training
- still unsure on how we will do sweet molecules / address the sweet receptor
- other people are also interested in this, so we could potentially contribute it back to sam
- we could also apply method 1 as well

In [11]:
# TODO:


# - use constrative dataset as input for two models: one that is the original with a lower learning rate,
#   the ohter model that is our creation



## Load Data:

In [None]:
# create a DTIDataModule (from conplex_dti/dataset/datamodules.py)



In [66]:
# import pytorch_lightning as pl
# from conplex_dti.featurizer import Featurizer
# import typing as T
# from pathlib import Path


# def drug_target_collate_fn(args: T.Tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
#     """
#     Collate function for PyTorch data loader -- turn a batch of triplets into a triplet of batches

#     If target embeddings are not all the same length, it will zero pad them
#     This is to account for differences in length from FoldSeek embeddings

#     :param args: Batch of training samples with molecule, protein, and affinity
#     :type args: Iterable[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]
#     :return: Create a batch of examples
#     :rtype: T.Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
#     """
#     d_emb = [a[0] for a in args]
#     t_emb = [a[1] for a in args]
#     labs = [a[2] for a in args]

#     drugs = torch.stack(d_emb, 0)
#     targets = pad_sequence(t_emb, batch_first=True, padding_value=FOLDSEEK_MISSING_IDX)
#     labels = torch.stack(labs, 0)

#     return drugs, targets, labels


# class DTIDataModule(pl.LightningDataModule):
#     def __init__(
#         self,
#         data_dir: str,
#         drug_featurizer: Featurizer,
#         target_featurizer: Featurizer,
#         device: torch.device = torch.device("cpu"),
#         batch_size: int = 32,
#         shuffle: bool = True,
#         num_workers: int = 0,
#         header=0,
#         index_col=0,
#         sep=",",
#     ):
#         self._loader_kwargs = {
#             "batch_size": batch_size,
#             "shuffle": shuffle,
#             "num_workers": num_workers,
#             "collate_fn": drug_target_collate_fn,
#         }

#         self._csv_kwargs = {
#             "header": header,
#             "index_col": index_col,
#             "sep": sep,
#         }

#         self._device = device

#         self._data_dir = Path(data_dir)
#         self._train_path = Path("train.csv")
#         self._val_path = Path("val.csv")
#         self._test_path = Path("test.csv")

#         self._drug_column = "SMILES"
#         self._target_column = "Target Sequence"
#         self._label_column = "Label"

#         self.drug_featurizer = drug_featurizer
#         self.target_featurizer = target_featurizer

#     def prepare_data(self):
#         if self.drug_featurizer.path.exists() and self.target_featurizer.path.exists():
#             logg.warning("Drug and target featurizers already exist")
#             return

#         df_train = pd.read_csv(self._data_dir / self._train_path, **self._csv_kwargs)

#         df_val = pd.read_csv(self._data_dir / self._val_path, **self._csv_kwargs)

#         df_test = pd.read_csv(self._data_dir / self._test_path, **self._csv_kwargs)

#         dataframes = [df_train, df_val, df_test]

#         all_drugs = pd.concat([i[self._drug_column] for i in dataframes]).unique()


#         all_targets = pd.concat([i[self._target_column] for i in dataframes]).unique()

#         if self._device.type == "cuda":
#             self.drug_featurizer.cuda(self._device)
#             self.target_featurizer.cuda(self._device)

#         if not self.drug_featurizer.path.exists():
#             self.drug_featurizer.write_to_disk(all_drugs)

#         if not self.target_featurizer.path.exists():
#             self.target_featurizer.write_to_disk(all_targets)

#         self.drug_featurizer.cpu()
#         self.target_featurizer.cpu()

#     def setup(self, stage: T.Optional[str] = None):
#         self.df_train = pd.read_csv(
#             self._data_dir / self._train_path, **self._csv_kwargs
#         )

#         self.df_val = pd.read_csv(self._data_dir / self._val_path, **self._csv_kwargs)

#         self.df_test = pd.read_csv(self._data_dir / self._test_path, **self._csv_kwargs)

#         self._dataframes = [self.df_train, self.df_val, self.df_test]

#         all_drugs = pd.concat([i[self._drug_column] for i in self._dataframes]).unique()
#         all_targets = pd.concat(
#             [i[self._target_column] for i in self._dataframes]
#         ).unique()

#         if self._device.type == "cuda":
#             self.drug_featurizer.cuda(self._device)
#             self.target_featurizer.cuda(self._device)

#         self.drug_featurizer.preload(all_drugs)
#         self.drug_featurizer.cpu()

#         self.target_featurizer.preload(all_targets)
#         self.target_featurizer.cpu()

#         if stage == "fit" or stage is None:
#             self.data_train = BinaryDataset(
#                 self.df_train[self._drug_column],
#                 self.df_train[self._target_column],
#                 self.df_train[self._label_column],
#                 self.drug_featurizer,
#                 self.target_featurizer,
#             )

#             self.data_val = BinaryDataset(
#                 self.df_val[self._drug_column],
#                 self.df_val[self._target_column],
#                 self.df_val[self._label_column],
#                 self.drug_featurizer,
#                 self.target_featurizer,
#             )

#         if stage == "test" or stage is None:
#             self.data_test = BinaryDataset(
#                 self.df_test[self._drug_column],
#                 self.df_test[self._target_column],
#                 self.df_test[self._label_column],
#                 self.drug_featurizer,
#                 self.target_featurizer,
#             )

#     def train_dataloader(self):
#         return DataLoader(self.data_train, **self._loader_kwargs)

#     def val_dataloader(self):
#         return DataLoader(self.data_val, **self._loader_kwargs)

#     def test_dataloader(self):
#         return DataLoader(self.data_test, **self._loader_kwargs)


In [70]:
from conplex_dti.dataset.datamodules import DTIDataModule

In [72]:
# drug_featurizer = MorganFeaturizer()
# target_featurizer = ProtBertFeaturizer()
# our_data = DTIDataModule(data_dir = "./data/", drug_featurizer=drug_featurizer, target_featurizer=target_featurizer, batch_size = 4)

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [75]:
drug_featurizer = MorganFeaturizer()
target_featurizer = ProtBertFeaturizer()
our_data = DTIDataModule(data_dir = "./data/", drug_featurizer=drug_featurizer, target_featurizer=target_featurizer, batch_size = 4)

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
our_data.prepare_data()
our_data.setup()

Drug and target featurizers already exist
Morgan: 100%|██████████| 4/4 [00:00<00:00, 2465.79it/s]
ProtBert: 100%|██████████| 2/2 [00:00<00:00, 3036.05it/s]


In [None]:
our_data.setup()

NotADirectoryError: [Errno 20] Not a directory: 'bitter_training_data.csv/train.csv'

## Train the (new) model:


- since the sate we are working 

In [None]:
# sam's function for gettina prediction and label

def step(model, batch, device=None):
    # if device is None:
    #     device = torch.device("cpu")

    drug, target, label = batch  # target is (D + N_pool)
    pred = model(drug.to(device), target.to(device))
    label = Variable(torch.from_numpy(np.array(label)).float()).to(device)
    return pred, label



In [None]:
# test the step function:
