In [50]:
from sklearn.metrics import average_precision_score, roc_auc_score
import wandb

from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaModel
from transformers import TrainingArguments, Trainer, IntervalStrategy

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F

from tqdm.auto import tqdm

Here we choose the pretrained model from which the transformer embeddings will be extracted.

In [2]:
# Checkpoint identifier handed to from_pretrained() for both the tokenizer and the encoder.
pretrained_path = "seyonec/PubChem10M_SMILES_BPE_450k"

Load the tokenizer and the model.

In [6]:
# Tokenizer and encoder are loaded from the same checkpoint so that the
# token ids produced by one match the vocabulary expected by the other.
tokenizer = AutoTokenizer.from_pretrained(pretrained_path)

# Bare RoBERTa encoder (no task head). The label arguments are forwarded
# unchanged; the embedding extraction in this notebook reads only
# `last_hidden_state` from the model output.
model = RobertaModel.from_pretrained(
    pretrained_path,
    num_labels=2,
    add_pooling_layer=True,
    id2label={0: 'Not Withdrawn', 1: 'Withdrawn'},
    label2id={'Not Withdrawn': 0, 'Withdrawn': 1},
)

Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


This function iterates over all the rows in the DataFrame
and passes each SMILES string through the model to get the last hidden state.
The last hidden states are the final representations the model extracts for each token in the input, before they are used for a downstream classification task.

It's common in BERT-like models to take the first token as the "pooled" token that represents the entire string (in the regular BERT model this is the \[CLS\] token; in RoBERTa-style models such as the one used here it is the `<s>` token).

Then we create a DataFrame in which each row contains the original SMILES string and one column for each feature of the pooled token's vector representation.

In [71]:
def get_embeddings(df):
    """Embed every SMILES string in ``df`` with the module-level encoder.

    Each value of ``df['smiles']`` is tokenized on its own (one string per
    forward pass) and run through the global ``model`` with gradients
    disabled. The last-hidden-state vector of the first token is kept as the
    representation of the whole string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``smiles`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh 0..n-1 index.
        Columns are ``smiles`` followed by ``ChemBERTa_emb_0`` ...
        ``ChemBERTa_emb_{d-1}``, where ``d`` is ``model.config.hidden_size``.
        An empty ``df`` yields an empty frame with those same columns.
    """
    # Width is read from the loaded model instead of being hard-coded, so the
    # column list always matches the vectors the model actually produces.
    column_names = ['smiles'] + [f'ChemBERTa_emb_{i}' for i in range(model.config.hidden_size)]

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; appending to a DataFrame inside the loop copies it every time.
    records = []

    # Iterating the column (which has a length) and passing `total` lets tqdm
    # show real progress rather than a bare iteration counter.
    for smiles in tqdm(df['smiles'], total=len(df), leave=False):
        # NOTE(review): the tokenizer is called without truncation, so a string
        # longer than the model's position limit is passed through as-is —
        # confirm the input data never exceeds it.
        encodings = tokenizer(smiles, return_tensors='pt')
        with torch.no_grad():
            output = model(**encodings)

        # [0, 0, :] -> first (only) sequence in the batch, first token, all features.
        first_token_vector = output.last_hidden_state[0, 0, :].numpy().tolist()
        records.append([smiles, *first_token_vector])

    return pd.DataFrame(records, columns=column_names)

Here we iterate over all our data, create embeddings for each of the training and testing files,
and save them.

In [72]:
# For every (split, dataset) pair: read the train/test CSVs, embed their
# SMILES strings, and write the embeddings next to the source files.
for split in tqdm(['db_no_agree_no_dups', 'db_agree_no_dups']):
    for dataset in tqdm(['ChEMBL', 'DrugBank', 'NCATS'], leave=False):
        # Inputs and outputs of one pair all live in the same folder.
        split_dir = f'./split/{split}/{dataset}'

        # Both files are read before any embedding work starts.
        df_train, df_test = (
            pd.read_csv(f'{split_dir}/{part}.csv') for part in ('train', 'test')
        )

        # Embed train first, then test.
        train_embeddings, test_embeddings = map(get_embeddings, (df_train, df_test))

        # Persist only after both embedding passes have finished.
        train_embeddings.to_csv(f'{split_dir}/ChemBERTa_embedding_train.csv')
        test_embeddings.to_csv(f'{split_dir}/ChemBERTa_embedding_test.csv')

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]