# Calculate chemberta features from pretrained huggingface model and add feature info to local AMPL install
- `pip install transformers`


In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-77M-MTR", output_hidden_states=True)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd

sl=pd.read_csv("/Users/echun/repos/DILI-rotation/datasets_ST/sl_test.csv")
sl

Unnamed: 0,bucket_name,dataset_key,dataset_name,response_cols,collection,scaffold_split_uuid
0,public,/Users/echun/repos/DILI-rotation/AMPL_models/d...,MMP,active,SUP,8507d7de-0beb-4c8b-9f3b-26f7d34293d3


In [3]:
import os
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm

BATCH_SIZE = 8  # safer for CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

for i, row in sl.iterrows():
    output_path = f"/Users/echun/repos/DILI-rotation/datasets_ST/MMP/scaled_descriptors/{row.dataset_name}_with_chemberta_descriptors.csv"
    
    if os.path.exists(output_path):
        continue
    
    data = pd.read_csv(row.dataset_key)
    smiles = data.base_rdkit_smiles.tolist()

    all_embeddings = []

    for start in tqdm(range(0, len(smiles), BATCH_SIZE)):
        batch = smiles[start:start+BATCH_SIZE]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**tokens)

            # your model does NOT have last_hidden_state
            # use last layer of hidden_states instead
            emb = outputs.hidden_states[-1][:, 0, :]   # CLS token
            emb = F.normalize(emb, p=2, dim=1)

        all_embeddings.append(emb.cpu())

    embeddings = torch.cat(all_embeddings, dim=0)

    cbfeat = pd.DataFrame(
        embeddings.numpy(),
        columns=[f"cbert_{i}" for i in range(embeddings.shape[1])]
    )
    cbfeat.insert(0, "compound_id", data.compound_id.values)

    out = data.merge(cbfeat, on="compound_id")
    out.to_csv(output_path, index=False)


100%|██████████| 696/696 [00:04<00:00, 143.88it/s]


In [3]:
import os
import torch.nn.functional as F
for i, row in sl.iterrows():
    output_path=f"../AMPL_models/datasets/missing_actives/scaled_descriptors/{row.dataset_name}_with_chemberta_descriptors.csv"
    
    if os.path.exists(output_path):
        continue
    
    data=pd.read_csv(row.dataset_key)
    smiles=data.base_rdkit_smiles.tolist()

    tokens = tokenizer(smiles, padding=True, truncation=True, return_tensors="pt")

    outputs = model(**tokens)

    print(outputs.hidden_states[-1].size())

    embeddings = outputs.hidden_states[-1][:, 0, :].detach() # only take the [CLS] token
    embeddings = F.normalize(embeddings, p=2, dim=1)
    print(embeddings.size())

    feat_dict = {}
    for idx, row in data.iterrows():
        feat_dict[row.compound_id] = embeddings[idx]

    cbfeat=pd.DataFrame(feat_dict).T
    cbfeat.columns=[f"cbert_{i}" for i in range(384)]
    cbfeat=cbfeat.reset_index(names="compound_id")

    data=data.merge(cbfeat, on="compound_id")
    data.to_csv(output_path, index=False)

torch.Size([566, 512, 384])
torch.Size([566, 384])


In [5]:
## Add chemberta descriptors to your AMPL repo here

In [6]:
cbfeat=pd.DataFrame(feat_dict).T
cbfeat.columns=[f"cbert_{i}" for i in range(384)]
cbfeat=cbfeat.reset_index(names="compound_id")
cbfeat

NameError: name 'feat_dict' is not defined

In [7]:
data=data.merge(cbfeat, on="compound_id")
# rdkit
data.to_csv("/Users/echun/repos/DILI-rotation/datasets/MMP_tox21_curC.csv", index=False)

In [None]:
data = pd.read_csv("/Users/echun/repos/DILI-rotation/datasets/MMP_tox21_curC.csv")
data = data.rename(columns=lambda c: c.replace("_x", "") if c.startswith("cbert_") else c)

# Save back out
data.to_csv("/Users/echun/repos/DILI/datasets/ROS_hits_up_class_cur3.csv", index=False)

In [8]:
# open the descriptors file in your main ampl repo
desc=pd.read_csv("/Users/echun/repos/AMPL/atomsci/ddm/data/descriptor_sets_sources_by_descr_type.csv")
desc

Unnamed: 0,descr_type,descriptors,scaled,source
0,moe_raw,ASA;ASA+;ASA-;ASA_H;ASA_P;ast_fraglike;ast_fra...,0,moe
1,moe_norm,ASA+_per_atom;ASA-;ASA_H_per_atom;ASA_P;ASA_pe...,1,moe
2,moe,ASA+_per_atom;ASA-;ASA_H_per_atom;ASA_P;ASA_pe...,1,moe
3,moe_filtered,ASA;ASA+;ASA-;ASA_H;ASA_P;BCUT_PEOE_0;BCUT_PEO...,0,moe
4,moe_scaled,ASA+_per_atom;ASA-;ASA_H_per_atom;ASA_P;ASA_pe...,1,moe
5,moe_scaled_filtered,ASA+_per_atom;ASA-;ASA_H_per_atom;ASA_P;ASA_pe...,1,moe
6,moe_informative,ASA+;ASA-;ASA_H;ASA_P;ASA;BCUT_PEOE_0;BCUT_PEO...,0,moe
7,mordred_raw,ABC;ABCGG;nAcid;nBase;SpAbs_A;SpMax_A;SpDiam_A...,0,mordred
8,mordred_filtered,AATS0Z;AATS0are;AATS0d;AATS0dv;AATS0i;AATS0m;A...,0,mordred
9,rdkit_raw,MaxEStateIndex;MinEStateIndex;MaxAbsEStateInde...,0,rdkit


In [None]:
# make a new row with the chemberta descriptors
d2=pd.DataFrame({
    "descr_type":"chemberta",
    "descriptors":";".join(cbfeat.columns[1:].tolist()),
    "scaled":0,
    "source":"chemberta",
}, index=[0])
d2


In [None]:
# add to bottom of the descriptors file
desc=pd.concat([desc,d2])
desc

In [38]:
desc=desc.reset_index(drop=True)
desc.to_csv("/Users/echun/repos/AMPL/atomsci/ddm/data/descriptor_sets_sources_by_descr_type.csv", index=False)