In [1]:
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel

# Hugging Face Hub checkpoint id shared by the ChemBERTa tokenizer and encoder.
CHEMBERTA_CHECKPOINT = "seyonec/ChemBERTa-zinc-base-v1"

# Load the tokenizer and the model from the same checkpoint so their vocabularies match.
tokenizer = RobertaTokenizer.from_pretrained(CHEMBERTA_CHECKPOINT)
model = RobertaModel.from_pretrained(CHEMBERTA_CHECKPOINT)


2025-02-24 17:00:43.049825: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-24 17:00:43.276309: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740445243.373570  101073 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740445243.400879  101073 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-24 17:00:43.622390: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Read the raw SMILES table; the path is relative to the notebook's working directory.
smiles_csv_path = "../ZafrensData/smiles.csv"
smiles = pd.read_csv(smiles_csv_path)

In [3]:

# Identifier columns that are joined (in this order) to form the per-row sample key.
columns_to_concat = [
    'control_rx_id',
    'bb1_id',
    'bb2_id',
    'bb3_id',
    'bb4_id',
]


In [4]:

# Build the sample key: cast each id column to text, then join the parts of every row with "_".
sample_id_parts = smiles[columns_to_concat].astype(str)
smiles['sample'] = sample_id_parts.agg('_'.join, axis=1)


In [10]:
# Preview a few SMILES strings straight from the loaded frame. `smiles_data` is only
# assigned in a later cell, so referencing it here fails on a fresh top-to-bottom run.
smiles['SMILES'].tolist()[1:10]

['O=C1C=C(C(NCC2=CC3=C(C=C(CNCC4CCC4)N3)C=C2)=O)N=C5C=CC=CN15',
 'O=C(NCC1=C[N]2C=C(CNCC3CCCCC3)C=CC2=N1)C4=CC(=O)N5C=CC=CC5=N4',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4C)c4nc(Cl)n3)=O)c[s]2)CC1',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccc(C)cc5)(=O)=O)c4ncn3)=O)c[s]2)CC1',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccccc5)(=O)=O)c4ncn3)=O)c[s]2)CC1',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(C)(=O)=O)c4ncn3)=O)c[s]2)CC1',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(C)(=O)=O)c4nc(Cl)n3)=O)c[s]2)CC1',
 'CC1CCN(CC2=NC(C(NC3CCN(C4=C5C=CN(S(=O)(C6=CC=C(C)C=C6)=O)C5=NC(Cl)=N4)CC3)=O)=CS2)CC1',
 'CC1CCN(CC2=NC(C(N[C@@H]3CCCN(C4=NC(Cl)=NC(C5=CC=CC=C5)=C4)C3)=O)=CS2)CC1',
 'CC1CCN(Cc2ccc(C(=O)N[C@@H]3CCCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',
 'CC1CCN(Cc2ccc(C(=O)N[C@@H]3CCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',
 'CC1CCN(Cc2ccc(C(=O)N[C@H]3CCCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',
 'CC1CCN(Cc2ccc(C(=O)N[C@H]3CCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',
 'CC1CCN(Cc2ccc(C(=O)NC[C@@H]3CCCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',


In [6]:

from concurrent.futures import ThreadPoolExecutor


In [8]:

# Example SMILES strings: positions 1 through 24 of the SMILES column (position 0 is left out)
smiles_data = smiles['SMILES'].iloc[1:25].to_list()

# Function to get embeddings for a single SMILES
def get_single_embedding(smiles):
    """Return the first-token ([CLS] / <s>) embedding for one SMILES string.

    Args:
        smiles: SMILES string to embed. Inside this function the name shadows the
            module-level ``smiles`` DataFrame.

    Returns:
        NumPy array taken from position 0 of the model's ``last_hidden_state``;
        1-D when a single string is passed.
    """
    # Tokenize the SMILES string. `padding=True` pads only to the longest sequence in the
    # call, so a single string is not padded out to 256 tokens before the forward pass.
    inputs = tokenizer(smiles, return_tensors="pt", max_length=256, truncation=True, padding=True)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of the sequence axis is the summary token.
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    # Drop only the leading batch axis (size 1 for a single string).
    return cls_embedding.squeeze(0).numpy()

# Thread-pool wrapper that embeds every SMILES in a list
def get_chemberta_embeddings_parallel(smiles_list, num_threads=4):
    """Embed each SMILES in ``smiles_list`` using a pool of worker threads.

    Args:
        smiles_list: iterable of SMILES strings; each one is passed to
            ``get_single_embedding``.
        num_threads: maximum number of worker threads in the pool.

    Returns:
        List of per-SMILES results, in the same order as ``smiles_list``.
    """
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # Executor.map yields results in input order; consume them all before the pool closes.
        return [vector for vector in pool.map(get_single_embedding, smiles_list)]

# Size the thread pool from PyTorch's CPU thread setting. The model is not moved to a GPU
# in the cells shown here, so the pool size should not depend on CUDA availability.
num_threads = torch.get_num_threads()

# Get embeddings in parallel
embeddings = get_chemberta_embeddings_parallel(smiles_data, num_threads=num_threads)

# Convert to a DataFrame (one row per SMILES, one column per embedding dimension)
embeddings_df = pd.DataFrame(embeddings)
print(embeddings_df.head(10))


        0         1         2         3         4         5         6    \
0  0.944714  0.505220 -0.230232 -0.148323  0.372712 -0.849104 -0.754033   
1  1.140368  0.409582 -0.193531 -0.184619  0.059040 -0.034431  1.065303   
2  1.326877 -0.101364 -0.576457 -0.130561  0.319834 -0.514042  0.198406   
3  1.060867  0.154706 -0.392268  0.054633  0.200505 -0.579807  0.403367   
4  1.615521  0.347405 -0.115856 -0.324263  0.783446 -0.218023  0.196425   
5  1.907373 -0.287844 -0.297961 -0.260377  0.881829 -0.193487  0.429274   
6  0.158611  0.541719 -0.022517 -0.450263  0.175629 -0.607023 -0.092765   
7  0.710406  0.515701 -0.138238  0.089361 -0.108771 -0.119853 -0.306732   
8  2.012888  0.585378  0.097116 -1.103713  0.677234 -0.920373  1.774990   
9  2.022439  0.594617  0.111014 -1.152916  0.716333 -0.867165  1.802900   

        7         8         9    ...       758       759       760       761  \
0 -0.767794 -0.344695 -0.501047  ...  0.945085 -0.743872  0.118436 -0.678172   
1  0.247752  0

In [17]:
import pickle as pkl

In [23]:
# Persist the current embeddings frame as a pickle file.
embedded_smiles_path = "../ZafrensData/embedded_smiles/embedded_smiles.pkl"
with open(embedded_smiles_path, mode="wb") as pkl_file:
    pkl.dump(embeddings_df, pkl_file)

In [None]:

# Every SMILES string in the table, as a plain Python list
smiles_data = smiles['SMILES'].to_list()

# Function to get embeddings for a batch of SMILES
def get_chemberta_embeddings(smiles_list, batch_size=32):
    """Return the first-token ([CLS] / <s>) embedding for every SMILES string.

    The inputs are tokenized and run through the model in mini-batches. Each batch is
    padded only to its own longest sequence rather than to the 512-token limit.

    Args:
        smiles_list: iterable of SMILES strings.
        batch_size: number of SMILES tokenized and passed to the model per call.

    Returns:
        List of 1-D NumPy arrays, one per input, in input order. Empty input
        yields an empty list.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize once so generators and other one-shot iterables can be sliced.
    smiles_list = list(smiles_list)

    embeddings = []
    for start in range(0, len(smiles_list), batch_size):
        batch = smiles_list[start:start + batch_size]

        # Tokenize the whole batch; sequences longer than 512 tokens are truncated.
        inputs = tokenizer(batch, return_tensors="pt", max_length=512, truncation=True, padding=True)

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = model(**inputs)

        # Position 0 of the sequence axis is the summary token for each batch row.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        # Iterating the 2-D array yields one 1-D row per SMILES.
        embeddings.extend(cls_embeddings.numpy())

    return embeddings

# Embed every SMILES string collected in `smiles_data`
embeddings = get_chemberta_embeddings(smiles_list=smiles_data)

# Stack the per-SMILES vectors into a frame (one row per SMILES) and show the first five rows
embeddings_df = pd.DataFrame(data=embeddings)
print(embeddings_df.head(n=5))