In [3]:
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel

# Use the correct tokenizer and model for ChemBERTa
tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = RobertaModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")




In [4]:
smiles = pd.read_csv("../ZafrensData/smiles.csv")

In [5]:
smiles

Unnamed: 0,control_rx_id,bb1_id,bb2_id,bb3_id,bb4_id,SMILES
0,1,-1,-1,-1,-1,O=C1C=C(C(NCC2=CC3=C(C=C(CNCC4CCC4)N3)C=C2)=O)...
1,2,-1,-1,-1,-1,O=C(NCC1=C[N]2C=C(CNCC3CCCCC3)C=CC2=N1)C4=CC(=...
2,3,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4C)c4nc(Cl)...
3,4,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccc(C)...
4,5,-1,-1,-1,-1,CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccccc5...
...,...,...,...,...,...,...
14116,-1,215,195,2,1462,CC(=O)NC[C@H]1CCCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c...
14117,-1,222,195,2,1462,CC(=O)NCC1CCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c3)n2)CC1
14118,-1,238,195,2,1462,CC(=O)NC[C@@H]1CCN(c2ccnc(-c3cnn(CC(=O)N(C)C)c...
14119,-1,269,195,2,1462,CC(=O)NC[C@H]1CN(c2ccnc(-c3cnn(CC(=O)N(C)C)c3)...


In [6]:

columns_to_concat = ['control_rx_id', 'bb1_id', 'bb2_id', 'bb3_id', 'bb4_id']


In [7]:

smiles['sample'] = smiles[columns_to_concat].astype(str).agg('_'.join, axis=1)


In [10]:
smiles_data[1:10]

['O=C1C=C(C(NCC2=CC3=C(C=C(CNCC4CCC4)N3)C=C2)=O)N=C5C=CC=CN15',
 'O=C(NCC1=C[N]2C=C(CNCC3CCCCC3)C=CC2=N1)C4=CC(=O)N5C=CC=CC5=N4',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4C)c4nc(Cl)n3)=O)c[s]2)CC1',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccc(C)cc5)(=O)=O)c4ncn3)=O)c[s]2)CC1',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(c5ccccc5)(=O)=O)c4ncn3)=O)c[s]2)CC1',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(C)(=O)=O)c4ncn3)=O)c[s]2)CC1',
 'CC1CCN(Cc2nc(C(NC(CC3)CCN3c3c(cc[n]4S(C)(=O)=O)c4nc(Cl)n3)=O)c[s]2)CC1',
 'CC1CCN(CC2=NC(C(NC3CCN(C4=C5C=CN(S(=O)(C6=CC=C(C)C=C6)=O)C5=NC(Cl)=N4)CC3)=O)=CS2)CC1',
 'CC1CCN(CC2=NC(C(N[C@@H]3CCCN(C4=NC(Cl)=NC(C5=CC=CC=C5)=C4)C3)=O)=CS2)CC1',
 'CC1CCN(Cc2ccc(C(=O)N[C@@H]3CCCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',
 'CC1CCN(Cc2ccc(C(=O)N[C@@H]3CCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',
 'CC1CCN(Cc2ccc(C(=O)N[C@H]3CCCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',
 'CC1CCN(Cc2ccc(C(=O)N[C@H]3CCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',
 'CC1CCN(Cc2ccc(C(=O)NC[C@@H]3CCCN(c4cc(-c5ccccc5)ncn4)C3)cc2)CC1',


In [None]:

from concurrent.futures import ThreadPoolExecutor


In [13]:

# Example SMILES strings
smiles_data = smiles['SMILES'].tolist()[1:25]

# Function to get embeddings for a single SMILES
def get_single_embedding(smiles):
    # Tokenize the SMILES string
    inputs = tokenizer(smiles, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    
    # Get embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.squeeze().numpy()

# Parallel function to get embeddings for all SMILES
def get_chemberta_embeddings_parallel(smiles_list, num_threads=4):
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Map the get_single_embedding function to each SMILES in parallel
        embeddings = list(executor.map(get_single_embedding, smiles_list))
    return embeddings

# Set the number of threads based on your system's resources
num_threads = torch.get_num_threads() if torch.cuda.is_available() else 4

# Get embeddings in parallel
embeddings = get_chemberta_embeddings_parallel(smiles_data, num_threads=num_threads)

# Convert to a DataFrame for visualization or further processing
embeddings_df = pd.DataFrame(embeddings)
print(embeddings_df.head(10))


        0         1         2         3         4         5         6    \
0  0.944714  0.505220 -0.230232 -0.148322  0.372713 -0.849105 -0.754033   
1  1.140369  0.409583 -0.193533 -0.184619  0.059042 -0.034430  1.065304   
2  1.326878 -0.101364 -0.576457 -0.130561  0.319835 -0.514043  0.198404   
3  1.060868  0.154704 -0.392269  0.054634  0.200506 -0.579806  0.403366   
4  1.615522  0.347405 -0.115857 -0.324264  0.783447 -0.218022  0.196424   
5  1.907372 -0.287846 -0.297961 -0.260376  0.881829 -0.193486  0.429274   
6  0.158611  0.541719 -0.022517 -0.450262  0.175630 -0.607023 -0.092765   
7  0.710405  0.515702 -0.138238  0.089361 -0.108771 -0.119853 -0.306733   
8  2.012888  0.585378  0.097116 -1.103715  0.677236 -0.920372  1.774989   
9  2.022439  0.594617  0.111013 -1.152915  0.716333 -0.867165  1.802899   

        7         8         9    ...       758       759       760       761  \
0 -0.767795 -0.344695 -0.501047  ...  0.945086 -0.743872  0.118435 -0.678172   
1  0.247753  0

In [17]:
import pickle as pkl

In [23]:
with open("../ZafrensData/embedded_smiles/embedded_smiles.pkl", "wb") as file:
    pkl.dump(embeddings_df, file)

In [None]:

# Example SMILES strings
smiles_data = smiles['SMILES'].tolist()

# Function to get embeddings for a batch of SMILES
def get_chemberta_embeddings(smiles_list):
    embeddings = []
    for smiles in smiles_list:
        # Tokenize the SMILES string
        inputs = tokenizer(smiles, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
        
        # Get embeddings from the model
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract the [CLS] token embedding (typically used as a summary representation)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embedding.squeeze().numpy())

    return embeddings

# Get embeddings for the SMILES data
embeddings = get_chemberta_embeddings(smiles_data)

# Convert to a DataFrame for visualization or further processing
embeddings_df = pd.DataFrame(embeddings)
print(embeddings_df.head())