### Drug Uni-Mol embedding

In [4]:
import io
import sys
import sqlite3
import numpy as np
import pandas as pd
from tqdm import tqdm

# Add the path of 'unimol_tools' to Python's search path
sys.path.append('./Uni-Mol-main/unimol_tools/unimol_tools')
from unimol_tools import UniMolRepr

# =========================
# 1. Database operations
# =========================

def create_table(conn):
    """
    Make sure the 'molecule_atomic_embeddings' table is present.

    The table is keyed by the SMILES string and carries two BLOB columns,
    'embedding' and 'atomic_embedding'. The statement uses IF NOT EXISTS,
    so calling this on an already-initialised database changes nothing.
    :param conn: sqlite3.Connection object.
    """
    ddl = '''
        CREATE TABLE IF NOT EXISTS molecule_atomic_embeddings (
            smiles TEXT PRIMARY KEY,
            embedding BLOB,
            atomic_embedding BLOB
        )
    '''
    conn.execute(ddl)
    conn.commit()

def store_molecule_embedding(conn, smiles, embedding, atomic_embedding):
    """
    Persist one molecule's UniMol representations.

    Both arrays are serialised with np.save (the .npy format) and written as
    BLOBs. An existing row for the same SMILES is overwritten
    (INSERT OR REPLACE). The transaction is committed before returning.
    :param conn: sqlite3.Connection object.
    :param smiles: String, SMILES of the molecule (primary key).
    :param embedding: ndarray, the CLS (global) representation of the molecule.
    :param atomic_embedding: ndarray, the atomic-level representations of the molecule.
    """
    blobs = []
    for array in (embedding, atomic_embedding):
        # Serialise into memory; getvalue() returns the whole buffer
        # regardless of the current stream position.
        stream = io.BytesIO()
        np.save(stream, array)
        blobs.append(sqlite3.Binary(stream.getvalue()))

    insert_sql = '''
        INSERT OR REPLACE INTO molecule_atomic_embeddings (smiles, embedding, atomic_embedding)
        VALUES (?, ?, ?)
    '''
    conn.execute(insert_sql, (smiles, *blobs))
    conn.commit()

def get_molecule_embedding(conn, smiles):
    """
    Look up the stored UniMol representations for one SMILES.

    :param conn: sqlite3.Connection object.
    :param smiles: String, SMILES of the molecule.
    :return: A tuple (embedding, atomic_embedding) of ndarrays, or
             (None, None) when no row exists for this SMILES.
    """
    row = conn.execute('''
        SELECT embedding, atomic_embedding
        FROM molecule_atomic_embeddings
        WHERE smiles=?
    ''', (smiles,)).fetchone()

    # Guard clause: nothing stored under this key.
    if not row:
        return None, None

    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which executes arbitrary code if the database file is not trusted.
    # Plain numeric arrays written by np.save do not need it.
    cls_blob, atomic_blob = row
    return tuple(
        np.load(io.BytesIO(blob), allow_pickle=True)
        for blob in (cls_blob, atomic_blob)
    )

# =========================
# 2. Representation generation
# =========================

def process_smiles_batch(smiles_batch):
    """
    Embed one batch of SMILES with a freshly constructed UniMolRepr.

    A new model instance is built on every call, so this is only suitable for
    small or occasional batches; for large datasets initialise UniMolRepr once
    and reuse it.
    :param smiles_batch: List of SMILES strings.
    :return: A list of tuples (smiles, cls_repr, atomic_reprs) in input order.
             If anything raises, the error is printed and a list of None with
             the same length as the input is returned instead.
    """
    try:
        # use_gpu is not passed here, so UniMolRepr's own default applies;
        # pass use_gpu explicitly if a specific device is required.
        encoder = UniMolRepr(data_type='molecule', remove_hs=True)
        reprs = encoder.get_repr(smiles_batch, return_atomic_reprs=True)

        # Index-based access (rather than zip) so a short result list still
        # raises and is reported as a failed batch.
        return [
            (
                smi,
                np.array(reprs['cls_repr'][idx]),        # CLS representation
                np.array(reprs['atomic_reprs'][idx]),    # Atomic-level representation
            )
            for idx, smi in enumerate(smiles_batch)
        ]
    except Exception as e:
        print(f"Failed to process batch: {e}")
        return [None] * len(smiles_batch)

def process_smiles_data(conn, smiles_list, batch_size=5000):
    """
    Generate UniMol representations for every SMILES that is not yet stored
    and write them to the database in batches.

    Behaviour notes:
      * Duplicates are removed while keeping first-seen order, so batch
        composition is reproducible from run to run.
      * Entries that are not non-empty strings (e.g. NaN produced by empty
        CSV cells) are skipped instead of failing a whole batch.
      * The model is only constructed when at least one SMILES actually needs
        an embedding.
      * A batch whose result count does not match the batch size is skipped,
        because storing by position would attach vectors to the wrong SMILES.
    :param conn: sqlite3.Connection object.
    :param smiles_list: Iterable of SMILES strings.
    :param batch_size: Integer, max number of SMILES to process in one batch. Adjust based on available memory.
    :raises ValueError: if batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Order-preserving de-duplication of usable entries
    unique_smiles = list(dict.fromkeys(
        s for s in smiles_list if isinstance(s, str) and s.strip()
    ))

    # Collect SMILES strings not yet in the database. Only existence is
    # checked, so the stored blobs are not deserialised here.
    exists_sql = (
        "SELECT 1 FROM molecule_atomic_embeddings "
        "WHERE smiles=? AND embedding IS NOT NULL"
    )
    new_smiles_list = [
        smiles
        for smiles in tqdm(unique_smiles, desc="Checking existing SMILES")
        if conn.execute(exists_sql, (smiles,)).fetchone() is None
    ]

    # Nothing to do: avoid loading the pretrained weights at all
    if not new_smiles_list:
        return

    # Initialize UniMolRepr once (enable GPU if memory allows)
    clf = UniMolRepr(data_type='molecule', remove_hs=True, use_gpu=True)

    # Process and store in batches
    for start in tqdm(range(0, len(new_smiles_list), batch_size), desc="Processing new SMILES in batches"):
        batch = new_smiles_list[start:start + batch_size]
        try:
            unimol_repr = clf.get_repr(batch, return_atomic_reprs=True)
        except Exception as e:
            # Best effort: report and move on to the next batch
            print(f"Failed to process batch: {e}")
            continue

        cls_reprs = unimol_repr['cls_repr']
        atomic_reprs = unimol_repr['atomic_reprs']
        if len(cls_reprs) != len(batch) or len(atomic_reprs) != len(batch):
            print(
                "Failed to process batch: expected "
                f"{len(batch)} representations, got "
                f"{len(cls_reprs)} CLS / {len(atomic_reprs)} atomic"
            )
            continue

        for smiles, cls_repr, atomic_repr in zip(batch, cls_reprs, atomic_reprs):
            store_molecule_embedding(conn, smiles, np.array(cls_repr), np.array(atomic_repr))

# =========================
# 3. Main script
# =========================

if __name__ == "__main__":
    # Connect to the database
    conn = sqlite3.connect('./data/11betaHSD1/Uni-Mol_molecule_embeddings_no_h_11betaHSD1_ligands.db')

    # try/finally so the connection is released even if loading the CSV or
    # the embedding step raises.
    try:
        # Create the table (if it does not exist)
        create_table(conn)

        # Load SMILES from a CSV file; empty cells are read as NaN and are
        # dropped here so only real strings reach the model.
        df = pd.read_csv("./data/11betaHSD1/11betaHSD1.csv")
        smiles_data = df["SMILES"].dropna().tolist()

        # Process SMILES in batches and store to the database
        process_smiles_data(conn, smiles_data, batch_size=5000)
    finally:
        # Close the database connection
        conn.close()


2025-03-25 13:46:04 | unimol_tools/models/unimol.py | 146 | INFO | Uni-Mol(QSAR) | Loading pretrained weights from /mnt/USR_DATA/ChenGeng/anaconda3/envs/chemprop/lib/python3.8/site-packages/unimol_tools-1.0.0-py3.8.egg/unimol_tools/weights/mol_pre_no_h_220816.pt
Checking existing SMILES: 100%|██████████| 1240/1240 [00:00<00:00, 100981.22it/s]
Processing new SMILES in batches:   0%|          | 0/1 [00:00<?, ?it/s]2025-03-25 13:46:14 | unimol_tools/data/conformer.py | 90 | INFO | Uni-Mol(QSAR) | Start generating conformers...
1240it [00:12, 101.02it/s]
2025-03-25 13:46:27 | unimol_tools/data/conformer.py | 94 | INFO | Uni-Mol(QSAR) | Failed to generate conformers for 0.00% of molecules.
2025-03-25 13:46:27 | unimol_tools/data/conformer.py | 96 | INFO | Uni-Mol(QSAR) | Failed to generate 3d conformers for 0.24% of molecules.
100%|██████████| 39/39 [00:11<00:00,  3.34it/s]
Processing new SMILES in batches: 100%|██████████| 1/1 [02:46<00:00, 166.83s/it]


### Ankh Large protein embedding

In [1]:
# =========================
# 1. Imports and Environment Setup
# =========================

import os
import gc
import io
import torch
import sqlite3
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel


# Prefer the first CUDA GPU; fall back to the CPU when CUDA is unavailable
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# =========================
# 2. Function Definitions
# =========================

def read_data_in_chunks(file_path, chunk_size):
    """Lazily yield DataFrames of at most chunk_size rows read from a CSV."""
    yield from pd.read_csv(file_path, chunksize=chunk_size)

def create_table(conn):
    """
    Ensure the 'protein_embeddings' table exists.

    Rows are keyed by the protein sequence text and hold one BLOB column,
    'embedding'. Safe to call repeatedly (IF NOT EXISTS).
    """
    ddl = '''
    CREATE TABLE IF NOT EXISTS protein_embeddings (
        protein_sequence TEXT PRIMARY KEY,
        embedding BLOB
    )
    '''
    conn.execute(ddl)
    conn.commit()

def store_embedding(conn, protein_sequence, embedding):
    """
    Write one protein embedding to the database.

    The embedding is moved to host memory, converted to a NumPy array,
    serialised in .npy format and stored as a BLOB. An existing row for the
    same sequence is replaced. The transaction is committed before returning.
    """
    stream = io.BytesIO()
    np.save(stream, embedding.cpu().numpy())
    # getvalue() returns the full buffer independent of the stream position
    payload = sqlite3.Binary(stream.getvalue())

    insert_sql = '''
    INSERT OR REPLACE INTO protein_embeddings (protein_sequence, embedding)
    VALUES (?, ?)
    '''
    conn.execute(insert_sql, (protein_sequence, payload))
    conn.commit()

def get_protein_embedding(conn, protein_sequence):
    """
    Fetch the stored embedding for a protein sequence.

    Returns the deserialised ndarray, or None when the sequence has no row.
    """
    row = conn.execute('''
    SELECT embedding FROM protein_embeddings WHERE protein_sequence=?
    ''', (protein_sequence,)).fetchone()

    if not row:
        return None

    # NOTE(review): allow_pickle=True permits unpickling object arrays, which
    # can execute code from an untrusted database file.
    (blob,) = row
    return np.load(io.BytesIO(blob), allow_pickle=True)

# =========================
# 3. Main Execution
# =========================

if __name__ == "__main__":
    # Define paths and parameters
    # Path to the pre-trained Ankh model
    model_path = "./model/Ankh_Large/Ankh_Large_model.pth"
    data_path = "./data/11betaHSD1/11betaHSD1.csv"
    db_path = "./data/11betaHSD1/Ankh_Large_target_protein_embeddings_11betaHSD1.db"
    # Single source of truth for both the number of residues fed to the model
    # and the number of token vectors kept per protein.
    max_length = 1200
    chunk_size = 500

    # Load tokenizer and model
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = T5EncoderModel.from_pretrained(model_path, config=config)
    model.to(device)
    model.eval()

    # Connect to the database; try/finally guarantees it is closed even when
    # tokenisation or inference raises part-way through.
    conn = sqlite3.connect(db_path)
    try:
        create_table(conn)

        # Sequences already handled in this run. The CSV may repeat the same
        # target on many rows; this avoids re-querying and re-deserialising
        # the stored blob for each repeat.
        seen_sequences = set()

        # Process protein sequences in chunks
        for df_chunk in read_data_in_chunks(data_path, chunk_size):
            for original_seq in tqdm(df_chunk['Protein'], desc="Processing protein sequences"):
                # Empty CSV cells arrive as NaN (float); skip anything that
                # is not a string, and anything seen earlier in this run.
                if not isinstance(original_seq, str) or original_seq in seen_sequences:
                    continue
                seen_sequences.add(original_seq)

                # Skip if the embedding already exists in the database
                if get_protein_embedding(conn, original_seq) is not None:
                    continue

                # Slicing is a no-op for sequences shorter than max_length
                seq = original_seq[:max_length]
                tokenized = tokenizer.batch_encode_plus(
                    [list(seq)],
                    add_special_tokens=True,
                    padding=True,
                    is_split_into_words=True,
                    return_tensors="pt"
                )

                input_ids = tokenized['input_ids'].to(device)
                attention_mask = tokenized['attention_mask'].to(device)

                with torch.no_grad():
                    encoder_outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                token_repr = encoder_outputs.last_hidden_state

                # Special tokens can push the token count past max_length
                if token_repr.shape[1] > max_length:
                    token_repr = token_repr[:, :max_length, :]

                # The full (untruncated) sequence is the database key
                store_embedding(conn, original_seq, token_repr.squeeze(0))

                # Drop tensor references before the next forward pass
                del token_repr, encoder_outputs, input_ids, attention_mask, tokenized
                gc.collect()
    finally:
        # Close the database
        conn.close()


Using device: cuda:0


Processing protein sequences: 100%|██████████| 500/500 [00:00<00:00, 1039.77it/s]
Processing protein sequences: 100%|██████████| 500/500 [00:00<00:00, 1044.36it/s]
Processing protein sequences: 100%|██████████| 240/240 [00:00<00:00, 1060.95it/s]


### ProtT5 (ProstT5 checkpoint) protein embedding

In [2]:
# =========================
# 1. Imports and Environment Setup
# =========================
import re
import os
import gc
import io
import torch
import sqlite3
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import T5Tokenizer, T5EncoderModel


# Use the first CUDA GPU when one is available, otherwise the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# =========================
# 2. Function Definitions
# =========================

def read_data_in_chunks(file_path, chunk_size):
    """Yield successive DataFrames of up to chunk_size rows from a CSV file."""
    reader = pd.read_csv(file_path, chunksize=chunk_size)
    yield from reader

def create_table(conn):
    """Create the protein_embeddings table (sequence text -> embedding BLOB) unless it already exists."""
    cursor = conn.cursor()
    cursor.execute(''' 
    CREATE TABLE IF NOT EXISTS protein_embeddings (
        protein_sequence TEXT PRIMARY KEY,
        embedding BLOB
    )
    ''')
    conn.commit()

def store_embedding(conn, protein_sequence, embedding):
    """Serialise an embedding tensor to .npy bytes and upsert it under its protein sequence."""
    # Move to host memory and convert before serialising
    array = embedding.cpu().numpy()
    with io.BytesIO() as stream:
        np.save(stream, array)
        blob = sqlite3.Binary(stream.getvalue())

    # INSERT OR REPLACE overwrites any previous row for the same sequence
    params = (protein_sequence, blob)
    conn.execute('''
    INSERT OR REPLACE INTO protein_embeddings (protein_sequence, embedding)
    VALUES (?, ?)
    ''', params)
    conn.commit()

def get_protein_embedding(conn, protein_sequence):
    """Return the stored embedding ndarray for a protein sequence, or None if it has no row."""
    lookup_sql = '''
    SELECT embedding FROM protein_embeddings WHERE protein_sequence=?
    '''
    row = conn.execute(lookup_sql, (protein_sequence,)).fetchone()
    # NOTE(review): allow_pickle=True can execute code from an untrusted
    # database file; numeric arrays written by np.save do not require it.
    return np.load(io.BytesIO(row[0]), allow_pickle=True) if row else None

# =========================
# 3. Main Execution
# =========================

if __name__ == "__main__":
    # Define paths and parameters
    model_path = "./ProstT5/Rostlab/ProstT5"
    data_path = "./data/11betaHSD1/11betaHSD1.csv"
    db_path = "./data/11betaHSD1/ProtT5_target_protein_embeddings_11betaHSD1.db"

    # Maximum number of residues fed to the model per protein
    max_length = 1200
    chunk_size = 1000

    # Load tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained(model_path, do_lower_case=False)
    model = T5EncoderModel.from_pretrained(model_path).to(device)
    model.eval()

    # Connect to the database; try/finally guarantees the connection is
    # closed even when tokenisation or inference raises.
    conn = sqlite3.connect(db_path)
    try:
        create_table(conn)

        # Sequences already handled in this run, so repeated rows for the
        # same target do not trigger another lookup and blob deserialisation.
        seen_sequences = set()

        # Process protein sequences in chunks
        for df_chunk in read_data_in_chunks(data_path, chunk_size):
            for original_seq in tqdm(df_chunk['Protein'], desc="Processing protein sequences"):
                # Empty CSV cells arrive as NaN (float); skip non-strings
                if not isinstance(original_seq, str) or original_seq in seen_sequences:
                    continue
                seen_sequences.add(original_seq)

                # Skip if the embedding already exists in the database
                if get_protein_embedding(conn, original_seq) is not None:
                    continue

                # Truncate the sequence if it's too long (no-op otherwise),
                # then map the residue codes U/Z/O/B to X.
                seq = re.sub(r"[UZOB]", "X", original_seq[:max_length])
                # Space-separated residues, preceded by the "<AA2fold>"
                # prefix token used with this checkpoint.
                protein_sequences = ["<AA2fold> " + " ".join(seq)]

                # Tokenize the sequence
                tokenized = tokenizer.batch_encode_plus(
                    protein_sequences,
                    add_special_tokens=True,
                    padding="longest",
                    return_tensors="pt"
                ).to(device)

                # Generate embeddings
                with torch.no_grad():
                    embedding_repr = model(input_ids=tokenized.input_ids, attention_mask=tokenized.attention_mask)

                # Keep only the per-residue vectors. Assuming one token per
                # residue (as in the ProstT5 model-card example), the layout
                # is [prefix, res_1 .. res_L, end-of-sequence], so residues
                # sit at positions 1..L inclusive. The previous upper bound,
                # len(protein_sequences[0].split()) - 1 == L, stopped one
                # short and dropped the last residue.
                # NOTE(review): rows written by earlier runs hold L-1 vectors;
                # regenerate them if downstream code needs a uniform length.
                emb_0 = embedding_repr.last_hidden_state[0, 1:len(seq) + 1]

                # Store the embedding under the full, untruncated sequence
                store_embedding(conn, original_seq, emb_0)

                # Drop tensor references before the next forward pass
                del emb_0, embedding_repr, tokenized, protein_sequences
                gc.collect()
    finally:
        # Close the database connection
        conn.close()


You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Using device: cuda:0


Processing protein sequences: 100%|██████████| 1000/1000 [00:02<00:00, 483.28it/s]
Processing protein sequences: 100%|██████████| 240/240 [00:00<00:00, 1295.99it/s]


In [3]:
# =========================
# 1. Imports and Environment Setup
# =========================

import os
import gc
import io
import torch
import sqlite3
import numpy as np
import pandas as pd
from tqdm import tqdm
import esm


# Default to the CPU and upgrade to the first CUDA GPU when one is present
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda:0')
print(f"Using device: {device}")

# =========================
# 2. Function Definitions
# =========================

def read_data_in_chunks(file_path, chunk_size):
    """Stream a CSV file as a sequence of DataFrames holding at most chunk_size rows each."""
    yield from pd.read_csv(
        file_path,
        chunksize=chunk_size,
    )

def create_table(conn):
    """Set up the protein_embeddings table if the database does not have it yet."""
    schema_sql = ''' 
    CREATE TABLE IF NOT EXISTS protein_embeddings (
        protein_sequence TEXT PRIMARY KEY,
        embedding BLOB
    )
    '''
    conn.execute(schema_sql)
    conn.commit()

def store_embedding(conn, protein_sequence, embedding):
    """Upsert one protein's embedding, stored as .npy-encoded bytes in a BLOB column."""
    serialized = io.BytesIO()
    host_array = embedding.cpu().numpy()
    np.save(serialized, host_array)

    upsert_sql = '''
    INSERT OR REPLACE INTO protein_embeddings (protein_sequence, embedding)
    VALUES (?, ?)
    '''
    # getvalue() yields the complete buffer, so no rewind is needed
    conn.execute(upsert_sql, (protein_sequence, sqlite3.Binary(serialized.getvalue())))
    conn.commit()

def get_protein_embedding(conn, protein_sequence):
    """Load the embedding stored for a protein sequence; None means no row was found."""
    cursor = conn.execute('''
    SELECT embedding FROM protein_embeddings WHERE protein_sequence=?
    ''', (protein_sequence,))
    match = cursor.fetchone()
    if not match:
        return None

    # NOTE(review): allow_pickle=True can run code embedded in an untrusted
    # database file; it is not needed for plain numeric arrays.
    raw_bytes = io.BytesIO(match[0])
    return np.load(raw_bytes, allow_pickle=True)

def process_data(file_path, conn, batch_converter, model, max_length=1200, chunk_size=1000):
    """
    Embed every protein in a CSV with an ESM model and store the results.

    For each distinct string in the 'Protein' column that has no row in the
    database yet, the sequence is truncated to max_length residues, run
    through the model, and the layer-30 token representations are stored
    under the original (untruncated) sequence.

    :param file_path: Path to a CSV file with a 'Protein' column.
    :param conn: sqlite3.Connection holding the protein_embeddings table.
    :param batch_converter: Callable turning [(label, sequence)] into a
        (labels, strings, tokens) triple; only the tokens are used.
    :param model: ESM model; tokens are moved to whatever device its
        parameters live on, so the caller decides CPU vs GPU.
    :param max_length: Maximum residues fed to the model and maximum number
        of token vectors stored per protein.
    :param chunk_size: Number of CSV rows read per chunk.

    NOTE(review): the model output is stored without stripping any special
    tokens the batch converter may add, as the original code did; confirm
    that downstream consumers expect this.
    """
    # Inputs must be on the same device as the weights
    model_device = next(model.parameters()).device

    # Sequences already handled in this run; avoids repeated lookups and
    # blob deserialisation when many rows share the same target.
    seen_sequences = set()

    for df_chunk in read_data_in_chunks(file_path, chunk_size=chunk_size):
        for original_v_p in tqdm(df_chunk['Protein'], desc="Preprocessing data"):
            # Empty CSV cells arrive as NaN (float); skip non-strings
            if not isinstance(original_v_p, str) or original_v_p in seen_sequences:
                continue
            seen_sequences.add(original_v_p)

            # Skip if the embedding already exists in the database
            if get_protein_embedding(conn, original_v_p) is not None:
                continue

            # Truncate the sequence if it exceeds max_length (no-op otherwise)
            v_p = original_v_p[:max_length]

            # Convert the protein sequence into an embedding
            data = [("protein1", v_p)]
            _, _, batch_tokens = batch_converter(data)
            batch_tokens = batch_tokens.to(model_device)
            with torch.no_grad():
                # Only the representations are used, so contact prediction
                # is not requested.
                results = model(batch_tokens, repr_layers=[30], return_contacts=False)
            # Layer index 30 is hard-coded here, as in the original.
            # Remove the batch dimension
            token_representations = results["representations"][30].squeeze(0)

            # Truncate to max_length if necessary
            if token_representations.shape[0] > max_length:
                token_representations = token_representations[:max_length]

            # Store the embedding in the database
            store_embedding(conn, original_v_p, token_representations)

            # Drop tensor references before the next forward pass
            del token_representations, results, batch_tokens, data
            gc.collect()

# =========================
# 3. Main Execution
# =========================

if __name__ == "__main__":
    # Define file paths, relative to the project root like the other
    # embedding cells, instead of one machine's absolute paths.
    data_path = "./data/11betaHSD1/11betaHSD1.csv"
    db_path = "./data/11betaHSD1/ESM-2_150M_target_protein_embeddings_11betaHSD1.db"

    # Load ESM model and alphabet
    model, alphabet = esm.pretrained.esm2_t30_150M_UR50D()
    batch_converter = alphabet.get_batch_converter()
    # Actually place the model on the device selected above; previously the
    # device was printed but never used, so inference ran on the CPU.
    model = model.to(device)
    model.eval()

    # Connect to the database; try/finally guarantees it is closed on error
    conn = sqlite3.connect(db_path)
    try:
        create_table(conn)

        # Process protein sequences and generate embeddings
        process_data(data_path, conn, batch_converter, model)
    finally:
        # Close the database connection
        conn.close()


Using device: cuda:0


Preprocessing data: 100%|██████████| 1000/1000 [00:01<00:00, 685.17it/s]
Preprocessing data: 100%|██████████| 240/240 [00:00<00:00, 1702.18it/s]
