# Representations

## 1. Imports

In [2]:
import numpy as np
import pandas as pd

from numpy.typing import NDArray
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

# MolE representations were obtained following the instructions in the original repository:
# https://github.com/rolayoalarcon/MolE?tab=readme-ov-file
# Using a dedicated conda environment with the specified dependencies, 
# and running the provided scripts to generate the representations for our dataset. 
# The resulting TSV file was then read into a DataFrame for analysis.

import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


## 2. Molecular Fingerprints

### Create modifiable generator

In [3]:
### Morgan fingerprint generation using RDKit's rdFingerprintGenerator with count-based encoding. ###
# Create the generator once at import time
def get_morgan_generator(radius: int = 3, n_bits: int = 2048):
    return rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)

# Default global generator (common case)
_default_gen = get_morgan_generator()

### Define function

In [4]:
def smiles_to_morgan_fingerprint(smiles: str, generator = _default_gen) -> NDArray[np.int16]:
    N_BITS = generator.GetOptions().fpSize
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(N_BITS, dtype=np.int16) 
    count_fp = generator.GetCountFingerprint(mol)
    # Proper conversion from sparse vector to dense array
    fp_array = np.zeros(N_BITS, dtype=np.int16)
    
    # Fill in the non-zero elements
    for idx, count_val in count_fp.GetNonzeroElements().items():
        fp_array[idx] = count_val
    return fp_array

## 3. MolE embeddings

Texte explaining the way we got it and the outcome (float64) 1000 dimensions

## 4. chemBERTa embeddings

### Load model

In [5]:
# Load the pre-trained ChemBERTa model and tokenizer once at import time
_CHEMBERTA_MODEL_NAME = "seyonec/ChemBERTa-zinc-base-v1"

_tokenizer = AutoTokenizer.from_pretrained(_CHEMBERTA_MODEL_NAME)
_chemberta_model = AutoModel.from_pretrained(_CHEMBERTA_MODEL_NAME)
_chemberta_model.eval()  # disable dropout


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 651.62it/s, Materializing param=pooler.dense.weight]                             
[1mRobertaModel LOAD REPORT[0m from: seyonec/ChemBERTa-zinc-base-v1
Key                       | Status     |  | 
--------------------------+------------+--+-
lm_head.dense.weight      | UNEXPECTED |  | 
lm_head.decoder.weight    | UNEXPECTED |  | 
lm_head.dense.bias        | UNEXPECTED |  | 
lm_head.bias              | UNEXPECTED |  | 
lm_head.layer_norm.weight | UNEXPECTED |  | 
lm_head.decoder.bias      | UNEXPECTED |  | 
lm_head.layer_norm.bias   | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(767, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): 

Explain wtf is this

### Def function

In [None]:
def chemberta_embedder(smiles: str) -> NDArray[np.float32]:
    inputs = _tokenizer(
        smiles,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=1024 #vancomycin's SMILES has more than 700 tokens, 
        #so we set a higher limit to avoid truncation for large molecules
    )
    with torch.no_grad():
        outputs = _chemberta_model(**inputs)

    # Mean pooling over token embeddings (excluding padding)
    last_hidden = outputs.last_hidden_state  # (1, seq_len, hidden_dim)
    attention_mask = inputs["attention_mask"].unsqueeze(-1)

    masked_hidden = last_hidden * attention_mask
    sum_hidden = masked_hidden.sum(dim=1)
    valid_tokens = attention_mask.sum(dim=1)

    embedding = sum_hidden / valid_tokens

    return embedding.squeeze(0).cpu().numpy().astype(np.float64) #float 64 is used for compatibility with MolE embeddings, which are also float64.


## 5. Harmonization in one function

In [None]:

def featurize_smiles(smiles: str, method: str = "morgan") -> NDArray[np.int16 | np.float64]:
    if method == "morgan":
        return smiles_to_morgan_fingerprint(smiles).astype(np.int16)
    elif method == "chemberta":
        return chemberta_embedder(smiles).astype(np.float64)
    else:
        raise ValueError(f"Unknown featurization method: {method}")

: 

## 6. function calls

In [None]:
db = pd.read_csv("DrugBank_curated_df.csv")

ids = []
fingerprints = []
chemberta_embeddings = []

for compound_id, smiles in zip(db["DrugBank ID"], db["SMILES"]):
    fp = featurize_smiles(smiles, method="morgan")
    emb = featurize_smiles(smiles, method="chemberta")
    ids.append(compound_id)
    fingerprints.append(fp)
    chemberta_embeddings.append(emb)

## 7. Data formatting

In [None]:
# Convert to 2D numpy array
fingerprint_matrix = np.vstack(fingerprints)
chemberta_matrix = np.vstack(chemberta_embeddings)

    # Create DataFrame with explicit column names
morgan_df = pd.DataFrame(
    fingerprint_matrix,
    index=ids,
    columns=[str(i) for i in range(fingerprint_matrix.shape[1])]
)
chemberta_df = pd.DataFrame(
    chemberta_matrix,
    index=ids,
    columns=[str(i) for i in range(chemberta_matrix.shape[1])]
)

## 8. Data comprobations

In [None]:
mole_df = pd.read_csv("MolE_output_representation.tsv", sep="\t", index_col=0)
print("Shape of the MolE representation DataFrame:")
print(mole_df.shape)
print("Shape of the Morgan fingerprint DataFrame:")
print(morgan_df.shape)
print(round(morgan_df.memory_usage(deep=True).sum() / 1024**2, 2), "MB")
print("Shape of the ChemBERTa embedding DataFrame:")
print(chemberta_df.shape)