In [None]:
import pandas as pd
import numpy as np
from eosce.models import ErsiliaCompoundEmbeddings
from pathlib import Path



In [None]:
# 1. Read the Tox21 NR-AR dataset from its Parquet file.
#    Point the path below at your own copy of the data if it lives elsewhere.
tox21_path = Path("..") / "data" / "Single" / "tox21_NR-AR.parquet"
tox21_df = pd.read_parquet(path=tox21_path)

In [None]:
# 2. Create the Ersilia compound-embedding featurizer.
#    (Per the original author's note it wraps the eos2gw4 model; not verifiable from this notebook.)
model = ErsiliaCompoundEmbeddings()

# 3. Settings for featurizing the SMILES strings in batches
batch_size = 1_000   # number of molecules handed to the model per call
embeddings = list()  # collects the vectors returned by the model

In [None]:
# Featurize the SMILES strings stored in the "Drug" column, one batch at a time.
#
# Rows with a missing SMILES are removed from the frame *before* batching so that
# embeddings[k] always belongs to tox21_df.iloc[k]. Dropping them inside each batch
# would shorten the embedding list and shift every later embedding onto the wrong row.
tox21_df = tox21_df.dropna(subset=["Drug"])

# Start from an empty list so that re-running this cell does not append duplicates.
embeddings = []

for start in range(0, len(tox21_df), batch_size):
    batch = tox21_df["Drug"].iloc[start:start + batch_size].tolist()
    batch_embeddings = list(model.transform(batch))
    # Fail loudly if the model skipped or added entries, since that would break the
    # one-embedding-per-row alignment the later cells rely on.
    if len(batch_embeddings) != len(batch):
        raise ValueError(
            f"Expected {len(batch)} embeddings for the batch starting at row {start}, "
            f"got {len(batch_embeddings)}"
        )
    embeddings.extend(batch_embeddings)

In [None]:
# 4. Save featurized data
#
# Select exactly the rows that were featurized (those with a SMILES in "Drug") instead of
# truncating by position: truncation would pair embeddings with the wrong rows whenever a
# missing SMILES was skipped, and would hide any length mismatch.
featurized_df = tox21_df.dropna(subset=["Drug"]).copy()
if len(featurized_df) != len(embeddings):
    raise ValueError(
        f"{len(embeddings)} embeddings do not match {len(featurized_df)} rows with a SMILES; "
        "reset `embeddings` and re-run the featurization loop"
    )
featurized_df["embedding"] = embeddings  # a list is assigned by position, one entry per row

output_path = Path("../output/Single/tox21_NR-AR_featurized.parquet")
# Create the output folder first so the write does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
featurized_df.to_parquet(output_path)  # Save back to Parquet

In [None]:
# Read the saved file back from disk and preview its first five rows.
(
    pd.read_parquet("../output/Single/tox21_NR-AR_featurized.parquet")
    .head(n=5)
)

Unnamed: 0,Drug_ID,Drug,Y,embedding
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,0.0,"[0.05185101, 0.19479074, 0.052489955, -0.09554..."
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,0.0,"[0.12783396, -0.0024298094, -0.071678214, -0.0..."
2,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,0.0,"[0.14798662, -0.098763496, -0.14953455, -0.125..."
3,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,0.0,"[-0.0025056677, -0.045134634, -0.031868976, 0...."
4,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C,0.0,"[-0.031643886, -0.07929968, 0.00910799, -0.133..."


In [None]:
# Sanity checks on the featurized frame; a failing assert stops the cell with its message.

# The frame has one row per collected embedding.
row_count = len(featurized_df)
assert row_count == len(embeddings), "Row count mismatch!"

# No stored embedding has a length other than 1024.
has_wrong_size = any(len(vector) != 1024 for vector in featurized_df["embedding"])
assert not has_wrong_size, "Incorrect embedding dimensions!"

# The number of distinct Drug_ID values equals the number of rows.
distinct_ids = featurized_df["Drug_ID"].nunique()
assert distinct_ids == row_count, "Duplicate Drug_IDs!"

print("✅ All checks passed!")

✅ All checks passed!
