# ChemBERTa/end2end Feature Generation


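This notebook demonstrates how to generate ChemBERTa embeddings for host-guest pairs. Embeddings are produced with the pretrained ChemBERTa model (no end2end training), with the trained end2end model, and, when its checkpoint is available, with the fine-tuned end2end model. The embeddings are extracted from the transformer layer, before the FNN processing. A trained end2end model (string or string_finetuned) is required for the end2end embeddings.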

In [None]:
from pathlib import Path
import pandas as pd
import sys

# Add parent directory to path for imports
sys.path.append(str(Path.cwd().parent))

from cd_host_guest.modeling.train import EndToEndTrainer
from cd_host_guest.config import MODELS_DIR, PROCESSED_DATA_DIR, EXTERNAL_DATA_DIR

In [None]:
# Dataset and input representation this notebook works on
data_source = "OpenCycloDB"
representation = "string"

# Inputs and model directory handed to the trainer
processed_dir = PROCESSED_DATA_DIR / data_source
features_path = processed_dir / f"{data_source}_{representation}.csv"
labels_path = processed_dir / f"{data_source}_labels.csv"
model_dir = MODELS_DIR / data_source / representation

# Build the trainer that every later cell uses for embedding extraction
trainer_kwargs = {
    "features_path": features_path,
    "labels_path": labels_path,
    "model_path": model_dir,
    "verbose": True,
}
trainer = EndToEndTrainer(**trainer_kwargs)

print(f"Trainer initialized with model directory: {model_dir}")

In [None]:
# Load the pretrained ChemBERTa model (no end2end training)
print("Loading pretrained ChemBERTa model...")
trainer.build_SMILESTokenizer_layer()
print(f"✅ Pretrained model loaded: {trainer.transformer}")

# Destination of the embeddings produced with the pretrained model
pretrained_output_path = (
    PROCESSED_DATA_DIR
    / data_source
    / f"{data_source}_canonical_embeddings_pretrained.csv"
)
# NOTE(review): the input is the `{representation}` features file, while the
# output file name says "canonical" — confirm this pairing is intended.
data_path = features_path

print("\nExtracting embeddings from pretrained model...")
print(f"Input: {data_path}")
print(f"Output: {pretrained_output_path}")

# No checkpoint is passed and loading is skipped, so extraction runs with the
# model that was set up on the trainer above.
pretrained_embeddings = trainer.extract_embeddings(
    data_path=data_path,
    model_path=None,  # Not loading a trained model
    output_path=pretrained_output_path,
    skip_model_loading=True,  # Use the already-loaded pretrained model
)

print("\n✅ Pretrained embeddings extracted successfully!")
print(f"Shape: {pretrained_embeddings.shape}")

In [None]:
# Extract pretrained embeddings for the external validation datasets.
# Both calls skip model loading and pass no checkpoint, so they run with
# whatever model is currently set up on the trainer.

# --- cd_val ---
cd_val_data_path = EXTERNAL_DATA_DIR / "cd_val" / "cd_val_canonical.csv"
cd_val_pretrained_output = (
    EXTERNAL_DATA_DIR / "cd_val" / "cd_val_canonical_embeddings_pretrained.csv"
)
print("\n📊 Processing cd_val with pretrained model...")
cd_val_pretrained = trainer.extract_embeddings(
    data_path=cd_val_data_path,
    model_path=None,
    output_path=cd_val_pretrained_output,
    skip_model_loading=True,
)
print(f"✅ cd_val pretrained embeddings: {cd_val_pretrained.shape}")

# --- pfas_val ---
pfas_val_data_path = EXTERNAL_DATA_DIR / "pfas_val" / "pfas_val_canonical.csv"
pfas_val_pretrained_output = (
    EXTERNAL_DATA_DIR / "pfas_val" / "pfas_val_canonical_embeddings_pretrained.csv"
)
print("\n📊 Processing pfas_val with pretrained model...")
pfas_val_pretrained = trainer.extract_embeddings(
    data_path=pfas_val_data_path,
    model_path=None,
    output_path=pfas_val_pretrained_output,
    skip_model_loading=True,
)
print(f"✅ pfas_val pretrained embeddings: {pfas_val_pretrained.shape}")

In [None]:
# Path to the trained model (base string model)
model_path = model_dir / "end2end_string.pth"

# Input data path (canonical SMILES)
data_path = PROCESSED_DATA_DIR / data_source / f"{data_source}_canonical.csv"

# Output path for embeddings
output_path = (
    PROCESSED_DATA_DIR / data_source / f"{data_source}_canonical_embeddings.csv"
)

# Extract embeddings with the trained checkpoint; unlike the pretrained run,
# a model path is passed and model loading is not skipped.
print(f"\nExtracting embeddings from: {data_path}")
print(f"Using model: {model_path}")
print(f"Saving to: {output_path}")

embeddings_df = trainer.extract_embeddings(
    data_path=data_path, model_path=model_path, output_path=output_path
)

print("\n✅ Embeddings extracted successfully!")
print(f"Shape: {embeddings_df.shape}")

In [None]:
# Extract embeddings for cd_val with the trained base model.
# The input path is defined here as well so this cell does not depend on the
# pretrained-model cells having been run first.
cd_val_data_path = EXTERNAL_DATA_DIR / "cd_val" / "cd_val_canonical.csv"
cd_val_output_path = EXTERNAL_DATA_DIR / "cd_val" / "cd_val_canonical_embeddings.csv"

print("\n📊 Processing cd_val dataset...")
print(f"Input: {cd_val_data_path}")
print(f"Output: {cd_val_output_path}")

# NOTE(review): skip_model_loading is not set, so the checkpoint at model_path
# is presumably loaded again for this call — confirm, and pass
# skip_model_loading=True if reusing the in-memory model is intended.
cd_val_embeddings = trainer.extract_embeddings(
    data_path=cd_val_data_path,
    model_path=model_path,  # Same base checkpoint as used for the training data
    output_path=cd_val_output_path,
)

print(f"✅ cd_val embeddings: {cd_val_embeddings.shape}")

In [None]:
# Extract embeddings for pfas_val with the trained base model.
# The input path is defined here as well so this cell does not depend on the
# pretrained-model cells having been run first.
pfas_val_data_path = EXTERNAL_DATA_DIR / "pfas_val" / "pfas_val_canonical.csv"
pfas_val_output_path = (
    EXTERNAL_DATA_DIR / "pfas_val" / "pfas_val_canonical_embeddings.csv"
)

print("\n📊 Processing pfas_val dataset...")
print(f"Input: {pfas_val_data_path}")
print(f"Output: {pfas_val_output_path}")

# NOTE(review): skip_model_loading is not set, so the checkpoint at model_path
# is presumably loaded again for this call — confirm, and pass
# skip_model_loading=True if reusing the in-memory model is intended.
pfas_val_embeddings = trainer.extract_embeddings(
    data_path=pfas_val_data_path,
    model_path=model_path,  # Same base checkpoint as used for the training data
    output_path=pfas_val_output_path,
)

print(f"✅ pfas_val embeddings: {pfas_val_embeddings.shape}")

In [None]:
# Path to the fine-tuned model
finetuned_model_path = model_dir / "end2end_string_finetuned_final.pth"

# Pin the inputs explicitly. `data_path` is also bound to the features file in
# the pretrained-model cell, so relying on whichever cell ran last could feed
# the wrong file to the fine-tuned model; the external paths are repeated so
# this cell does not require the pretrained-model cells.
data_path = PROCESSED_DATA_DIR / data_source / f"{data_source}_canonical.csv"
cd_val_data_path = EXTERNAL_DATA_DIR / "cd_val" / "cd_val_canonical.csv"
pfas_val_data_path = EXTERNAL_DATA_DIR / "pfas_val" / "pfas_val_canonical.csv"

# The fine-tuned checkpoint is optional: extract with it when the file exists,
# otherwise report that it is missing and skip.
if finetuned_model_path.exists():
    print(f"🔄 Using fine-tuned model: {finetuned_model_path}")

    # Extract embeddings for training data with fine-tuned model
    finetuned_output_path = (
        PROCESSED_DATA_DIR
        / data_source
        / f"{data_source}_canonical_embeddings_finetuned.csv"
    )

    finetuned_embeddings = trainer.extract_embeddings(
        data_path=data_path,
        model_path=finetuned_model_path,
        output_path=finetuned_output_path,
    )

    print(f"✅ Fine-tuned embeddings: {finetuned_embeddings.shape}")

    # Extract for external validation datasets
    cd_val_finetuned_output = (
        EXTERNAL_DATA_DIR / "cd_val" / "cd_val_canonical_embeddings_finetuned.csv"
    )
    cd_val_finetuned = trainer.extract_embeddings(
        data_path=cd_val_data_path,
        model_path=finetuned_model_path,
        output_path=cd_val_finetuned_output,
    )
    print(f"✅ cd_val fine-tuned embeddings: {cd_val_finetuned.shape}")

    pfas_val_finetuned_output = (
        EXTERNAL_DATA_DIR / "pfas_val" / "pfas_val_canonical_embeddings_finetuned.csv"
    )
    pfas_val_finetuned = trainer.extract_embeddings(
        data_path=pfas_val_data_path,
        model_path=finetuned_model_path,
        output_path=pfas_val_finetuned_output,
    )
    print(f"✅ pfas_val fine-tuned embeddings: {pfas_val_finetuned.shape}")
else:
    print(f"⚠️ Fine-tuned model not found at: {finetuned_model_path}")
    print("Skipping fine-tuned embeddings extraction.")