# 1. Installation of requirements

In [1]:
# required packages and basic function(s)
from rdkit import Chem
from mordred import Calculator, descriptors
import pandas as pd
import numpy as np
import math
import random
import tqdm
import os
import shutil
from pathlib import Path
import subprocess
import logging
from multiprocessing import freeze_support

# Configure the root logger so INFO-level messages emitted by the cells below are shown.
logging.basicConfig(level=logging.INFO)
# NOTE(review): freeze_support() is documented as only relevant for frozen
# Windows executables that use multiprocessing; presumably kept as a safeguard
# for descriptor calculation - confirm whether it is still needed here.
freeze_support()

def safe_div(value, factor, feature_name, printed_flag):
    """Divide ``value`` by ``factor`` (coerced to ``np.float64``), or return 0.

    Any exception raised by the coercion or by the division is swallowed and 0
    is returned in its place. ``printed_flag`` is a one-element mutable list
    acting as a "already reported" marker shared between calls: the notice
    naming ``feature_name`` is printed only while ``printed_flag[0]`` is falsy,
    after which the flag is set to True so further failures stay silent.
    """
    try:
        quotient = value / np.float64(factor)
    except Exception:
        already_reported = printed_flag[0]
        if already_reported:
            return 0
        # first failure for this feature: report it once, then remember that we did
        print(f"{feature_name} unable to be calculated, setting invalid values to 0")
        printed_flag[0] = True
        return 0
    return quotient

# 2. Splitting data into test/train splits


The pre-trained AGILE deep learning model is provided in `AGILE/ckpt/pretrained_agile_60k` and will be fine-tuned on five cross-validation splits.

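The data provided in the [AGILE repository](https://github.com/bowang-lab/AGILE) is randomly partitioned into five equally sized folds. Each cross-validation split holds out one fold (20%) as its test set and trains on the remaining four folds (80%).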

In this section we will split the data.

In [6]:
# split AGILE data (Morgan fingerprints included)
random.seed(0)  # fixed seed so the random fold assignment is reproducible
no_splits = 5

# read the AGILE fine-tuning set from ../data/LNPDB_for_AGILE/AGILE/data/data
df = pd.read_csv("../data/LNPDB_for_AGILE/AGILE/data/data/finetuning_set_smiles_plus_features.csv")


def create_df(name):
    """Return a new, empty DataFrame.

    ``name`` is accepted for backward compatibility only and is not used.
    """
    return pd.DataFrame()


# Assign every row to exactly one fold. Only row positions are recorded here;
# the DataFrames are built once afterwards instead of growing them with one
# pd.concat per row (which copies the whole frame each time).
# A fold that already holds len(df)/no_splits rows is considered full and a new
# fold is drawn, so the folds end up (near-)equally sized.
max_rows_per_split = len(df) / no_splits
split_positions = [[] for _ in range(no_splits)]
for position in range(len(df)):
    assignment = random.randint(0, no_splits - 1)
    while len(split_positions[assignment]) >= max_rows_per_split:
        assignment = random.randint(0, no_splits - 1)
    split_positions[assignment].append(position)

# output_dataframes[i]     -> rows assigned to fold i (test set of split i)
# complement_dataframes[i] -> all other rows, in original order (train set of split i)
output_dataframes = []
complement_dataframes = []
for positions in split_positions:
    held_out = set(positions)
    remaining = [p for p in range(len(df)) if p not in held_out]
    output_dataframes.append(df.iloc[positions].reset_index(drop=True))
    complement_dataframes.append(df.iloc[remaining].reset_index(drop=True))

# save to ../data/LNPDB_for_AGILE/cv_splits
cv_dir = Path("../data/LNPDB_for_AGILE/cv_splits")
cv_dir.mkdir(parents=True, exist_ok=True)

for j, dataframe in enumerate(output_dataframes):
    dataframe.to_csv(cv_dir / f"df{j}_test.csv", index=False)

for k, dataframe in enumerate(complement_dataframes):
    dataframe.to_csv(cv_dir / f"df{k}_train.csv", index=False)

logging.info("Cross-validation splits saved to cv_splits/")


INFO:root:Cross-validation splits saved to cv_splits/


# 3. Finetuning models on cross-validation splits

The following section involves preparing files according to [AGILE](https://github.com/bowang-lab/AGILE).
    
Note that the trained model checkpoints are already provided at `LNPDB/data/LNPDB_for_AGILE/cv_splits`, so it is not necessary to run the commands.

The models and their results are now placed in `LNPDB/data/LNPDB_for_AGILE/cv_splits`.

## 3.1 Preparing finetune_LNPDB.py and YAML/split

In [10]:
lnpdb_agile_base = Path("../data/LNPDB_for_AGILE")
scripts_dir = lnpdb_agile_base / "scripts"
# The AGILE checkout sits inside LNPDB_for_AGILE (the same folder the data cells
# read from), so derive it from lnpdb_agile_base. It is made absolute because
# later cells use it both as the subprocess working directory and to build the
# script paths handed to that subprocess.
agile_dir = (lnpdb_agile_base / "AGILE").resolve()

# Copy finetune script + yaml into AGILE, and patch YAML per split
finetune_py = scripts_dir / "finetune_LNPDB.py"
finetune_yaml_template = scripts_dir / "config_finetune.yaml"

# Ensure files exist
assert finetune_py.exists(), "finetune_LNPDB.py not found in scripts/"
assert finetune_yaml_template.exists(), "config_finetune.yaml not found in scripts/"

# Copy finetune_LNPDB.py into AGILE
shutil.copy(finetune_py, agile_dir / "finetune_LNPDB.py")

# Generate one YAML per split
finetune_config_dir = agile_dir / "finetune"
finetune_config_dir.mkdir(parents=True, exist_ok=True)

# The template is identical for every split, so read it once.
task_placeholder = "task_name: lnp_hela_with_feat"
with open(finetune_yaml_template, "r") as f:
    yaml_template_text = f.read()
if task_placeholder not in yaml_template_text:
    # str.replace would silently leave the task name untouched in this case
    logging.warning(
        f"'{task_placeholder}' not found in {finetune_yaml_template}; "
        "the per-split YAMLs will be plain copies of the template."
    )

for i in range(5):
    yaml_target = finetune_config_dir / f"agile_lnp_hela_cv_{i}.yaml"
    yaml_text = yaml_template_text.replace(
        task_placeholder,
        f"task_name: LNPDB_split_{i}"
    )
    with open(yaml_target, "w") as f:
        f.write(yaml_text)

logging.info("finetune_LNPDB.py and split YAMLs prepared in AGILE/")


INFO:root:finetune_LNPDB.py and split YAMLs prepared in AGILE/


## 3.2 Finetuning models on respective split train data

In [None]:
# finetune pre-trained AGILE model on 5 cross-validation splits
# (expects finetune_LNPDB.py and finetune/agile_lnp_hela_cv_{i}.yaml to be
#  present inside the AGILE folder already)
cv_number_list = [0,1,2,3,4]

# The child process runs with cwd=AGILE, so every path handed to it must be
# absolute; a relative path would be resolved against the new working directory.
agile_root = Path(agile_dir).resolve()

for i in cv_number_list:
    yaml_path = agile_root / f"finetune/agile_lnp_hela_cv_{i}.yaml"
    command = ["python", str(agile_root / "finetune_LNPDB.py"), str(yaml_path)]
    result = subprocess.run(command, cwd=agile_root, capture_output=True, text=True)
    print(result.stdout)
    print(result.stderr)
    if result.returncode != 0:
        # keep going with the remaining splits, but make the failure visible
        logging.error(f"finetune_LNPDB.py exited with code {result.returncode} for split {i}")

# 4. Generating LNPDB data feature descriptors

To use the trained models to predict delivery efficacy for new LNP data, LNPDB data has been placed into the folder `LNPDB/data/LNPDB_for_AGILE/LNPDB_data`.

AGILE requires data to be processed into Mordred molecular feature descriptors, which are generated using [repo](https://github.com/mordred-descriptor/mordred) as described in the sixth cell of the notebook. Note that the "descriptors_full" output is from directly generating Mordred descriptors, while the "plus_features" output is with adjustment to required AGILE format. For all LNPDB data included in the paper, this repository already contains the corresponding Mordred descriptors in `LNPDB/data/LNPDB_for_AGILE/LNPDB_data`.

Our Mordred descriptors were validated by comparing generation from AGILE original SMILES and provided fingerprints. The procedure is to generate Mordred descriptors using solely the smiles and Experiment_value columns, and then compare the resulting descriptors with those provided by the AGILE [repo](https://github.com/bowang-lab/AGILE).

## 4.1 Generate descriptors

In [19]:
# Generate Mordred descriptors for heldout datasets and align with known features
def heldout_data(heldout_data_path: str,
                 heldout_data_output_folder: str,
                 known_data_path: str) -> None:
    """
    Generate Mordred descriptors for molecules in heldout_data, then select only
    those features that appear in known_data and append them to the heldout_data.
    Saves both the full descriptor table and the filtered heldout_data to CSV files.

    Parameters:
    - heldout_data_path: path to CSV with columns ['smiles', ...]
    - heldout_data_output_folder: folder to create for output files
    - known_data_path: path to CSV whose 'desc_*' columns define the desired features

    Outputs (written to heldout_data_output_folder, named after the input file stem):
    - <stem>_descriptors_full.csv: every calculated descriptor
    - <stem>_plus_features.csv: the non-'desc_' columns of the heldout CSV followed
      by the 'desc_*' columns of known_data, in known_data order

    Column handling:
    - 'desc_<name>/<factor>' is the descriptor <name> divided by <factor>
    - 'desc_log_VR1_A', 'desc_log_VR2_A', 'desc_log_SdssC' are log10 of the
      descriptor without the 'log_' prefix (0 for non-positive or invalid values)
    - a descriptor containing NaN values is reported and its whole output column
      is left empty; a descriptor that was not calculated is filled with NA
    """

    df_heldout = pd.read_csv(heldout_data_path)
    logging.info(f"Read heldout data with {len(df_heldout)} molecules.")

    mols = [Chem.MolFromSmiles(smi) for smi in df_heldout['smiles']]
    # NOTE(review): RDKit is expected to hand back None for SMILES it cannot
    # parse; report those up front so a later failure is easier to trace.
    n_unparsed = sum(mol is None for mol in mols)
    if n_unparsed:
        logging.warning(f"{n_unparsed} SMILES could not be parsed into molecules.")

    calc = Calculator(descriptors, ignore_3D=True)
    df_desc = calc.pandas(mols, nproc=1)

    # clean columns
    if isinstance(df_desc.columns, pd.MultiIndex):
        df_desc.columns = [str(name) for name, _ in df_desc.columns]
    else:
        df_desc.columns = [str(c) for c in df_desc.columns]

    df_desc.replace([np.inf, -np.inf], pd.NA, inplace=True)

    n_missing_values = df_desc.isna().sum().sum()
    if n_missing_values > 0:
        logging.warning(f"Detected {n_missing_values} NaN or overflow values after descriptor calculation.")

    os.makedirs(heldout_data_output_folder, exist_ok=True)
    dataset_name = Path(heldout_data_path).stem

    desc_full_path = os.path.join(heldout_data_output_folder,
                                  f"{dataset_name}_descriptors_full.csv")
    df_desc.to_csv(desc_full_path, index=False)
    logging.info(f"Full descriptor table saved to {desc_full_path}.")

    df_known = pd.read_csv(known_data_path)
    desired_cols = [col for col in df_known.columns if col.startswith('desc_')]
    logging.info(f"Found {len(desired_cols)} desired desc_* columns in known data.")

    # collect matching columns
    new_features = {}
    missing_features = []
    overflow_features = []
    # some features used by AGILE are log-Mordred features
    log_features = ('log_VR1_A', 'log_VR2_A', 'log_SdssC')

    for col in desired_cols:
        # some features used by AGILE are scaled ('desc_<name>/<factor>')
        feature_name, scaling_factor = _split_desc_column(col)

        if feature_name in df_desc.columns:
            feature_values = df_desc[feature_name]
            if feature_values.isna().any():
                logging.warning(f"Feature '{feature_name}' contains NaNs due to overflow during calculation.")
                overflow_features.append(feature_name)
                # The column is deliberately not added: the reindex below
                # leaves it entirely empty in the output.
                continue
            divisor = scaling_factor if scaling_factor else 1
            printed_flag = [False]  # lets safe_div report a failing feature only once
            new_features[col] = [
                safe_div(value, divisor, feature_name, printed_flag)
                for value in feature_values.values
            ]
            logging.info(f"Added feature column '{col}' from descriptor '{feature_name}'.")
        elif feature_name in log_features and feature_name[len('log_'):] in df_desc.columns:
            feature_values = df_desc[feature_name[len('log_'):]]
            if feature_values.isna().any():
                logging.warning(f"Feature '{feature_name}' contains NaNs due to overflow during calculation.")
                overflow_features.append(feature_name)
            new_features[col] = [_log10_or_zero(value) for value in feature_values.values]
            logging.info(f"Added feature column '{col}' from descriptor '{feature_name}'.")
        else:
            logging.warning(f"Descriptor '{feature_name}' not found in calculated descriptors. Filling NaNs.")
            new_features[col] = [pd.NA] * len(df_heldout)
            missing_features.append(feature_name)

    # Put the descriptor columns in known_data order; columns that were skipped
    # above are created empty by the reindex.
    df_new_features = pd.DataFrame(new_features).reindex(columns=desired_cols)

    # Final table: smiles + labels (non-desc_ columns of the heldout CSV) first,
    # then the freshly computed descriptors. Any 'desc_*' column already present
    # in the heldout CSV is dropped in favour of the computed one.
    non_desc_cols = [c for c in df_heldout.columns if not c.startswith("desc_")]
    df_heldout_final = pd.concat(
        [df_heldout[non_desc_cols].reset_index(drop=True),
         df_new_features.reset_index(drop=True)],
        axis=1
    )

    heldout_out_path = os.path.join(heldout_data_output_folder, f"{dataset_name}_plus_features.csv")

    df_heldout_final.to_csv(heldout_out_path, index=False)

    logging.info(f"Augmented heldout data (with {len(desired_cols)} descriptors) saved to {heldout_out_path}")

    # Warn about missing features if any
    if missing_features:
        logging.warning(f"{len(missing_features)} features were missing: {missing_features}")

    if overflow_features:
        logging.warning(f"{len(overflow_features)} features contain NaN values due to overflow: {overflow_features}")


def _split_desc_column(col):
    """Split 'desc_<name>[/<factor>]' into (name, factor), with factor None when absent."""
    feature_name, _, scaling_factor = col[len('desc_'):].partition('/')
    return feature_name, (scaling_factor or None)


def _log10_or_zero(value):
    """Return log10(value) for positive numbers and 0 for anything else (<= 0, NaN, NA, non-numeric)."""
    try:
        return math.log10(value) if value > 0 else 0
    except TypeError:
        # pd.NA (ambiguous truth value) or a non-numeric placeholder object
        return 0

# Example call (edit datasets as needed)
# Only heldout_data_path has to change from one dataset to the next.
heldout_data(
    heldout_data_path="../data/LNPDB_for_AGILE/AGILE/data/data/finetuning_set_smiles_plus_features.csv",
    heldout_data_output_folder="../data/LNPDB_for_AGILE/outputs",
    known_data_path="../data/LNPDB_for_AGILE/AGILE/data/data/finetuning_set_smiles_plus_features.csv",
)

INFO:root:Read heldout data with 1200 molecules.
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
100%|██████████████████████████████████████████████████████████████████████████████| 1200/1200 [04:39<00:00,  4.29it/s]
INFO:root:Full descriptor table saved to ../data/LNPDB_for_AGILE/outputs\finetuning_set_smiles_plus_features_descriptors_full.csv.
INFO:root:Found 813 desired desc_* columns in known data.
INFO:root:Added feature column 'desc_ABC/10' from descriptor 'ABC'.
INFO:root:Added feature column 'desc_ABCGG/10' from descriptor 'ABCGG'.
INFO:root:Added feature column 'desc_nBase' from descriptor 'nBase'.
INFO:root:Added feature column 'desc_SpAbs_A/30' from descriptor 'SpAbs_A'.
INFO:root:Added feature column 'desc_SpMax_A' from descriptor 'SpMax_A'.
INFO:root:Added feature column 'desc_SpDiam_A' from descriptor 'SpDiam_A'.
INFO:root:Added feature column 'desc_SpAD_A/30' from descriptor 'SpAD_A'.
INFO:root:Added feature column 'desc_SpMAD_A' from descriptor 'SpMAD_A'.
INF

## 4.2 Validate descriptor generation

In [20]:
# Compare generated Mordred descriptors vs. known features, with summary stats
def compare_descriptors(generated_path: str, known_path: str, n_check: int = 5,
                        atol: float = 1e-6, rtol: float = 1e-4):
    """
    Compare a generated Mordred descriptor CSV to a known training feature CSV.

    Only columns whose name starts with 'desc_' and that exist in both files are
    compared, row by row in file order. A report is printed; nothing is returned.

    Parameters:
    - generated_path: path to heldout_data_output.csv (with descriptors)
    - known_path: path to finetuning_set_smiles_plus_features.csv
    - n_check: number of molecules to sample for detailed row comparison
      (values <= 0 skip the sampling; capped at the number of compared rows)
    - atol, rtol: tolerances for numerical equality (absolute, relative)
    """
    df_gen = pd.read_csv(generated_path)
    df_known = pd.read_csv(known_path)

    # Select only descriptor columns (start with desc_)
    gen_cols = [c for c in df_gen.columns if c.startswith("desc_")]
    known_cols = [c for c in df_known.columns if c.startswith("desc_")]

    # Check overlap
    overlap = sorted(set(gen_cols) & set(known_cols))
    missing_in_gen = set(known_cols) - set(gen_cols)
    extra_in_gen = set(gen_cols) - set(known_cols)

    print(f"Total known features: {len(known_cols)}")
    print(f"Total generated features: {len(gen_cols)}")
    print(f"Overlap: {len(overlap)}")
    if missing_in_gen:
        print(f"❌ Missing in generated: {sorted(list(missing_in_gen))[:10]} ...")
    if extra_in_gen:
        print(f"⚠️ Extra in generated: {sorted(list(extra_in_gen))[:10]} ...")

    # Early exit if no overlap
    if not overlap:
        print("No overlapping descriptors to compare.")
        return

    # Rows are paired by position, so only the common prefix can be compared.
    n_rows = min(len(df_gen), len(df_known))
    if len(df_gen) != len(df_known):
        print(f"⚠️ Row counts differ (generated: {len(df_gen)}, known: {len(df_known)}); "
              f"comparing the first {n_rows} rows only.")
    if n_rows == 0:
        # nothing to compare; also avoids dividing by zero in the summary below
        print("No rows to compare.")
        return

    # Align dataframes to overlap
    df_gen_overlap = df_gen[overlap].reset_index(drop=True).iloc[:n_rows]
    df_known_overlap = df_known[overlap].reset_index(drop=True).iloc[:n_rows]

    # Compare all values (NaN in both files counts as a match)
    diffs_mask = ~np.isclose(df_gen_overlap.values,
                             df_known_overlap.values,
                             atol=atol, rtol=rtol, equal_nan=True)

    n_total = diffs_mask.size
    n_diffs = np.count_nonzero(diffs_mask)
    pct_match = 100 * (1 - n_diffs / n_total)

    # Molecule-level agreement
    row_match = np.all(~diffs_mask, axis=1)
    pct_rows_match = 100 * np.mean(row_match)

    print(f"\nSummary:")
    print(f"- Total values compared: {n_total}")
    print(f"- Mismatched values: {n_diffs}")
    print(f"- % values matching: {pct_match:.2f}%")
    print(f"- % molecules with all descriptors matching: {pct_rows_match:.2f}%")

    # Detailed check on a random sample of molecules. A private Random instance
    # is used so the globally seeded `random` stream is left untouched.
    n_sample = min(max(n_check, 0), n_rows)
    if n_sample:
        print(f"\nChecking {n_sample} random molecules for descriptor agreement:")
        for row in random.Random().sample(range(n_rows), n_sample):
            differing = [overlap[j] for j in np.flatnonzero(diffs_mask[row])]
            if differing:
                print(f"Row {row}: ❌ {len(differing)} descriptors differ, e.g. {differing[:5]}")
            else:
                print(f"Row {row}: ✅ all overlapping descriptors match")

    # Column order check (binary)
    same_order = gen_cols == known_cols
    print(f"Column order identical: {same_order}")

# generate features using original AGILE data; can be done with previous code chunk
# heldout_data(
#     "../LNPDB/data/LNPDB_for_AGILE/AGILE/data/data/finetuning_set_smiles_plus_features.csv", # only change this argument for each dataset
#     "../LNPDB/data/LNPDB_for_AGILE/outputs",
#     "../LNPDB/data/LNPDB_for_AGILE/AGILE/data/data/finetuning_set_smiles_plus_features.csv"
# )
            
# Validate the Mordred generation pipeline: descriptors regenerated from the
# original AGILE data are checked against the features shipped with AGILE.
compare_descriptors(
    generated_path="../data/LNPDB_for_AGILE/outputs/finetuning_set_smiles_plus_features_plus_features.csv",
    known_path="../data/LNPDB_for_AGILE/AGILE/data/data/finetuning_set_smiles_plus_features.csv",
)

Total known features: 813
Total generated features: 813
Overlap: 813

Summary:
- Total values compared: 975600
- Mismatched values: 0
- % values matching: 100.00%
- % molecules with all descriptors matching: 100.00%

Checking 5 random molecules for descriptor agreement:
Row 510: ✅ all overlapping descriptors match
Row 1170: ✅ all overlapping descriptors match
Row 495: ✅ all overlapping descriptors match
Row 562: ✅ all overlapping descriptors match
Row 379: ✅ all overlapping descriptors match


# 5. Evaluating models on AGILE and LNPDB data

Once the molecular feature descriptors are generated, AGILE splits can make predictions on delivery efficacy for LNPDB data.

The following blocks copy `infer_vis_LNPDB.py` from `LNPDB/data/LNPDB_for_AGILE/scripts` into AGILE, evaluate the fine-tuned AGILE models on the test sets of their cross-validation splits, and make predictions on LNPDB data.

## 5.1 Preparing infer_vis_LNPDB.py

In [None]:
# IMPORTANT: hard-code which CSVs to use inside infer_vis_LNPDB.py

# Locations of the inference script and its YAML template under scripts/
infer_py = scripts_dir / "infer_vis_LNPDB.py"
infer_yaml_template = scripts_dir / "infer_vis" / "config_finetune.yaml"

# Stop early if either source file is absent
assert infer_py.exists(), "infer_vis_LNPDB.py not found in scripts/"
assert infer_yaml_template.exists(), "infer_vis/config_finetune.yaml not found in scripts/"

# Place a copy of the inference script at the top level of the AGILE folder
infer_py_target = agile_dir / "infer_vis_LNPDB.py"
shutil.copy(infer_py, infer_py_target)

# Disabled: copying the YAML template into each split's checkpoints folder
# for i in range(5):
#     ckpt_dir = agile_dir / f"finetune/agile_lnp_hela_cv_{i}/checkpoints"
#     ckpt_dir.mkdir(parents=True, exist_ok=True)
#     shutil.copy(infer_yaml_template, ckpt_dir / "config_finetune.yaml")

logging.info("infer_vis_LNPDB.py copied.")

## 5.2 Evaluate fine-tuned models on test splits

In [None]:
# do cross-validation on the test splits
def test_split_validation() -> None:
    """
    Run the AGILE cross-evaluation splits on respective test splits

    For every split, the `task_name` entry of the split's
    checkpoints/config_finetune.yaml is set to `df<split>_test`, then
    infer_vis_LNPDB.py is run on the split's model folder with the AGILE
    folder as working directory. stdout and stderr of each run are printed;
    a non-zero exit code is logged as an error and the loop continues.
    """
    import re

    task_name_pattern = re.compile(r"^([ \t]*task_name:[ \t]*)\S+", re.MULTILINE)

    def set_task_name(yaml_path, task_name):
        # Rewrite whatever task_name is currently stored. Replacing a fixed
        # placeholder would only work the first time the file is patched.
        with open(yaml_path, "r") as f:
            yaml_text = f.read()
        yaml_text, n_replaced = task_name_pattern.subn(
            lambda match: match.group(1) + task_name, yaml_text
        )
        if n_replaced == 0:
            logging.warning(f"No 'task_name:' entry found in {yaml_path}; file left unchanged.")
            return
        with open(yaml_path, "w") as f:
            f.write(yaml_text)

    # The child process runs with cwd=AGILE, so paths passed to it must be absolute.
    agile_root = Path(agile_dir).resolve()
    cv_number_list = [0, 1, 2, 3, 4]
    # run infer_vis
    for cv_number in cv_number_list:
        model_path = agile_root / f"finetune/agile_lnp_hela_cv_{cv_number}"
        yaml_path = model_path / "checkpoints/config_finetune.yaml"
        # point the config at this split's test set
        set_task_name(yaml_path, f"df{cv_number}_test")
        command = ["python", str(agile_root / "infer_vis_LNPDB.py"), str(model_path)]
        result = subprocess.run(command, cwd=agile_root, capture_output=True, text=True)
        print(result.stdout)
        print(result.stderr)
        if result.returncode != 0:
            logging.error(f"infer_vis_LNPDB.py exited with code {result.returncode} for split {cv_number}")

        # TODO: collect the predictions written under
        # finetune/agile_lnp_hela_cv_{cv_number}/LNPDB/preds_on_LNPDB.csv
        # once the output directory layout is settled.

test_split_validation()

## 5.3 Make predictions on LNPDB data

In [None]:
# evaluate cross-validation splits on dataset
def cross_evaluate() -> None:
    """
    Run the AGILE cross-evaluation splits on LNPDB data

    For every dataset and every split, the `task_name` entry of the split's
    checkpoints/config_finetune.yaml is set to the dataset name, then
    infer_vis_LNPDB.py is run on the split's model folder with the AGILE
    folder as working directory. stdout and stderr of each run are printed;
    a non-zero exit code is logged as an error and the loop continues.
    """
    import re

    task_name_pattern = re.compile(r"^([ \t]*task_name:[ \t]*)\S+", re.MULTILINE)

    def set_task_name(yaml_path, task_name):
        # Rewrite whatever task_name is currently stored. Replacing a fixed
        # placeholder would silently do nothing once the file has been patched
        # before (by an earlier dataset or an earlier evaluation run).
        with open(yaml_path, "r") as f:
            yaml_text = f.read()
        yaml_text, n_replaced = task_name_pattern.subn(
            lambda match: match.group(1) + task_name, yaml_text
        )
        if n_replaced == 0:
            logging.warning(f"No 'task_name:' entry found in {yaml_path}; file left unchanged.")
            return
        with open(yaml_path, "w") as f:
            f.write(yaml_text)

    # The child process runs with cwd=AGILE, so paths passed to it must be absolute.
    agile_root = Path(agile_dir).resolve()
    cv_number_list = [0, 1, 2, 3, 4]
    # datasets to evaluate
    datasets = ["SL_2020_heldout_data"]
    for dataset_name in datasets:
        # run infer_vis
        for cv_number in cv_number_list:
            model_path = agile_root / f"finetune/agile_lnp_hela_cv_{cv_number}"
            yaml_path = model_path / "checkpoints/config_finetune.yaml"
            # point the config at the dataset being evaluated
            set_task_name(yaml_path, dataset_name)
            command = ["python", str(agile_root / "infer_vis_LNPDB.py"), str(model_path)]
            result = subprocess.run(command, cwd=agile_root, capture_output=True, text=True)
            print(result.stdout)
            print(result.stderr)
            if result.returncode != 0:
                logging.error(
                    f"infer_vis_LNPDB.py exited with code {result.returncode} "
                    f"for dataset {dataset_name}, split {cv_number}"
                )

            # TODO: collect the predictions written under
            # finetune/agile_lnp_hela_cv_{cv_number}/LNPDB/preds_on_LNPDB.csv
            # and save them per dataset (e.g. as
            # {dataset_name}/{dataset_name}_preds_on_LNPDB_cv_{cv_number}.csv)
            # once the output directory layout is settled.

cross_evaluate()