# Analysis of Fluorinated Compounds in OpenCycloDB and External Validation Sets

This notebook analyzes the presence of fluorinated compounds (containing fluorine atoms) in the OpenCycloDB training dataset and examines overlap with external validation datasets. 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from pathlib import Path
from collections import Counter
import warnings

warnings.filterwarnings("ignore")

# Set up plotting style
plt.style.use("default")
sns.set_palette("husl")

# Define paths
data_dir = Path("../data")
processed_dir = data_dir / "processed/OpenCycloDB"
external_dir = data_dir / "external/validation"

print("Setup complete!")
print(f"Data directory: {data_dir}")
print(f"Processed directory: {processed_dir}")
print(f"External directory: {external_dir}")

Setup complete!
Data directory: ..\data
Processed directory: ..\data\processed\OpenCycloDB
External directory: ..\data\external\validation


In [4]:
def is_fluorinated(smiles):
    """
    Check if a SMILES string contains fluorine atoms.
    Returns True if fluorine is present, False otherwise.
    """
    if pd.isna(smiles) or not isinstance(smiles, str):
        return False
    return "F" in smiles


def count_fluorine_atoms(smiles):
    """
    Count the number of fluorine atoms in a SMILES string.
    """
    if pd.isna(smiles) or not isinstance(smiles, str):
        return 0
    return smiles.count("F")


def get_fluorine_patterns(smiles):
    """
    Identify common fluorinated patterns in SMILES string.
    """
    if pd.isna(smiles) or not isinstance(smiles, str):
        return []

    patterns = []

    # CF3 groups
    if "C(F)(F)F" in smiles:
        patterns.append("CF3")

    # CF2 groups
    if "C(F)(F)" in smiles:
        patterns.append("CF2")

    # Single F atoms
    if re.search(r"C.*F(?!\))", smiles):
        patterns.append("C-F")

    # Aromatic fluorine
    if re.search(r"c.*F", smiles):
        patterns.append("Ar-F")

    # Trifluoromethyl groups attached to aromatics
    if "OC(F)(F)F" in smiles:
        patterns.append("OCF3")

    return patterns


# Test the functions
test_smiles = ["CCO", "FC(F)(F)C(F)(F)F", "c1ccc(F)cc1", "CC(F)(F)F"]
for smiles in test_smiles:
    print(f"SMILES: {smiles}")
    print(f"  Fluorinated: {is_fluorinated(smiles)}")
    print(f"  F count: {count_fluorine_atoms(smiles)}")
    print(f"  Patterns: {get_fluorine_patterns(smiles)}")
    print()

SMILES: CCO
  Fluorinated: False
  F count: 0
  Patterns: []

SMILES: FC(F)(F)C(F)(F)F
  Fluorinated: True
  F count: 6
  Patterns: ['CF3', 'CF2', 'C-F']

SMILES: c1ccc(F)cc1
  Fluorinated: True
  F count: 1
  Patterns: ['Ar-F']

SMILES: CC(F)(F)F
  Fluorinated: True
  F count: 3
  Patterns: ['CF3', 'CF2', 'C-F']



## 1. Analysis of OpenCycloDB Training Dataset

Let's load the OpenCycloDB dataset and analyze the presence of fluorinated compounds in the guest molecules.

In [5]:
# Load OpenCycloDB raw data
opencyclodb_path = data_dir / "raw/OpenCycloDB/Data/CDEnrichedData.csv"
print(f"Loading OpenCycloDB data from: {opencyclodb_path}")

if opencyclodb_path.exists():
    opencd_df = pd.read_csv(opencyclodb_path)
    print(f"Loaded {len(opencd_df)} complexes")
    print(f"Columns: {list(opencd_df.columns)}")
    print(f"\nFirst few rows:")
    print(opencd_df.head())
else:
    print(f"File not found: {opencyclodb_path}")

# Check if we have guest SMILES
guest_smiles_col = None
for col in opencd_df.columns:
    if "guest" in col.lower() and "smiles" in col.lower():
        guest_smiles_col = col
        break

if guest_smiles_col:
    print(f"\nFound guest SMILES column: {guest_smiles_col}")
else:
    print("\nGuest SMILES column not found. Available columns:")
    print([col for col in opencd_df.columns if "smiles" in col.lower()])

Loading OpenCycloDB data from: ..\data\raw\OpenCycloDB\Data\CDEnrichedData.csv
Loaded 3459 complexes
Columns: ['Unnamed: 0', 'Host', 'CID_Host', 'Guest', 'CID_Guest', 'pH', 'T', 'DeltaG', 'Erreur', 'K', 'logK', 'Reference', 'Original_Value', 'IsomericSMILES', 'IsomericSMILES_Host', 'TPSA', 'MolecularWeight', 'Complexity', 'Charge', 'HBondDonorCount', 'HBondAcceptorCount', 'HeavyAtomCount', 'MolLogP', 'AromaticProportion', 'TPSA_Host', 'MolecularWeight_Host', 'Complexity_Host', 'Charge_Host', 'HBondDonorCount_Host', 'HBondAcceptorCount_Host', 'HeavyAtomCount_Host', 'iso2vec-0', 'iso2vec-1', 'iso2vec-2', 'iso2vec-3', 'iso2vec-4', 'iso2vec-5', 'iso2vec-6', 'iso2vec-7', 'iso2vec-8', 'iso2vec-9', 'iso2vec-host-0', 'iso2vec-host-1', 'iso2vec-host-2', 'iso2vec-host-3', 'iso2vec-host-4', 'iso2vec-host-5', 'iso2vec-host-6', 'iso2vec-host-7', 'iso2vec-host-8', 'iso2vec-host-9']

First few rows:
   Unnamed: 0                      Host  CID_Host                   Guest  \
0           1        alph

In [6]:
# Find the correct guest SMILES column
if "IsomericSMILES" in opencd_df.columns:
    guest_smiles_col = "IsomericSMILES"
elif "guest_smiles" in opencd_df.columns:
    guest_smiles_col = "guest_smiles"
elif "Guest" in opencd_df.columns:
    guest_smiles_col = "Guest"
else:
    # Look for any column with SMILES that might be guest data
    smiles_cols = [col for col in opencd_df.columns if "smiles" in col.lower()]
    if smiles_cols:
        guest_smiles_col = smiles_cols[0]
    else:
        print("No SMILES column found!")
        guest_smiles_col = None

if guest_smiles_col:
    print(f"Using column: {guest_smiles_col}")

    # Analyze fluorinated compounds
    opencd_df["is_fluorinated"] = opencd_df[guest_smiles_col].apply(is_fluorinated)
    opencd_df["fluorine_count"] = opencd_df[guest_smiles_col].apply(
        count_fluorine_atoms
    )
    opencd_df["fluorine_patterns"] = opencd_df[guest_smiles_col].apply(
        get_fluorine_patterns
    )

    # Summary statistics
    total_compounds = len(opencd_df)
    fluorinated_compounds = opencd_df["is_fluorinated"].sum()
    fluorinated_percentage = (fluorinated_compounds / total_compounds) * 100

    print(f"\n=== OpenCycloDB Fluorinated Compound Analysis ===")
    print(f"Total guest compounds: {total_compounds:,}")
    print(f"Fluorinated compounds: {fluorinated_compounds:,}")
    print(f"Percentage fluorinated: {fluorinated_percentage:.2f}%")

    # Fluorine count distribution
    fluorine_counts = (
        opencd_df[opencd_df["is_fluorinated"]]["fluorine_count"]
        .value_counts()
        .sort_index()
    )
    print(f"\nFluorine atom count distribution:")
    for count, freq in fluorine_counts.items():
        print(f"  {count} F atoms: {freq} compounds")

    # Get some examples of fluorinated compounds
    fluorinated_examples = opencd_df[opencd_df["is_fluorinated"]].head(10)
    print(f"\nExamples of fluorinated guest compounds:")
    for idx, row in fluorinated_examples.iterrows():
        smiles = row[guest_smiles_col]
        f_count = row["fluorine_count"]
        patterns = row["fluorine_patterns"]
        print(f"  {smiles} (F={f_count}, patterns={patterns})")
else:
    print("Cannot proceed without guest SMILES column")

Using column: IsomericSMILES

=== OpenCycloDB Fluorinated Compound Analysis ===
Total guest compounds: 3,459
Fluorinated compounds: 187
Percentage fluorinated: 5.41%

Fluorine atom count distribution:
  1 F atoms: 102 compounds
  2 F atoms: 15 compounds
  3 F atoms: 39 compounds
  4 F atoms: 3 compounds
  5 F atoms: 1 compounds
  6 F atoms: 1 compounds
  7 F atoms: 6 compounds
  9 F atoms: 1 compounds
  13 F atoms: 6 compounds
  15 F atoms: 7 compounds
  17 F atoms: 6 compounds

Examples of fluorinated guest compounds:
  OC1=CC=C(F)C=C1 (F=1, patterns=[])
  CC(C(=O)O)C1=CC=C(C2=CC=CC=C2)C(F)=C1 (F=1, patterns=[])
  O=P([O-])([O-])F (F=1, patterns=[])
  O=S(=O)([O-])C(F)(F)F (F=3, patterns=['CF3', 'CF2', 'C-F'])
  FC1=CC=CC=C1 (F=1, patterns=[])
  OC1=CC=C(F)C=C1 (F=1, patterns=[])
  CC(=O)OC1=CC=CC(F)=C1 (F=1, patterns=[])
  CC(=O)OC1=CC=CC(F)=C1 (F=1, patterns=[])
  FC1=CC=CC=C1 (F=1, patterns=[])
  OC1=CC=C(F)C=C1 (F=1, patterns=[])

Examples of fluorinated guest compounds:
  OC1=CC=

In [7]:
# Count unique fluorinated compounds in OpenCycloDB
if guest_smiles_col:
    unique_fluorinated = (
        opencd_df[opencd_df["is_fluorinated"]][guest_smiles_col].dropna().unique()
    )
    num_unique_fluorinated = len(unique_fluorinated)
    print(
        f"Number of unique fluorinated compounds in OpenCycloDB: {num_unique_fluorinated}"
    )
else:
    print("Guest SMILES column not found in OpenCycloDB dataset.")

Number of unique fluorinated compounds in OpenCycloDB: 96


## 2. Analysis of External Validation Datasets

Now let's examine the external validation datasets (CD validation and PFAS validation) to understand their fluorinated compound content.

In [8]:
# Load external validation datasets
validation_datasets = {}

# CD Validation dataset
cd_val_path = external_dir / "cd_val/cd_val_canonical.csv"
if cd_val_path.exists():
    cd_val_df = pd.read_csv(cd_val_path)
    validation_datasets["CD_val"] = cd_val_df
    print(f"Loaded CD validation set: {len(cd_val_df)} complexes")
    print(f"CD val columns: {list(cd_val_df.columns)}")
else:
    print(f"CD validation file not found: {cd_val_path}")

# PFAS Validation dataset
pfas_val_path = external_dir / "pfas_val/pfas_val_canonical.csv"
if pfas_val_path.exists():
    pfas_val_df = pd.read_csv(pfas_val_path)
    validation_datasets["PFAS_val"] = pfas_val_df
    print(f"Loaded PFAS validation set: {len(pfas_val_df)} complexes")
    print(f"PFAS val columns: {list(pfas_val_df.columns)}")
else:
    print(f"PFAS validation file not found: {pfas_val_path}")

print(f"\nLoaded {len(validation_datasets)} validation datasets")

# Show sample data
for name, df in validation_datasets.items():
    print(f"\n{name} sample:")
    print(df.head(2))

Loaded CD validation set: 42 complexes
CD val columns: ['CD', 'PFAS', 'pH', 'T', 'Host_SMILES', 'Guest_SMILES', 'delG']
Loaded PFAS validation set: 21 complexes
PFAS val columns: ['PFAS', 'CD', 'Host_SMILES', 'Guest_SMILES', 'delG']

Loaded 2 validation datasets

CD_val sample:
        CD  PFAS  pH    T                                        Host_SMILES  \
0  beta-CD  PFOA   7  293  OC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H]...   
1  beta-CD  PFOS   7  293  OC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H]...   

                                        Guest_SMILES       delG  
0  O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(... -25.746957  
1  O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C... -23.095505  

PFAS_val sample:
      PFAS        CD                                        Host_SMILES  \
0  PFMOPrA  alpha-CD  OC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H]...   
1  PFMOPrA   beta-CD  OC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H]...   

                    Guest_SMILES      

In [9]:
# Analyze fluorinated compounds in validation datasets
validation_analysis = {}

for dataset_name, df in validation_datasets.items():
    print(f"\n=== {dataset_name} Fluorinated Compound Analysis ===")

    # Find guest SMILES column
    guest_smiles_col = None
    possible_cols = [
        "IsomericSMILES",
        "Guest_SMILES",
        "guest_smiles",
        "Guest_Canonical_SMILES",
        "guest_canonical_smiles",
    ]

    for col in possible_cols:
        if col in df.columns:
            guest_smiles_col = col
            break

    if not guest_smiles_col:
        # Look for any column with SMILES that might be guest data
        smiles_cols = [
            col
            for col in df.columns
            if "smiles" in col.lower() and "guest" in col.lower()
        ]
        if smiles_cols:
            guest_smiles_col = smiles_cols[0]

    if guest_smiles_col:
        print(f"Using guest SMILES column: {guest_smiles_col}")

        # Apply fluorine analysis
        df["is_fluorinated"] = df[guest_smiles_col].apply(is_fluorinated)
        df["fluorine_count"] = df[guest_smiles_col].apply(count_fluorine_atoms)
        df["fluorine_patterns"] = df[guest_smiles_col].apply(get_fluorine_patterns)

        # Statistics
        total_compounds = len(df)
        fluorinated_compounds = df["is_fluorinated"].sum()
        fluorinated_percentage = (fluorinated_compounds / total_compounds) * 100

        print(f"Total guest compounds: {total_compounds:,}")
        print(f"Fluorinated compounds: {fluorinated_compounds:,}")
        print(f"Percentage fluorinated: {fluorinated_percentage:.2f}%")

        # Store analysis results
        validation_analysis[dataset_name] = {
            "total": total_compounds,
            "fluorinated": fluorinated_compounds,
            "percentage": fluorinated_percentage,
            "dataframe": df,
            "guest_smiles_col": guest_smiles_col,
        }

        # Show some examples
        if fluorinated_compounds > 0:
            fluorinated_examples = df[df["is_fluorinated"]].head(5)
            print(f"\nExamples of fluorinated compounds:")
            for idx, row in fluorinated_examples.iterrows():
                smiles = row[guest_smiles_col]
                f_count = row["fluorine_count"]
                patterns = row["fluorine_patterns"]
                # Try to get compound name if available
                name = ""
                if "Guest_Name" in row:
                    name = f" ({row['Guest_Name']})"
                elif "guest_name" in row:
                    name = f" ({row['guest_name']})"
                print(f"  {smiles}{name} (F={f_count}, patterns={patterns})")

    else:
        print(f"No guest SMILES column found in {dataset_name}")
        print(f"Available columns: {list(df.columns)}")

print(f"\n=== Summary of Validation Dataset Analysis ===")
for name, analysis in validation_analysis.items():
    print(
        f"{name}: {analysis['fluorinated']}/{analysis['total']} ({analysis['percentage']:.1f}%) fluorinated"
    )


=== CD_val Fluorinated Compound Analysis ===
Using guest SMILES column: Guest_SMILES
Total guest compounds: 42
Fluorinated compounds: 42
Percentage fluorinated: 100.00%

Examples of fluorinated compounds:
  O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F (F=15, patterns=['CF3', 'CF2', 'C-F'])
  O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F (F=17, patterns=['CF3', 'CF2', 'C-F'])
  O=C(O)C(F)(OC(F)(F)C(F)(F)C(F)(F)F)C(F)(F)F (F=11, patterns=['CF3', 'CF2', 'C-F'])
  O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F (F=9, patterns=['CF3', 'CF2', 'C-F'])
  O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F (F=15, patterns=['CF3', 'CF2', 'C-F'])

=== PFAS_val Fluorinated Compound Analysis ===
Using guest SMILES column: Guest_SMILES
Total guest compounds: 21
Fluorinated compounds: 21
Percentage fluorinated: 100.00%

Examples of fluorinated compounds:
  O=C(O)C(F)(F)C(F)(F)OC(F)(F)F (F=7, patterns=['CF3', 'CF2', 'C-F', 'OCF3'])
  O=C(O)C(F)(F)C(F)(F)OC(F)(F)F (F=7, patt

## 3. Overlap Analysis: Training vs Validation Fluorinated Compounds

Now let's examine if any fluorinated compounds appear in both the training (OpenCycloDB) and validation datasets. This is important for understanding potential data leakage and model performance evaluation.

In [10]:
# Overlap analysis between OpenCycloDB and validation sets
if guest_smiles_col and validation_analysis:
    # Get fluorinated compounds from OpenCycloDB
    opencd_fluorinated = set(
        opencd_df[opencd_df["is_fluorinated"]]["IsomericSMILES"].dropna()
    )
    print(f"OpenCycloDB fluorinated compounds: {len(opencd_fluorinated)}")

    # Analyze overlap with each validation set
    overlap_results = {}

    for val_name, val_analysis in validation_analysis.items():
        val_df = val_analysis["dataframe"]
        val_smiles_col = val_analysis["guest_smiles_col"]

        # Get fluorinated compounds from validation set
        val_fluorinated = set(val_df[val_df["is_fluorinated"]][val_smiles_col].dropna())

        # Find overlaps
        overlap_compounds = opencd_fluorinated.intersection(val_fluorinated)
        overlap_count = len(overlap_compounds)

        # Calculate percentages
        overlap_pct_training = (
            (overlap_count / len(opencd_fluorinated)) * 100 if opencd_fluorinated else 0
        )
        overlap_pct_validation = (
            (overlap_count / len(val_fluorinated)) * 100 if val_fluorinated else 0
        )

        overlap_results[val_name] = {
            "validation_fluorinated_count": len(val_fluorinated),
            "overlap_count": overlap_count,
            "overlap_compounds": overlap_compounds,
            "overlap_pct_training": overlap_pct_training,
            "overlap_pct_validation": overlap_pct_validation,
        }

        print(f"\n=== {val_name} Overlap Analysis ===")
        print(f"Validation fluorinated compounds: {len(val_fluorinated)}")
        print(f"Overlapping compounds: {overlap_count}")
        print(f"Overlap as % of training fluorinated: {overlap_pct_training:.2f}%")
        print(f"Overlap as % of validation fluorinated: {overlap_pct_validation:.2f}%")

        # Show overlapping compounds
        if overlap_count > 0:
            print(f"\nOverlapping fluorinated compounds:")
            for i, smiles in enumerate(sorted(overlap_compounds)):
                if i < 10:  # Show first 10
                    # Try to get compound names from both datasets
                    opencd_name = ""
                    val_name_str = ""

                    # From OpenCycloDB
                    opencd_match = opencd_df[opencd_df["IsomericSMILES"] == smiles]
                    if not opencd_match.empty and "Guest_Name" in opencd_match.columns:
                        opencd_name = opencd_match.iloc[0]["Guest_Name"]

                    # From validation set
                    val_match = val_df[val_df[val_smiles_col] == smiles]
                    if not val_match.empty:
                        for name_col in ["Guest_Name", "guest_name", "Name"]:
                            if name_col in val_match.columns:
                                val_name_str = val_match.iloc[0][name_col]
                                break

                    print(f"  {smiles}")
                    if opencd_name:
                        print(f"    OpenCycloDB: {opencd_name}")
                    if val_name_str:
                        print(f"    {val_name}: {val_name_str}")
                elif i == 10:
                    print(f"  ... and {overlap_count - 10} more compounds")
                    break
        else:
            print("\nNo overlapping fluorinated compounds found!")

    # Summary table
    print(f"\n=== Overall Overlap Summary ===")
    print(
        f"{'Dataset':<15} {'Val F-compounds':<15} {'Overlaps':<10} {'% of Training':<15} {'% of Validation':<15}"
    )
    print("-" * 75)

    for val_name, results in overlap_results.items():
        print(
            f"{val_name:<15} {results['validation_fluorinated_count']:<15} {results['overlap_count']:<10} "
            f"{results['overlap_pct_training']:<15.2f} {results['overlap_pct_validation']:<15.2f}"
        )

else:
    print("Cannot perform overlap analysis - missing required data")

OpenCycloDB fluorinated compounds: 96

=== CD_val Overlap Analysis ===
Validation fluorinated compounds: 4
Overlapping compounds: 1
Overlap as % of training fluorinated: 1.04%
Overlap as % of validation fluorinated: 25.00%

Overlapping fluorinated compounds:
  O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F

=== PFAS_val Overlap Analysis ===
Validation fluorinated compounds: 11
Overlapping compounds: 1
Overlap as % of training fluorinated: 1.04%
Overlap as % of validation fluorinated: 9.09%

Overlapping fluorinated compounds:
  O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F

=== Overall Overlap Summary ===
Dataset         Val F-compounds Overlaps   % of Training   % of Validation
---------------------------------------------------------------------------
CD_val          4               1          1.04            25.00          
PFAS_val        11              1          1.04            9.09           


## 5. Beta-Cyclodextrin Derivatives Overlap Analysis

The CD validation set consists exclusively of beta-cyclodextrin (β-CD) and its derivatives. Let's analyze the overlap of these host molecules between the OpenCycloDB training set and the CD validation set to understand if the validation set contains novel β-CD derivatives or reuses the same ones from training.

In [20]:
# Function to identify beta-cyclodextrin and its derivatives
def is_beta_cd_derivative(host_name):
    """
    Check if a host molecule is beta-cyclodextrin or a derivative.
    Returns True if it's beta-CD or a derivative, False otherwise.
    """
    if pd.isna(host_name) or not isinstance(host_name, str):
        return False

    host_lower = host_name.lower()

    # Check for beta-cyclodextrin patterns
    beta_patterns = [
        "beta-cyclodextrin",
        "β-cyclodextrin",
        "b-cyclodextrin",
        "betacd",
        "β-cd",
        "b-cd",
        "beta cd",
    ]

    # Check if it's explicitly beta-CD or a derivative
    for pattern in beta_patterns:
        if pattern in host_lower:
            return True

    # Also check for common beta-CD derivatives
    beta_derivatives = [
        "hp-beta-cd",
        "hpbcd",
        "hydroxypropyl-beta",
        "methyl-beta-cd",
        "sulfobutyl",
        "sbe-beta",
    ]

    for derivative in beta_derivatives:
        if derivative in host_lower:
            return True

    return False


def is_beta_cd_smiles(smiles_str):
    """
    Check if a SMILES string represents beta-cyclodextrin or a derivative.
    Beta-CD has 7 glucose units in a ring structure.
    This function looks for characteristic patterns in the SMILES.
    """
    if pd.isna(smiles_str) or not isinstance(smiles_str, str):
        return False

    # Beta-CD has 7 glucose units, so we expect to see patterns that indicate this
    # Look for the characteristic ring structure with multiple glucose units

    # Count the number of ring oxygen atoms (characteristic of cyclodextrin)
    # Beta-CD typically has patterns like: O1C...C...C...C...C...O...C1 repeated 7 times

    # Simple heuristic: Check if it's a large cyclic molecule with multiple glucose-like units
    # Native beta-CD SMILES typically contains:
    # - Multiple C1CCCCC1 or similar ring patterns (for glucose rings)
    # - Multiple O atoms connecting the rings
    # - Overall large molecular structure

    # For a more robust check, count characteristic features:
    num_oxygen = smiles_str.count("O")
    num_carbon = smiles_str.count("C")

    # Beta-CD has approximately:
    # - 7 glucose units (C6H10O5 each) = C42H70O35
    # - So we expect around 35-50 oxygens and 40-70 carbons for native or derivatives

    # This is a simplified check - for production use, you'd want to use RDKit for proper structure matching
    if (
        num_oxygen >= 20 and num_carbon >= 30
    ):  # Thresholds for cyclodextrin-like structures
        # Additional check: cyclodextrins tend to be large, complex molecules
        if len(smiles_str) > 100:  # Beta-CD SMILES are typically very long
            return True

    return False


def categorize_beta_cd(host_name):
    """
    Categorize beta-CD derivatives into specific types.
    """
    if pd.isna(host_name) or not isinstance(host_name, str):
        return "Unknown"

    host_lower = host_name.lower()

    # Native beta-CD
    if host_lower in [
        "beta-cyclodextrin",
        "β-cyclodextrin",
        "b-cyclodextrin",
        "betacd",
        "beta cd",
    ]:
        return "Native β-CD"

    # Hydroxypropyl derivatives
    if (
        "hydroxypropyl" in host_lower
        or "hp-beta" in host_lower
        or "hpbcd" in host_lower
    ):
        return "HP-β-CD (Hydroxypropyl)"

    # Sulfobutyl derivatives
    if "sulfobutyl" in host_lower or "sbe" in host_lower or "sb-beta" in host_lower:
        return "SBE-β-CD (Sulfobutyl)"

    # Methylated derivatives
    if "methyl" in host_lower and "beta" in host_lower:
        return "M-β-CD (Methylated)"

    # Other beta-CD derivatives
    if is_beta_cd_derivative(host_name):
        return "Other β-CD Derivative"

    return "Not β-CD"


def categorize_beta_cd_from_smiles(smiles_str):
    """
    Categorize beta-CD derivatives from SMILES structure.
    This is a simplified categorization - for precise identification, use RDKit.
    """
    if not is_beta_cd_smiles(smiles_str):
        return "Not β-CD"

    # Check for common functional groups in the SMILES
    smiles_lower = smiles_str.lower()

    # Sulfobutyl ether: contains S(=O)(=O) groups
    if "S(=O)(=O)" in smiles_str or "s(=o)(=o)" in smiles_lower:
        return "SBE-β-CD (Sulfobutyl)"

    # Hydroxypropyl: contains additional propyl chains with OH
    # Look for patterns like CCC(O) or CC(O)C
    if "CC(O)" in smiles_str or "CCC" in smiles_str:
        # Count to see if there are more than expected for native CD
        if smiles_str.count("CC") > 50:  # Native has fewer such patterns
            return "HP-β-CD (Hydroxypropyl)"

    # Methylated: Look for methyl groups (OC patterns beyond what native CD has)
    # This is tricky from SMILES alone without proper structure analysis

    # Default to native or derivative
    # If the molecule is relatively "clean" (no obvious substituents), likely native
    if len(smiles_str) < 300:  # Native beta-CD SMILES is typically 200-300 characters
        return "Native β-CD"
    else:
        return "β-CD Derivative (type unclear from SMILES)"


# Test the functions
print("Testing name-based beta-CD identification:")
test_hosts = [
    "beta-cyclodextrin",
    "alpha-cyclodextrin",
    "HP-beta-CD",
    "sulfobutylether-beta-cd",
    "methyl-beta-cyclodextrin",
    "gamma-cyclodextrin",
]

for host in test_hosts:
    is_beta = is_beta_cd_derivative(host)
    category = categorize_beta_cd(host)
    print(f"  {host}: is_beta={is_beta}, category={category}")

print("\n" + "=" * 70)
print("Note: For SMILES-based identification, we'll use is_beta_cd_smiles() function")
print("which checks for structural features characteristic of beta-cyclodextrin")
print("=" * 70)

Testing name-based beta-CD identification:
  beta-cyclodextrin: is_beta=True, category=Native β-CD
  alpha-cyclodextrin: is_beta=False, category=Not β-CD
  HP-beta-CD: is_beta=True, category=HP-β-CD (Hydroxypropyl)
  sulfobutylether-beta-cd: is_beta=True, category=SBE-β-CD (Sulfobutyl)
  methyl-beta-cyclodextrin: is_beta=True, category=M-β-CD (Methylated)
  gamma-cyclodextrin: is_beta=False, category=Not β-CD

Note: For SMILES-based identification, we'll use is_beta_cd_smiles() function
which checks for structural features characteristic of beta-cyclodextrin


In [21]:
# Analyze beta-CD derivatives in OpenCycloDB
print("=== OpenCycloDB Beta-CD Derivatives Analysis ===\n")

# Find host column in OpenCycloDB
host_col = None
host_smiles_col = None

# Look for host name column
for col in opencd_df.columns:
    if "host" in col.lower() and "smiles" not in col.lower():
        host_col = col
        break

# Also look for host SMILES column for comparison
for col in opencd_df.columns:
    if "host" in col.lower() and "smiles" in col.lower():
        host_smiles_col = col
        break

if host_col:
    print(f"Using host name column: {host_col}")
    if host_smiles_col:
        print(f"Also found host SMILES column: {host_smiles_col}")

    # Apply beta-CD analysis using name-based functions
    opencd_df["is_beta_cd"] = opencd_df[host_col].apply(is_beta_cd_derivative)
    opencd_df["beta_cd_category"] = opencd_df[host_col].apply(categorize_beta_cd)

    # If we have SMILES, also check using SMILES-based method for comparison
    if host_smiles_col:
        opencd_df["is_beta_cd_smiles"] = opencd_df[host_smiles_col].apply(
            is_beta_cd_smiles
        )
        print("\nComparing name-based vs SMILES-based identification:")
        name_based_count = opencd_df["is_beta_cd"].sum()
        smiles_based_count = opencd_df["is_beta_cd_smiles"].sum()
        print(f"  Name-based: {name_based_count:,} β-CD complexes")
        print(f"  SMILES-based: {smiles_based_count:,} β-CD complexes")

        # Use name-based as primary since we have proper chemical names
        print("\nUsing name-based identification as primary method")

    # Get statistics
    total_complexes = len(opencd_df)
    beta_cd_complexes = opencd_df["is_beta_cd"].sum()
    beta_cd_percentage = (beta_cd_complexes / total_complexes) * 100

    print(f"\nTotal complexes in OpenCycloDB: {total_complexes:,}")
    print(f"Beta-CD complexes: {beta_cd_complexes:,}")
    print(f"Percentage with β-CD: {beta_cd_percentage:.2f}%")

    # Get unique beta-CD derivatives
    beta_cd_hosts = opencd_df[opencd_df["is_beta_cd"]][host_col].unique()
    print(f"\nUnique β-CD derivatives in OpenCycloDB: {len(beta_cd_hosts)}")

    # Categorize beta-CD derivatives
    beta_cd_categories = opencd_df[opencd_df["is_beta_cd"]][
        "beta_cd_category"
    ].value_counts()
    print(f"\nβ-CD derivative categories:")
    for category, count in beta_cd_categories.items():
        print(f"  {category}: {count:,} complexes")

    # Show all unique beta-CD derivatives
    print(f"\nAll unique β-CD derivatives in OpenCycloDB:")
    for i, host in enumerate(sorted(beta_cd_hosts), 1):
        category = categorize_beta_cd(host)
        count = len(opencd_df[opencd_df[host_col] == host])
        print(f"  {i}. {host} ({category}) - {count} complexes")

    # Store SMILES for overlap analysis if available
    if host_smiles_col:
        beta_cd_smiles_opencd = opencd_df[opencd_df["is_beta_cd"]][
            host_smiles_col
        ].unique()
        print(
            f"\nCorresponding unique β-CD SMILES structures: {len(beta_cd_smiles_opencd)}"
        )
else:
    print("No host column found in OpenCycloDB data")
    print(f"Available columns: {list(opencd_df.columns)}")

=== OpenCycloDB Beta-CD Derivatives Analysis ===

Using host name column: Host
Also found host SMILES column: IsomericSMILES_Host

Comparing name-based vs SMILES-based identification:
  Name-based: 1,988 β-CD complexes
  SMILES-based: 3,459 β-CD complexes

Using name-based identification as primary method

Total complexes in OpenCycloDB: 3,459
Beta-CD complexes: 1,988
Percentage with β-CD: 57.47%

Unique β-CD derivatives in OpenCycloDB: 10

β-CD derivative categories:
  Native β-CD: 1,434 complexes
  HP-β-CD (Hydroxypropyl): 190 complexes
  M-β-CD (Methylated): 189 complexes
  SBE-β-CD (Sulfobutyl): 135 complexes
  Other β-CD Derivative: 40 complexes

All unique β-CD derivatives in OpenCycloDB:
  1. 2,6-Di-O-methyl-beta-cyclodextrin (M-β-CD (Methylated)) - 47 complexes
  2. Acetyl-beta-cyclodextrin, Monoacetyl-beta-cyclodextrin (Other β-CD Derivative) - 19 complexes
  3. Succinyl-beta-cyclodextrin (Other β-CD Derivative) - 5 complexes
  4. Trimethyl-beta-cyclodextrin (M-β-CD (Methylate

In [22]:
# Analyze beta-CD derivatives in CD validation set
print("=== CD Validation Set Beta-CD Derivatives Analysis ===\n")

if "CD_val" in validation_datasets:
    cd_val_df = validation_datasets["CD_val"]

    # Host column is Host_SMILES (contains SMILES strings, not names)
    host_col_val = "Host_SMILES"

    if host_col_val in cd_val_df.columns:
        print(f"Using host column: {host_col_val}")
        print(
            "NOTE: This column contains SMILES strings, so using SMILES-based identification\n"
        )

        # Apply beta-CD analysis using SMILES-based functions
        cd_val_df["is_beta_cd"] = cd_val_df[host_col_val].apply(is_beta_cd_smiles)
        cd_val_df["beta_cd_category"] = cd_val_df[host_col_val].apply(
            categorize_beta_cd_from_smiles
        )

        # Get statistics
        total_complexes_val = len(cd_val_df)
        beta_cd_complexes_val = cd_val_df["is_beta_cd"].sum()
        beta_cd_percentage_val = (beta_cd_complexes_val / total_complexes_val) * 100

        print(f"Total complexes in CD validation: {total_complexes_val:,}")
        print(f"Beta-CD complexes (identified by SMILES): {beta_cd_complexes_val:,}")
        print(f"Percentage with β-CD: {beta_cd_percentage_val:.2f}%")

        # Get unique beta-CD derivatives (SMILES)
        beta_cd_hosts_val = cd_val_df[cd_val_df["is_beta_cd"]][host_col_val].unique()
        print(
            f"\nUnique β-CD derivative SMILES in CD validation: {len(beta_cd_hosts_val)}"
        )

        # Categorize beta-CD derivatives
        beta_cd_categories_val = cd_val_df[cd_val_df["is_beta_cd"]][
            "beta_cd_category"
        ].value_counts()
        print(f"\nβ-CD derivative categories (from SMILES analysis):")
        for category, count in beta_cd_categories_val.items():
            print(f"  {category}: {count:,} complexes")

        # Show all unique beta-CD derivatives with truncated SMILES
        print(f"\nAll unique β-CD derivative SMILES in CD validation:")
        for i, host_smiles in enumerate(sorted(beta_cd_hosts_val), 1):
            category = categorize_beta_cd_from_smiles(host_smiles)
            count = len(cd_val_df[cd_val_df[host_col_val] == host_smiles])
            # Truncate SMILES for display
            smiles_display = (
                host_smiles[:60] + "..." if len(host_smiles) > 60 else host_smiles
            )
            print(f"  {i}. {smiles_display}")
            print(f"      Category: {category} - {count} complexes")
    else:
        print(f"Host_SMILES column not found in CD validation data")
        print(f"Available columns: {list(cd_val_df.columns)}")
else:
    print("CD validation dataset not loaded")

=== CD Validation Set Beta-CD Derivatives Analysis ===

Using host column: Host_SMILES
NOTE: This column contains SMILES strings, so using SMILES-based identification

Total complexes in CD validation: 42
Beta-CD complexes (identified by SMILES): 42
Percentage with β-CD: 100.00%

Unique β-CD derivative SMILES in CD validation: 6

β-CD derivative categories (from SMILES analysis):
  Native β-CD: 34 complexes
  β-CD Derivative (type unclear from SMILES): 8 complexes

All unique β-CD derivative SMILES in CD validation:
  1. NC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H](O[C@H]4[C@H](...
      Category: Native β-CD - 8 complexes
  2. OC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H](O[C@H]4[C@H](...
      Category: β-CD Derivative (type unclear from SMILES) - 8 complexes
  3. OC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H](O[C@H]4[C@H](...
      Category: Native β-CD - 8 complexes
  4. OC[C@H]1O[C@@H]2O[C@H]3[C@H](O)[C@@H](O)[C@@H](O[C@H]4[C@H](...
      Category: Native β-CD - 8 complexes
  5

In [23]:
# Overlap analysis: Beta-CD derivatives between training and validation
print("=== Beta-CD Derivatives Overlap Analysis ===\n")
print("NOTE: Comparing SMILES structures since validation set uses SMILES\n")

if host_smiles_col and host_col_val and "CD_val" in validation_datasets:
    # Get beta-CD SMILES from OpenCycloDB (where we identified beta-CD by name)
    opencd_beta_cd_smiles = set(
        opencd_df[opencd_df["is_beta_cd"]][host_smiles_col].dropna()
    )

    # Get beta-CD SMILES from CD validation (identified by SMILES structure)
    cd_val_beta_cd_smiles = set(
        cd_val_df[cd_val_df["is_beta_cd"]][host_col_val].dropna()
    )

    print(f"OpenCycloDB β-CD SMILES structures: {len(opencd_beta_cd_smiles)}")
    print(f"CD validation β-CD SMILES structures: {len(cd_val_beta_cd_smiles)}")

    # Find overlapping SMILES structures
    overlapping_beta_cd_smiles = opencd_beta_cd_smiles.intersection(
        cd_val_beta_cd_smiles
    )
    unique_to_training_smiles = opencd_beta_cd_smiles - cd_val_beta_cd_smiles
    unique_to_validation_smiles = cd_val_beta_cd_smiles - opencd_beta_cd_smiles

    print(f"\nOverlapping β-CD SMILES structures: {len(overlapping_beta_cd_smiles)}")
    print(f"Unique to training: {len(unique_to_training_smiles)}")
    print(f"Unique to validation: {len(unique_to_validation_smiles)}")

    # Calculate percentages
    overlap_pct_training = (
        (len(overlapping_beta_cd_smiles) / len(opencd_beta_cd_smiles)) * 100
        if opencd_beta_cd_smiles
        else 0
    )
    overlap_pct_validation = (
        (len(overlapping_beta_cd_smiles) / len(cd_val_beta_cd_smiles)) * 100
        if cd_val_beta_cd_smiles
        else 0
    )

    print(f"\nOverlap as % of training β-CD structures: {overlap_pct_training:.2f}%")
    print(f"Overlap as % of validation β-CD structures: {overlap_pct_validation:.2f}%")

    # Show overlapping structures with their names
    if overlapping_beta_cd_smiles:
        print(f"\nOverlapping β-CD structures:")
        for i, smiles in enumerate(sorted(overlapping_beta_cd_smiles), 1):
            # Get name from training data
            train_match = opencd_df[opencd_df[host_smiles_col] == smiles]
            if not train_match.empty and host_col:
                host_name = train_match.iloc[0][host_col]
                category = categorize_beta_cd(host_name)
            else:
                host_name = "Unknown"
                category = categorize_beta_cd_from_smiles(smiles)

            train_count = len(opencd_df[opencd_df[host_smiles_col] == smiles])
            val_count = len(cd_val_df[cd_val_df[host_col_val] == smiles])

            smiles_display = smiles[:60] + "..." if len(smiles) > 60 else smiles
            print(f"  {i}. {host_name} ({category})")
            print(f"      SMILES: {smiles_display}")
            print(
                f"      Training: {train_count} complexes | Validation: {val_count} complexes"
            )

    # Show unique structures
    if unique_to_training_smiles:
        print(f"\nβ-CD structures ONLY in training (not in validation):")
        for i, smiles in enumerate(sorted(unique_to_training_smiles), 1):
            if i <= 5:  # Show first 5
                train_match = opencd_df[opencd_df[host_smiles_col] == smiles]
                if not train_match.empty and host_col:
                    host_name = train_match.iloc[0][host_col]
                    category = categorize_beta_cd(host_name)
                else:
                    host_name = "Unknown"
                    category = "Unknown"

                count = len(opencd_df[opencd_df[host_smiles_col] == smiles])
                smiles_display = smiles[:60] + "..." if len(smiles) > 60 else smiles
                print(f"  {i}. {host_name} ({category}) - {count} complexes")
                print(f"      SMILES: {smiles_display}")
            elif i == 6:
                print(f"  ... and {len(unique_to_training_smiles) - 5} more structures")
                break

    if unique_to_validation_smiles:
        print(f"\nβ-CD structures ONLY in validation (not in training):")
        for i, smiles in enumerate(sorted(unique_to_validation_smiles), 1):
            if i <= 5:  # Show first 5
                category = categorize_beta_cd_from_smiles(smiles)
                count = len(cd_val_df[cd_val_df[host_col_val] == smiles])
                smiles_display = smiles[:60] + "..." if len(smiles) > 60 else smiles
                print(f"  {i}. {category} - {count} complexes")
                print(f"      SMILES: {smiles_display}")
            elif i == 6:
                print(
                    f"  ... and {len(unique_to_validation_smiles) - 5} more structures"
                )
                break

    # Summary
    print(f"\n{'='*70}")
    print(f"SUMMARY:")
    print(f"  • Total unique β-CD SMILES in training: {len(opencd_beta_cd_smiles)}")
    print(f"  • Total unique β-CD SMILES in validation: {len(cd_val_beta_cd_smiles)}")
    print(f"  • Overlapping structures: {len(overlapping_beta_cd_smiles)}")
    print(f"  • Novel structures in validation: {len(unique_to_validation_smiles)}")
    print(f"{'='*70}")

    # Store for visualization
    overlapping_beta_cd = overlapping_beta_cd_smiles
    unique_to_training = unique_to_training_smiles
    unique_to_validation = unique_to_validation_smiles
    opencd_beta_cd_hosts = opencd_beta_cd_smiles
    cd_val_beta_cd_hosts = cd_val_beta_cd_smiles

else:
    print("Cannot perform overlap analysis - missing required data")
    print(
        f"host_smiles_col: {host_smiles_col if 'host_smiles_col' in locals() else 'Not found'}"
    )
    print(
        f"host_col_val: {host_col_val if 'host_col_val' in locals() else 'Not found'}"
    )

=== Beta-CD Derivatives Overlap Analysis ===

NOTE: Comparing SMILES structures since validation set uses SMILES

OpenCycloDB β-CD SMILES structures: 10
CD validation β-CD SMILES structures: 6

Overlapping β-CD SMILES structures: 0
Unique to training: 10
Unique to validation: 6

Overlap as % of training β-CD structures: 0.00%
Overlap as % of validation β-CD structures: 0.00%

β-CD structures ONLY in training (not in validation):
  1. Succinyl-beta-cyclodextrin (Other β-CD Derivative) - 5 complexes
      SMILES: CC(=O)CCC(=O)OC[C@H]1O[C@@H]2O[C@H]3[C@@H](O)[C@H](O)[C@@H](...
  2. Acetyl-beta-cyclodextrin, Monoacetyl-beta-cyclodextrin (Other β-CD Derivative) - 19 complexes
      SMILES: CC(=O)OC[C@H]1O[C@@H]2O[C@@H]3[C@@H](CO)O[C@H](O[C@@H]4[C@@H...
  3. hp-beta-cd (HP-β-CD (Hydroxypropyl)) - 190 complexes
      SMILES: CC(O)COC[C@H]1O[C@@H]2O[C@H]3[C@@H](O)[C@H](O)[C@@H](O[C@H]4...
  4. 2,6-Di-O-methyl-beta-cyclodextrin (M-β-CD (Methylated)) - 47 complexes
      SMILES: COC[C@H]1O[C@@H]