In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from rdkit import Chem
from sklearn.model_selection import train_test_split

# Define the canonical SMILES conversion function
def canon_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or NaN if unusable.

    Args:
        smiles: The SMILES string to canonicalize. Non-string values (for
            example the float NaN / None that pandas uses for missing cells)
            are accepted and mapped to NaN.

    Returns:
        The canonical SMILES string, or ``float('nan')`` when the input is
        not a string, RDKit cannot parse it, or RDKit raises while
        converting it.
    """
    # Handle missing / non-string cells explicitly instead of relying on the
    # exception path below to turn them into NaN.
    if not isinstance(smiles, str):
        return float('nan')
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None.
        if mol is None:
            return float('nan')
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Deliberately best-effort: any RDKit failure marks the row as NaN.
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit propagating, so a long ``.apply()`` can be interrupted.
        return float('nan')

# Build one training file per CYP isoform: a class-balanced subset of the
# Novartis (nx) data, stacked on top of the curated admetica data for that
# isoform, written next to the curated file as balanced_admetica_<cyp>.csv.
cyp_types = ['3A4', '2C9']

current_directory = os.getcwd()
admetica_folder = os.path.abspath(os.path.join(current_directory, '..'))

# Step 1: Load and preprocess nx_dataset.
# This does not depend on the isoform, so it is done once up front rather
# than re-reading and re-canonicalizing the whole file on every iteration.
print("Loading nx_dataset...")
nx_path = os.path.join(current_directory, 'novartis', 'novartis_data.csv')
nx_dataset = pd.read_csv(nx_path)
print(f"nx_dataset columns: {nx_dataset.columns.tolist()}")

# Apply the canonical SMILES conversion to nx_dataset
print("Converting SMILES to canonical form in nx_dataset...")
nx_dataset['smiles'] = nx_dataset['smiles'].apply(canon_smiles)
# canon_smiles returns NaN for SMILES it cannot canonicalize; such rows have
# no usable structure and must not end up in the saved training file.
nx_dataset = nx_dataset.dropna(subset=['smiles'])

for cyp in cyp_types:
    class_col = f'CYP{cyp}_class'
    cyp_folder = os.path.join(
        admetica_folder, 'ADMET', 'metabolism', f'cyp{cyp.lower()}-inhibitor'
    )

    # Step 2: Load and preprocess admetica_dataset
    print("\nLoading admetica_dataset...")
    admetica_dataset = pd.read_csv(
        os.path.join(cyp_folder, f'cyp{cyp.lower()}-inhibitor_curated.csv')
    )
    print(f"admetica_dataset columns: {admetica_dataset.columns.tolist()}")

    # Apply the canonical SMILES conversion to admetica_dataset
    print("Converting SMILES to canonical form in admetica_dataset...")
    admetica_dataset['smiles'] = admetica_dataset['smiles'].apply(canon_smiles)
    # Drop unparseable rows, then collapse entries that canonicalize to the
    # same structure.
    admetica_dataset = admetica_dataset.dropna(subset=['smiles'])
    admetica_dataset = admetica_dataset.drop_duplicates(subset='smiles')

    # Step 3: Remove rows from nx_dataset whose 'smiles' exist in
    # admetica_dataset. The result gets its own name so the shared
    # nx_dataset stays intact for the next isoform.
    print(f"\nInitial nx_dataset shape: {nx_dataset.shape}")
    nx_unique = nx_dataset[~nx_dataset['smiles'].isin(admetica_dataset['smiles'])]
    print(f"nx_dataset shape after removing overlapping SMILES with admetica_dataset: {nx_unique.shape}")

    # Step 4: Create a balanced dataset based on the isoform's class column
    print(f"\nBalancing dataset by '{class_col}'...")

    # Split data into class 0 and class 1
    class_0 = nx_unique[nx_unique[class_col] == 0]
    class_1 = nx_unique[nx_unique[class_col] == 1]

    # Downsample to the size of the smaller class. Sampling class 0 to
    # len(class_1) unconditionally raises ValueError when class 1 is the
    # larger class (sampling without replacement beyond the population).
    n_per_class = min(len(class_0), len(class_1))
    class_0_sample = class_0.sample(n=n_per_class, random_state=42)
    if len(class_1) > n_per_class:
        class_1 = class_1.sample(n=n_per_class, random_state=42)

    # Create a balanced dataset
    balanced_data = pd.concat([class_0_sample, class_1])

    # Check class distribution
    print("Balanced class distribution:")
    print(balanced_data[class_col].value_counts())

    # Step 5: Keep only relevant columns
    print(f"\nKeeping only 'smiles' and '{class_col}' columns in balanced_data...")
    balanced_data = balanced_data[['smiles', class_col]]

    # Step 6: Rename the label column in admetica_dataset so both frames
    # share the same schema before stacking.
    print(f"\nRenaming 'Activity' column to '{class_col}' in admetica_dataset...")
    admetica_dataset = admetica_dataset.rename(columns={'Activity': class_col})

    # Step 7: Concatenate the two datasets vertically
    print("\nCombining balanced_data and admetica_dataset...")
    combined_data = pd.concat([balanced_data, admetica_dataset], ignore_index=True)

    # Step 8: Save the combined dataset to CSV
    output_file = os.path.join(cyp_folder, f'balanced_admetica_{cyp}.csv')
    combined_data.to_csv(output_file, index=False)
    print(f"\nCombined dataset saved to {output_file}")
    print(f"Final combined dataset shape: {combined_data.shape}")

Loading nx_dataset...
nx_dataset columns: ['smiles', 'pred(Caco-2_LogPapp)', 'pred(CYP3A4_pIC50)', 'pred(CYP2C9_pIC50)', 'pred(CYP2D6_pIC50)', 'pred(CYP3A4_pIC50)_log', 'pred(CYP2C9_pIC50)_log', 'pred(CYP2D6_pIC50)_log', 'CYP3A4_class', 'CYP2C9_class', 'CYP2D6_class']
Converting SMILES to canonical form in nx_dataset...

Loading admetica_dataset...
admetica_dataset columns: ['smiles', 'Activity']
Converting SMILES to canonical form in admetica_dataset...

Initial nx_dataset shape: (273638, 11)
nx_dataset shape after removing overlapping SMILES with admetica_dataset: (273402, 11)

Balancing dataset by 'CYP3A4_class'...
Balanced class distribution:
CYP3A4_class
0    22618
1    22618
Name: count, dtype: int64

Keeping only 'smiles' and 'CYP3A4_class' columns in balanced_data...

Renaming 'Activity' column to 'CYP3A4_class' in admetica_dataset...

Combining balanced_data and admetica_dataset...

Combined dataset saved to /Users/aleksashka/admetica/ADMET/metabolism/cyp3a4-inhibitor/balanced