<a href="https://colab.research.google.com/github/georgyzaouk/BS-Capstone/blob/main/smilescomp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pubchempy pandas openpyxl
!pip install rdkit
!pip install openpyxl



In [None]:
import pandas as pd
from pubchempy import get_compounds
from rdkit import Chem
import time
import openpyxl

# Load the data
# Read the two pre-processed drug tables from the Colab working directory.
LiverTox = pd.read_excel(io="/content/preLiverTox.xlsx")
DILIrank = pd.read_excel(io="/content/preDILIrank.xlsx")

# Function to validate the SMILES
def is_valid_smiles(smiles):
    """Return True if *smiles* is a non-blank string that RDKit can parse.

    Args:
        smiles: Candidate SMILES value. Anything that is not a ``str``
            (for example ``None`` or a NaN float from an empty spreadsheet
            cell) is treated as invalid instead of being passed to RDKit.

    Returns:
        bool: ``True`` when ``Chem.MolFromSmiles`` yields a molecule,
        ``False`` otherwise.
    """
    # Reject non-strings up front: handing None/NaN to MolFromSmiles raises
    # rather than returning None, which would break the boolean contract.
    if not isinstance(smiles, str):
        return False
    # A blank string carries no structure. NOTE(review): RDKit appears to
    # parse "" into an atom-less molecule, which would otherwise count as
    # valid -- confirm against the installed RDKit version.
    if not smiles.strip():
        return False
    return Chem.MolFromSmiles(smiles) is not None

# Function to get SMILES from PubChem
def get_smiles(drug_name, max_retries=3, retry_delay=2):
    """Look up a drug by name on PubChem and return its validated SMILES.

    Only the first compound returned by PubChem is considered. Its
    ``canonical_smiles`` value is returned when ``is_valid_smiles`` accepts it.

    Args:
        drug_name: Name to search for. Non-string values (``None``, NaN from
            an empty spreadsheet cell) and blank strings are skipped without
            contacting PubChem.
        max_retries: Maximum number of lookup attempts. Only errors whose
            message contains ``'ServerBusy'`` are retried.
        retry_delay: Seconds to sleep before retrying after a ``ServerBusy``
            error.

    Returns:
        The SMILES string, or ``None`` when the name is unusable, nothing
        was found, the SMILES is missing/invalid, or the lookup failed.
    """
    # pandas represents empty cells as NaN (a float); querying PubChem with
    # such a value wastes a request and could match an unrelated compound.
    if not isinstance(drug_name, str):
        return None
    query = drug_name.strip()
    if not query:
        return None

    for attempt in range(max_retries):
        try:
            compounds = get_compounds(query, 'name')
            if compounds:
                smiles = compounds[0].canonical_smiles
                # Check for a missing SMILES here so it is reported as
                # "no result" instead of surfacing as a fetch error below.
                if smiles and is_valid_smiles(smiles):
                    return smiles
            return None
        except Exception as e:
            # Best-effort lookup: one bad name must not abort the whole
            # DataFrame.apply run, so errors are reported and swallowed.
            if 'ServerBusy' in str(e) and attempt < max_retries - 1:
                time.sleep(retry_delay)
            else:
                print(f"Error fetching {drug_name}: {e}")
                return None
    # Reached only when max_retries <= 0 (the loop body never ran).
    return None

# For LiverTox, use both ingredients and drug_brand
def resolve_smiles_livertox(row):
    """Resolve a SMILES string for one LiverTox row.

    The 'Ingredient' value is looked up first; the 'Brand Name' value is
    only queried when that first lookup yields nothing (None or an empty
    string). The result of the last lookup performed is returned.
    """
    # `or` short-circuits, so the brand-name lookup runs only as a fallback.
    return get_smiles(row['Ingredient']) or get_smiles(row['Brand Name'])

# For DILIrank, only try the compound name (single column)
def resolve_smiles_dilirank(row):
    """Resolve a SMILES string for one DILIrank row via its 'Compound Name'."""
    compound_name = row['Compound Name']
    return get_smiles(compound_name)

# Resolve one SMILES per row (row-wise apply) and store it in a new column.
LiverTox['SMILES'] = LiverTox.apply(resolve_smiles_livertox, axis='columns')
DILIrank['SMILES'] = DILIrank.apply(resolve_smiles_dilirank, axis='columns')

# Write each annotated table to its own Excel workbook, without the index.
LiverTox.to_excel("/content/livertox_with_smiles.xlsx", engine='openpyxl', index=False)
DILIrank.to_excel("/content/dilirank_with_smiles.xlsx", engine='openpyxl', index=False)

print("✅ Done! SMILES saved for both datasets.")

✅ Done! SMILES saved for both datasets.
