<a href="https://colab.research.google.com/github/georgyzaouk/BS-Capstone/blob/main/smiles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
!pip install pubchempy pandas openpyxl
!pip install rdkit
!pip install openpyxl



In [32]:
# Imports
import pandas as pd
from pubchempy import get_compounds
from rdkit import Chem
import time
import requests

# Load the two pre-processed source spreadsheets (LiverTox first, DILIrank second).
LiverTox, DILIrank = (
    pd.read_excel(workbook)
    for workbook in ("/content/preLiverTox.xlsx", "/content/preDILIrank.xlsx")
)

# Validate SMILES
def is_valid_smiles(smiles):
    """Return True when *smiles* is a non-empty string that RDKit can parse.

    Args:
        smiles: Candidate SMILES value. Anything that is not a ``str``
            (for example ``None`` when an upstream lookup has no SMILES
            property) is treated as invalid instead of being handed to RDKit.

    Returns:
        bool: ``True`` if ``Chem.MolFromSmiles`` yields a molecule,
        ``False`` otherwise.
    """
    # Guard first: RDKit rejects non-string arguments with an exception, and
    # an empty string parses to an empty molecule rather than None, which
    # would otherwise be reported as "valid".
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    return Chem.MolFromSmiles(smiles) is not None

# Retrieve SMILES from PubChemPy
def get_smiles(drug_name, max_retries=3):
    """Look up *drug_name* by name through PubChemPy and return its SMILES.

    Only the first compound returned by the name search is considered, and
    its canonical SMILES is returned only if ``is_valid_smiles`` accepts it.

    Args:
        drug_name: Compound name passed to ``get_compounds(..., 'name')``.
        max_retries: Maximum number of lookup attempts. A new attempt is made
            only after an error whose text contains ``'ServerBusy'``.

    Returns:
        The SMILES string, or ``None`` when nothing usable was found, a
        non-retryable error occurred, or the attempts were exhausted.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            matches = get_compounds(drug_name, 'name')
            if not matches:
                return None
            candidate = matches[0].canonical_smiles
            return candidate if is_valid_smiles(candidate) else None
        except Exception as err:
            # Give up unless the server reported it was busy and at least
            # one more attempt remains.
            if 'ServerBusy' not in str(err) or attempt >= final_attempt:
                return None
            time.sleep(2)
    return None

# Fallback: use PubChem REST API (simulates web search)
def get_smiles_from_pubchem_web(name, timeout=10):
    """Resolve *name* to a SMILES string through the PubChem PUG REST API.

    Two requests are made: the name is first resolved to a list of CIDs, then
    the SMILES property of the first CID is fetched. The lookup is
    best-effort: any network, HTTP or response-format problem yields ``None``.

    Args:
        name: Compound name to search for. Blank or non-string values are
            rejected without contacting the server.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang a whole batch run.

    Returns:
        The SMILES string reported by PubChem, or ``None`` if the compound
        could not be resolved.
    """
    from urllib.parse import quote

    if not isinstance(name, str) or not name.strip():
        return None

    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"
    try:
        # Percent-encode the name: it is placed in the URL path, and drug
        # names routinely contain spaces, '/', '#', '?' or non-ASCII letters.
        encoded_name = quote(name.strip(), safe='')
        search_api = f"{base_url}/name/{encoded_name}/cids/JSON"
        r = requests.get(search_api, timeout=timeout)
        if r.status_code != 200:
            return None

        cids = r.json().get('IdentifierList', {}).get('CID') or []
        if not cids:
            return None

        smiles_url = f"{base_url}/cid/{cids[0]}/property/CanonicalSMILES/JSON"
        s = requests.get(smiles_url, timeout=timeout)
        if s.status_code != 200:
            return None

        properties = s.json()['PropertyTable']['Properties'][0]
        # NOTE(review): PubChem appears to return this property under
        # 'ConnectivitySMILES' in newer responses even when 'CanonicalSMILES'
        # is requested; accept either key - confirm against the live API.
        return properties.get('CanonicalSMILES') or properties.get('ConnectivitySMILES')
    except (requests.RequestException, ValueError, KeyError, IndexError,
            TypeError, AttributeError):
        # Network failure, non-JSON body, or an unexpected response shape.
        return None

# Resolve SMILES for LiverTox
def resolve_smiles_livertox(row):
    """Resolve a SMILES string for one LiverTox row.

    The ``'Ingredient'`` value is tried first and ``'Brand Name'`` second.
    Missing (NaN/None) or blank cells are skipped rather than being
    stringified, so the literal text ``"nan"`` is never sent to PubChem as a
    compound name.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with ``'Ingredient'``
            and ``'Brand Name'`` entries.

    Returns:
        The first SMILES returned by ``get_smiles``, or ``None`` when neither
        name resolves. A "Not Found" line is printed in the latter case.
    """
    name1 = str(row['Ingredient'])
    name2 = str(row['Brand Name'])

    smiles = None
    for raw_name in (row['Ingredient'], row['Brand Name']):
        if pd.isna(raw_name):
            continue
        candidate = str(raw_name).strip()
        if not candidate:
            continue
        smiles = get_smiles(candidate)
        if smiles:
            break

    if not smiles:
        print(f"[⚠️ Not Found] {name1} / {name2}")
    return smiles

# Resolve SMILES for DILIrank
def resolve_smiles_dilirank(row):
    """Resolve a SMILES string for one DILIrank row.

    A missing (NaN/None) or blank ``'Compound Name'`` is reported as not found
    without querying PubChem, so the literal text ``"nan"`` is never looked up
    as a compound name.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a
            ``'Compound Name'`` entry.

    Returns:
        The SMILES returned by ``get_smiles``, or ``None`` when the name is
        missing or does not resolve. A "Not Found" line is printed in the
        latter case.
    """
    raw_name = row['Compound Name']
    name = str(raw_name)

    smiles = None
    if not pd.isna(raw_name) and name.strip():
        smiles = get_smiles(name.strip())

    if not smiles:
        print(f"[⚠️ Not Found] {name}")
    return smiles

# Start SMILES resolution: one PubChemPy lookup per row of each dataset.
print("\n🔄 Resolving SMILES for LiverTox...")
LiverTox['SMILES'] = LiverTox.apply(resolve_smiles_livertox, axis=1)

print("\n🔄 Resolving SMILES for DILIrank...")
DILIrank['SMILES'] = DILIrank.apply(resolve_smiles_dilirank, axis=1)

# Save only successful SMILES
LiverTox[LiverTox['SMILES'].notna()].to_excel("/content/livertox_with_smiles.xlsx", index=False, engine='openpyxl')
DILIrank[DILIrank['SMILES'].notna()].to_excel("/content/dilirank_with_smiles.xlsx", index=False, engine='openpyxl')

# Log failed compounds
missing_livertox = LiverTox.loc[LiverTox['SMILES'].isna(), ['Ingredient', 'Brand Name']].copy()
missing_dilirank = DILIrank.loc[DILIrank['SMILES'].isna(), ['Compound Name']].copy()

missing_livertox['Source'] = 'LiverTox'
missing_dilirank['Source'] = 'DILIrank'

# Build the display name as text. fillna('') + astype(str) keeps every Name a
# real string, so the .split() in the retry loop below cannot hit a NaN float.
missing_livertox['Name'] = (
    missing_livertox['Ingredient'].fillna('').astype(str)
    + " / "
    + missing_livertox['Brand Name'].fillna('').astype(str)
)
missing_dilirank['Name'] = missing_dilirank['Compound Name'].fillna('').astype(str)

missing_combined = pd.concat([
    missing_livertox[['Name', 'Source']],
    missing_dilirank[['Name', 'Source']]
], ignore_index=True)

missing_combined.to_excel("/content/missing_smiles_summary.xlsx", index=False)

# Second attempt via PubChem Web API
print("\n🔁 Retrying missing compounds using PubChem web API...")
recovered = []
for _, row in missing_combined.iterrows():
    name = row['Name'].split(" / ")[0].strip()  # Use main name
    if not name:
        # Nothing to search for (the source cell was empty).
        continue
    smiles = get_smiles_from_pubchem_web(name)
    # Apply the same RDKit validation as the primary lookup path so both
    # output files hold SMILES of the same quality.
    if smiles and is_valid_smiles(smiles):
        print(f"[✅ Recovered] {row['Name']}")
        recovered.append({
            'Name': row['Name'],
            'Source': row['Source'],
            'SMILES': smiles
        })

# Save recovered compounds. Explicit columns keep the header row in the
# workbook even when nothing was recovered.
recovered_df = pd.DataFrame(recovered, columns=['Name', 'Source', 'SMILES'])
recovered_df.to_excel("/content/recovered_smiles_web.xlsx", index=False)

# ✅ Final report
print("\n✅ SMILES extraction complete!")
print("📉 Missing SMILES count:")
print(f" - LiverTox: {missing_livertox.shape[0]}")
print(f" - DILIrank: {missing_dilirank.shape[0]}")
print(f" - Recovered via web API: {len(recovered)}")
print("📁 Valid SMILES saved in: /content/livertox_with_smiles.xlsx and /content/dilirank_with_smiles.xlsx")
print("📁 Missing compounds logged in: /content/missing_smiles_summary.xlsx")
print("📁 Recovered via web search in: /content/recovered_smiles_web.xlsx")



🔄 Resolving SMILES for LiverTox...
[⚠️ Not Found] Abatacept / Orencia
[⚠️ Not Found] Adalimumab / Humira
[⚠️ Not Found] Aducanumab / Aduhelm
[⚠️ Not Found] Aflibercept / Eylea
[⚠️ Not Found] Alemtuzumab / Campath
[⚠️ Not Found] Alglucerase / Ceredase
[⚠️ Not Found] Alglucosidase alfa / Lumizyme
[⚠️ Not Found] Alirocumab / Praluent
[⚠️ Not Found] Alpha-1 proteinase / Glassia
[⚠️ Not Found] Anakinra / Kineret
[⚠️ Not Found] Androgenic steroids / Group name
[⚠️ Not Found] Anifrolumab / Saphnelo
[⚠️ Not Found] Asparaginase / Eispar
[⚠️ Not Found] Atezolizumab / Tecentriq
[⚠️ Not Found] Atoltivimab / Inmazeb
[⚠️ Not Found] Avacincaptad pegol / Izervay
[⚠️ Not Found] Avelumab / Bavencio
[⚠️ Not Found] Axatilimab / Niktimvo
[⚠️ Not Found] Bamlanivimab / LY-CoV555
[⚠️ Not Found] Basiliximab / Simulect
[⚠️ Not Found] Belantamab-Mafodotin / Blenrep
[⚠️ Not Found] Belimumab / Benlysta
[⚠️ Not Found] Benralizumab / Fasenra
[⚠️ Not Found] Bevacizumab / Avastin
[⚠️ Not Found] Bezlotoxumab / Zinplav