### 주의 : 본 코드는 책에 대한 학습 및 교육외에 배포를 금지합니다.
### Warning: This code is prohibited from distribution except for learning and educational purposes related to the book.
2장 화학 정보의 탐색: Chemical Space와 빅데이터
- by Keunhong Jeong

![image.png](attachment:image.png)

In [1]:
!pip install rdkit



In [2]:
import requests

# Fetch the PubChem web page for compound CID 12345.
url = "https://pubchem.ncbi.nlm.nih.gov/compound/12345"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop on HTTP errors so an error page is not saved as if it were the compound page.
response.raise_for_status()

# Save the raw HTML to a UTF-8 text file.
with open("pubchem_page.txt", "w", encoding="utf-8") as file:
    file.write(response.text)


In [4]:
import requests

# Compound to look up, identified by its PubChem CID.
cid = 12345

# PUG REST endpoint returning the MolecularWeight property of one CID as JSON.
api_url = (
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
    f"{cid}/property/MolecularWeight/JSON"
)

# Send the request and stop immediately on an HTTP error status.
response = requests.get(api_url, timeout=30)
response.raise_for_status()

# Decode the JSON body into Python objects.
data = response.json()

# Pull the weight out of the nested property table; only the lookup is
# guarded, the success message is printed from the else branch.
try:
    mw = data["PropertyTable"]["Properties"][0]["MolecularWeight"]
except (KeyError, IndexError, TypeError):
    # Reached when the response does not have the expected structure.
    print("Molecular Weight 정보를 찾을 수 없습니다. (CID가 유효한지 확인하세요)")
else:
    print("Molecular Weight:", mw)


Molecular Weight: 132.11


In [4]:
import requests
import json

compound_name = "glucose"

# PUG REST request for several properties of a compound looked up by name.
# ConnectivitySMILES is requested explicitly because that is the key read
# below; previously CanonicalSMILES was requested while ConnectivitySMILES
# was read, so the request and the lookup did not match.
url = (
    f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/property/"
    "MolecularFormula,MolecularWeight,ConnectivitySMILES,InChI,InChIKey/JSON"
)

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop on HTTP errors

data = response.json()

properties = data['PropertyTable']['Properties'][0]

# Read each value with .get so a missing property prints a placeholder.
print('분자식:', properties.get('MolecularFormula', '정보 없음'))
print('분자량:', properties.get('MolecularWeight', '정보 없음'))
print('SMILES:', properties.get('ConnectivitySMILES', '정보 없음'))
print('InChI:', properties.get('InChI', '정보 없음'))
print('InChIKey:', properties.get('InChIKey', '정보 없음'))

분자식: C6H12O6
분자량: 180.16
SMILES: C(C1C(C(C(C(O1)O)O)O)O)O
InChI: InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1
InChIKey: WQZGKKKJIJFFOK-GASJEMHNSA-N


In [5]:
import requests
import json
import urllib.parse

# Query structure as a SMILES string.
smiles = "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O"

# quote() leaves "/" unescaped by default, and "/" is both a URL path
# separator and a valid SMILES character. Escape everything (safe="") and send
# the SMILES as the "smiles" query argument instead of as a path segment.
encoded_smiles = urllib.parse.quote(smiles, safe="")

# PUG REST request: the input type ("smiles") stays in the path, its value
# travels in the query string.
properties = "MolecularFormula,MolecularWeight,ConnectivitySMILES,InChI,InChIKey"
url = (
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/"
    f"{properties}/JSON?smiles={encoded_smiles}"
)

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop on HTTP errors

data = response.json()

# Print the properties of the first returned record.
try:
    properties_data = data.get('PropertyTable', {}).get('Properties', [{}])[0]

    print('분자식:', properties_data.get('MolecularFormula', '정보 없음'))
    print('분자량:', properties_data.get('MolecularWeight', '정보 없음'))
    print('Canonical SMILES:', properties_data.get('ConnectivitySMILES', '정보 없음'))
    print('InChI:', properties_data.get('InChI', '정보 없음'))
    print('InChIKey:', properties_data.get('InChIKey', '정보 없음'))

except (IndexError, TypeError):
    # Reached when the property list is empty or not shaped as expected.
    print("화합물 정보를 찾을 수 없습니다.")

분자식: C6H12O6
분자량: 180.16
Canonical SMILES: C(C1C(C(C(C(O1)O)O)O)O)O
InChI: InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1
InChIKey: WQZGKKKJIJFFOK-GASJEMHNSA-N


In [6]:
import requests
import json

compound_name = "ethanol"

# PUG REST request for several properties of a compound looked up by name.
# ConnectivitySMILES is requested explicitly because that is the key read below.
url = (
    f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/property/"
    "MolecularFormula,MolecularWeight,ConnectivitySMILES,InChI,InChIKey/JSON"
)
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop on HTTP errors instead of trying to parse an error body.
response.raise_for_status()

# Decode the JSON body into Python objects.
data = response.json()

# Take the first property record. "or [{}]" covers both a missing key and an
# empty list, which would otherwise raise IndexError on [0].
properties = (data.get('PropertyTable', {}).get('Properties') or [{}])[0]

# Read each value with .get so a missing property prints a placeholder.
print('분자식:', properties.get('MolecularFormula', '정보 없음'))
print('분자량:', properties.get('MolecularWeight', '정보 없음'))
print('ConnectivitySMILES:', properties.get('ConnectivitySMILES', '정보 없음'))
print('InChI:', properties.get('InChI', '정보 없음'))
print('InChIKey:', properties.get('InChIKey', '정보 없음'))

분자식: C2H6O
분자량: 46.07
ConnectivitySMILES: CCO
InChI: InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3
InChIKey: LFQSCWFLJHTTHZ-UHFFFAOYSA-N


In [8]:
!pip install pubchempy==1.0.4

Collecting pubchempy==1.0.4
  Using cached PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py): started
  Building wheel for pubchempy (setup.py): finished with status 'done'
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13840 sha256=e4e1217740b72e2c627030b0a0a6db74d2b7f9901d43ed77becf99f0ab378546
  Stored in directory: c:\users\doas1\appdata\local\pip\cache\wheels\78\0f\d0\080f82ce0d7fdc771401b6acac304bd2ee77d67dee34737bd6
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [9]:
import pubchempy as pcp

# Take the first PubChemPy match for the name "glucose".
compound = pcp.get_compounds('glucose', 'name')[0]

# Print the selected attributes one per line, in this fixed order.
for attribute in (
    'molecular_formula',
    'molecular_weight',
    'canonical_smiles',
    'isomeric_smiles',
    'inchi',
    'inchikey',
    'iupac_name',
):
    print(getattr(compound, attribute))


C6H12O6
180.16
None
None
InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1
WQZGKKKJIJFFOK-GASJEMHNSA-N
(3R,4S,5S,6R)-6-(hydroxymethyl)oxane-2,3,4,5-tetrol


In [10]:
import pubchempy as pcp
import requests

# Take the first PubChemPy match for the name "glucose".
compound = pcp.get_compounds('glucose', 'name')[0]

print(compound.molecular_formula)
print(compound.molecular_weight)

# PubChemPy can report the SMILES attributes as None, so fetch the SMILES
# directly from PUG REST using the compound's CID.
cid = compound.cid
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,IsomericSMILES,SMILES,ConnectivitySMILES/JSON"
r = requests.get(url, timeout=30)
# Surface HTTP failures here rather than as a confusing parse or lookup error below.
r.raise_for_status()
data = r.json()
# "or [{}]" covers both a missing key and an empty list, which would
# otherwise raise IndexError on [0].
p = (data.get("PropertyTable", {}).get("Properties") or [{}])[0]

# Use the first SMILES variant that is present in the response.
smiles = p.get("IsomericSMILES") or p.get("CanonicalSMILES") or p.get("SMILES") or p.get("ConnectivitySMILES")
print(smiles)

print(compound.inchi)
print(compound.inchikey)
print(compound.iupac_name)

C6H12O6
180.16
C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O
InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1
WQZGKKKJIJFFOK-GASJEMHNSA-N
(3R,4S,5S,6R)-6-(hydroxymethyl)oxane-2,3,4,5-tetrol


In [11]:
import pubchempy as pcp

# Names of the compounds to look up.
compounds = ['glucose', 'ethanol', 'aspirin', 'caffeine', 'ibuprofen']

# Maps each compound name to a dict of its extracted properties.
results = {}

for compound_name in compounds:
    matches = pcp.get_compounds(compound_name, 'name')

    # Names without any match are left out of the results.
    if not matches:
        continue

    # Only the first match is used.
    compound = matches[0]
    results[compound_name] = {
        'Molecular Formula': compound.molecular_formula,
        'Molecular Weight': compound.molecular_weight,
        'Canonical SMILES': compound.canonical_smiles,
        'IUPAC Name': compound.iupac_name,
    }

# Print one indented section per compound, followed by a blank line.
for compound_name, properties in results.items():
    print(f'{compound_name}:')
    for property_name, value in properties.items():
        print(f'  {property_name}: {value}')
    print()



glucose:
  Molecular Formula: C6H12O6
  Molecular Weight: 180.16
  Canonical SMILES: None
  IUPAC Name: (3R,4S,5S,6R)-6-(hydroxymethyl)oxane-2,3,4,5-tetrol

ethanol:
  Molecular Formula: C2H6O
  Molecular Weight: 46.07
  Canonical SMILES: None
  IUPAC Name: ethanol

aspirin:
  Molecular Formula: C9H8O4
  Molecular Weight: 180.16
  Canonical SMILES: None
  IUPAC Name: 2-acetyloxybenzoic acid

caffeine:
  Molecular Formula: C8H10N4O2
  Molecular Weight: 194.19
  Canonical SMILES: None
  IUPAC Name: 1,3,7-trimethylpurine-2,6-dione

ibuprofen:
  Molecular Formula: C13H18O2
  Molecular Weight: 206.28
  Canonical SMILES: None
  IUPAC Name: 2-[4-(2-methylpropyl)phenyl]propanoic acid



In [12]:
import pubchempy as pcp
import requests

# Names of the compounds to look up.
compounds = ['glucose', 'ethanol', 'aspirin', 'caffeine', 'ibuprofen']

# Maps each compound name to a dict of its extracted properties.
results = {}

for compound_name in compounds:

    compound_list = pcp.get_compounds(compound_name, 'name')

    # Names without any match are left out of the results.
    if compound_list:
        compound = compound_list[0]

        molecular_formula = compound.molecular_formula
        molecular_weight = compound.molecular_weight
        canonical_smiles = compound.canonical_smiles
        iupac_name = compound.iupac_name

        # When PubChemPy reports no SMILES, fetch it from PUG REST by CID.
        if canonical_smiles is None:
            cid = compound.cid
            url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,SMILES/JSON"
            r = requests.get(url, timeout=30)
            # Surface HTTP failures instead of parsing an error body.
            r.raise_for_status()
            data = r.json()
            # "or [{}]" covers both a missing key and an empty list, which
            # would otherwise raise IndexError on [0].
            props = (data.get("PropertyTable", {}).get("Properties") or [{}])[0]
            canonical_smiles = props.get("CanonicalSMILES") or props.get("SMILES")

        results[compound_name] = {
            'Molecular Formula': molecular_formula,
            'Molecular Weight': molecular_weight,
            'Canonical SMILES': canonical_smiles,
            'IUPAC Name': iupac_name,
        }

# Print one indented section per compound, followed by a blank line.
for compound_name, properties in results.items():
    print(f'{compound_name}:')
    for property_name, value in properties.items():
        print(f'  {property_name}: {value}')
    print()

glucose:
  Molecular Formula: C6H12O6
  Molecular Weight: 180.16
  Canonical SMILES: C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O
  IUPAC Name: (3R,4S,5S,6R)-6-(hydroxymethyl)oxane-2,3,4,5-tetrol

ethanol:
  Molecular Formula: C2H6O
  Molecular Weight: 46.07
  Canonical SMILES: CCO
  IUPAC Name: ethanol

aspirin:
  Molecular Formula: C9H8O4
  Molecular Weight: 180.16
  Canonical SMILES: CC(=O)OC1=CC=CC=C1C(=O)O
  IUPAC Name: 2-acetyloxybenzoic acid

caffeine:
  Molecular Formula: C8H10N4O2
  Molecular Weight: 194.19
  Canonical SMILES: CN1C=NC2=C1C(=O)N(C(=O)N2C)C
  IUPAC Name: 1,3,7-trimethylpurine-2,6-dione

ibuprofen:
  Molecular Formula: C13H18O2
  Molecular Weight: 206.28
  Canonical SMILES: CC(C)CC1=CC=C(C=C1)C(C)C(=O)O
  IUPAC Name: 2-[4-(2-methylpropyl)phenyl]propanoic acid



In [15]:
import pubchempy as pcp
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import pandas as pd
import time
import requests

def get_smiles_by_cid(cid):
    """Return a SMILES string for a PubChem CID via PUG REST, or None.

    Used as a fallback when PubChemPy reports no SMILES for a compound.

    Args:
        cid: PubChem compound identifier.

    Returns:
        The first available value among IsomericSMILES, CanonicalSMILES,
        SMILES and ConnectivitySMILES. None is returned when the HTTP status
        is not 200, the body is not valid JSON, or no SMILES is present.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/IsomericSMILES,CanonicalSMILES,SMILES,ConnectivitySMILES/JSON"
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        return None
    try:
        data = r.json()
    except ValueError:
        # A body that is not JSON is treated like any other failed lookup.
        return None
    # "or [{}]" covers both a missing key and an empty list, which would
    # otherwise raise IndexError on [0].
    props = (data.get("PropertyTable", {}).get("Properties") or [{}])[0]
    return (props.get("IsomericSMILES")
            or props.get("CanonicalSMILES")
            or props.get("SMILES")
            or props.get("ConnectivitySMILES"))

# Resolve acetaminophen's CID by name, then load its compound record.
acetaminophen_cid = pcp.get_cids('acetaminophen', 'name')[0]
acetaminophen = pcp.Compound.from_cid(acetaminophen_cid)

# Use the PubChemPy SMILES when available, otherwise fall back to PUG REST.
acet_smiles = acetaminophen.isomeric_smiles or acetaminophen.canonical_smiles
if acet_smiles is None:
    acet_smiles = get_smiles_by_cid(acetaminophen_cid)
if acet_smiles is None:
    # Without a reference structure nothing below can work; fail with a clear message.
    raise ValueError("기준 화합물(acetaminophen)의 SMILES를 가져오지 못했습니다.")

acetaminophen_mol = Chem.MolFromSmiles(acet_smiles)
if acetaminophen_mol is None:
    raise ValueError(f"기준 SMILES를 RDKit 분자로 변환하지 못했습니다: {acet_smiles}")

similar_smiles = []

# The reference fingerprint does not depend on the loop, so compute it once.
# NOTE(review): recent RDKit releases reportedly flag GetMorganFingerprint as
# deprecated in favour of rdFingerprintGenerator - confirm against the
# installed version before switching.
fp1 = AllChem.GetMorganFingerprint(acetaminophen_mol, 2)

# CIDs whose lookup or processing failed; reported once after the loop.
skipped_cids = []

# Scan the 1000 CIDs surrounding acetaminophen's own CID.
for cid in range(acetaminophen_cid - 500, acetaminophen_cid + 500):
    time.sleep(0.1)  # small delay to avoid overloading the PubChem servers
    try:
        compound = pcp.Compound.from_cid(cid)

        # Same fallback as for the reference compound.
        smi = compound.isomeric_smiles or compound.canonical_smiles
        if smi is None:
            smi = get_smiles_by_cid(cid)
        if smi is None:
            continue

        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            fp2 = AllChem.GetMorganFingerprint(mol, 2)
            similarity = DataStructs.TanimotoSimilarity(fp1, fp2)

            if similarity >= 0.2:  # similarity cut-off (the book uses 0.7)
                similar_smiles.append(smi)
                if len(similar_smiles) >= 10:  # keep at most 10 compounds (the book uses 100)
                    break
    except Exception:
        # Best effort: one failing CID must not abort the scan, but it is
        # recorded so the failures are visible afterwards.
        skipped_cids.append(cid)
        continue

if skipped_cids:
    print(f"조회/처리에 실패하여 건너뛴 CID: {len(skipped_cids)}개")

pd.DataFrame(similar_smiles, columns=['SMILES']).to_csv('drug.csv', index=False)
print("저장 완료: drug.csv")



저장 완료: drug.csv




In [17]:
import requests
import time
from urllib.parse import quote
import pandas as pd
from rdkit.Chem import PandasTools

# Shared request headers (PubChem is occasionally unreliable when no User-Agent is sent).
HEADERS = {"User-Agent": "Mozilla/5.0 (requests)"}

def query_pubchem_for_similar_compounds(smiles, threshold=60, n_records=400, retries=3):
    """Submit a PubChem similarity search and return its list key.

    The SMILES is sent as a URL-encoded ``smiles`` query argument. The earlier
    approach replaced "/" with ".", which turns a stereo SMILES into a
    different, disconnected structure.

    Args:
        smiles: Query structure as a SMILES string.
        threshold: Value sent as the ``Threshold`` argument.
        n_records: Value sent as the ``MaxRecords`` argument.
        retries: Total number of attempts; must be at least 1.

    Returns:
        The ``ListKey`` found under ``Waiting`` in the JSON response.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        Exception: The error of the last attempt when every attempt failed.
    """
    if retries < 1:
        # Previously this path ended in "raise None", i.e. a TypeError.
        raise ValueError("retries must be at least 1")

    escaped_smiles = quote(smiles, safe="")
    url = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/JSON"
        f"?smiles={escaped_smiles}&Threshold={threshold}&MaxRecords={n_records}"
    )

    last_err = None
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=HEADERS, timeout=30)
            r.raise_for_status()
            return r.json()["Waiting"]["ListKey"]
        except (requests.RequestException, KeyError, TypeError, ValueError) as e:
            last_err = e
            # No point in sleeping after the final attempt.
            if attempt < retries - 1:
                time.sleep(1)

    raise last_err


def check_and_download(key, attempts=30):
    """Poll a PubChem list key until its CID list is available.

    Args:
        key: List key returned by the similarity query.
        attempts: Number of polls before giving up.

    Returns:
        The value stored under ``IdentifierList`` -> ``CID`` in the response.

    Raises:
        ValueError: If no CID list appeared within ``attempts`` polls.
    """
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{}/cids/JSON".format(key)
    print(f"작업 {key}을(를) URL {url}에서 조회 중...", end="")

    remaining = attempts
    while remaining:
        reply = requests.get(url, headers=HEADERS, timeout=30)
        reply.raise_for_status()
        payload = reply.json()

        # The result is ready once the payload carries IdentifierList/CID.
        if "IdentifierList" in payload:
            identifiers = payload["IdentifierList"]
            if "CID" in identifiers:
                found = identifiers["CID"]
                print(" 완료")
                return found

        # Not ready yet: emit a progress dot and wait before the next poll.
        remaining -= 1
        print(".", end="")
        time.sleep(10)

    raise ValueError(f"작업 키: {key}에 대한 일치 항목을 찾을 수 없습니다.")


def smiles_from_pubchem_cids(cids):
    """Return one SMILES (or None) per CID, in the order of ``cids``.

    All SMILES property names are requested in a single call and the rows are
    matched to the input by their ``CID`` field. The result therefore always
    has ``len(cids)`` entries, even when the server returns fewer rows or
    returns them in a different order.

    Args:
        cids: Iterable of PubChem compound identifiers.

    Returns:
        A list with, for each CID, the first available value among
        CanonicalSMILES, IsomericSMILES, SMILES and ConnectivitySMILES, or
        None when the CID has no row or no SMILES in the response.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
    """
    cids = list(cids)
    if not cids:
        # Nothing to look up; avoid sending a malformed request.
        return []

    url = (
        f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{','.join(map(str, cids))}"
        "/property/CanonicalSMILES,IsomericSMILES,SMILES,ConnectivitySMILES/JSON"
    )
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    props = r.json().get("PropertyTable", {}).get("Properties", [])

    # Same preference order as before: CanonicalSMILES first, then the others.
    cid_to_smiles = {}
    for item in props:
        cid = item.get("CID")
        if cid is None:
            continue
        cid_to_smiles[cid] = (
            item.get("CanonicalSMILES")
            or item.get("IsomericSMILES")
            or item.get("SMILES")
            or item.get("ConnectivitySMILES")
        )

    return [cid_to_smiles.get(cid) for cid in cids]


# Query structure: acetaminophen as a SMILES string.
acetaminophen_smiles = 'CC(=O)NC1=CC=C(C=C1)O'

# Submit the similarity search, wait for the CID list, then resolve the CIDs to SMILES.
job_key = query_pubchem_for_similar_compounds(
    acetaminophen_smiles,
    n_records=100,
    threshold=90,
)
similar_cids = check_and_download(job_key)
similar_smiles = smiles_from_pubchem_cids(similar_cids)

# Build the result table and drop rows without a SMILES, which would be a
# problem for the RDKit conversion below.
similar_compounds_df = (
    pd.DataFrame({"smiles": similar_smiles, "CIDs": similar_cids})
    .dropna(subset=["smiles"])
    .reset_index(drop=True)
)

# Add a molecule column built from the "smiles" column.
PandasTools.AddMoleculeColumnToFrame(similar_compounds_df, smilesCol="smiles")

# Write the table to CSV without the index column.
similar_compounds_df.to_csv("drug1.csv", index=False)
print("저장 완료: drug1.csv")

작업 3402564704988971929을(를) URL https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/3402564704988971929/cids/JSON에서 조회 중.... 완료
저장 완료: drug1.csv


In [19]:
import requests
import time
from urllib.parse import quote
import pandas as pd
from rdkit.Chem import PandasTools, Lipinski, MolFromSmiles

class PubChemSimilaritySearcher:
    """Find PubChem compounds similar to a query SMILES and filter them.

    ``search_similar_compounds`` runs the whole workflow: submit the
    similarity search, poll the returned list key until the CID list is
    ready, resolve the CIDs to SMILES, and keep only the compounds accepted
    by ``pass_lipinski``.
    """

    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Seconds to sleep between two polls of a pending list key.
    POLL_INTERVAL = 10

    def __init__(self, smiles, threshold=60, n_records=400, attempts=30):
        """Store the search settings.

        Args:
            smiles: Query structure as a SMILES string.
            threshold: Value sent as the ``Threshold`` argument.
            n_records: Value sent as the ``MaxRecords`` argument.
            attempts: Maximum number of polls while waiting for the result.
        """
        self.smiles = smiles
        self.threshold = threshold
        self.n_records = n_records
        self.attempts = attempts

    def _build_query_url(self):
        """Return the similarity-search URL for the stored settings.

        The SMILES is sent as a fully URL-encoded ``smiles`` query argument.
        This avoids two earlier problems: "/" was replaced by ".", which
        changes the structure, and a line continuation inside the f-string
        put the next line's indentation into the URL.
        """
        escaped_smiles = quote(self.smiles, safe="")
        return (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/JSON"
            f"?smiles={escaped_smiles}&Threshold={self.threshold}&MaxRecords={self.n_records}"
        )

    def query(self):
        """Submit the similarity search and return its list key.

        Raises:
            requests.HTTPError: If PubChem answers with an error status.
        """
        r = requests.get(self._build_query_url(), timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()

        # The search runs asynchronously; the response carries a ListKey to poll.
        return r.json()["Waiting"]["ListKey"]

    def check_and_download(self, key):
        """Poll a list key until its CID list is available and return it.

        Args:
            key: List key returned by ``query``.

        Returns:
            The value stored under ``IdentifierList`` -> ``CID``.

        Raises:
            ValueError: If no CID list appeared within ``self.attempts`` polls.
        """
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{key}/cids/JSON"
        print(f"Querying for job {key} at URL {url}...", end="")

        # Count down on a local copy so self.attempts keeps its value and the
        # searcher can be reused for another search.
        remaining = self.attempts
        while remaining > 0:
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
            response = r.json()

            # The result is ready once the payload carries IdentifierList/CID.
            if "IdentifierList" in response and "CID" in response["IdentifierList"]:
                print(" 완료")
                return response["IdentifierList"]["CID"]

            remaining -= 1
            print(".", end="")
            time.sleep(self.POLL_INTERVAL)

        raise ValueError(f"Could not find matches for job key: {key}")

    def smiles_from_pubchem_cids(self, cids):
        """Return one SMILES (or None) per CID, in the order of ``cids``.

        All SMILES property names are requested in a single call and the rows
        are matched to the input by their ``CID`` field, so the result always
        has ``len(cids)`` entries.

        Raises:
            requests.HTTPError: If PubChem answers with an error status.
        """
        cids = list(cids)
        if not cids:
            # Nothing to look up; avoid sending a malformed request.
            return []

        url = (
            f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{','.join(map(str, cids))}"
            "/property/CanonicalSMILES,IsomericSMILES,SMILES,ConnectivitySMILES/JSON"
        )
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()
        props = r.json().get("PropertyTable", {}).get("Properties", [])

        # Preference order: Canonical, Isomeric, SMILES, Connectivity.
        cid_to_smiles = {}
        for item in props:
            cid = item.get("CID")
            if cid is None:
                continue
            cid_to_smiles[cid] = (
                item.get("CanonicalSMILES")
                or item.get("IsomericSMILES")
                or item.get("SMILES")
                or item.get("ConnectivitySMILES")
            )

        return [cid_to_smiles.get(cid) for cid in cids]

    def pass_lipinski(self, smiles):
        """Return True when the SMILES parses and meets all four limits.

        The limits are: H-bond donors <= 5, H-bond acceptors <= 10, exact
        molecular weight <= 500, and the first Crippen descriptor <= 5.
        A SMILES that RDKit cannot parse yields False.
        """
        mol = MolFromSmiles(smiles)
        if mol is None:
            return False
        return (
            Lipinski.NumHDonors(mol) <= 5 and
            Lipinski.NumHAcceptors(mol) <= 10 and
            Lipinski.rdMolDescriptors.CalcExactMolWt(mol) <= 500 and
            Lipinski.rdMolDescriptors.CalcCrippenDescriptors(mol)[0] <= 5
        )

    def search_similar_compounds(self):
        """Run the full search and return the filtered result table.

        Returns:
            A DataFrame with the columns ``smiles`` and ``CIDs`` plus the
            molecule column added by ``PandasTools.AddMoleculeColumnToFrame``.
            It contains only the compounds accepted by ``pass_lipinski``.
        """
        job_key = self.query()
        similar_cids = self.check_and_download(job_key)
        similar_smiles = self.smiles_from_pubchem_cids(similar_cids)

        similar_compounds_df = pd.DataFrame({"smiles": similar_smiles, "CIDs": similar_cids})

        # Rows without a SMILES cannot be converted by RDKit.
        similar_compounds_df = similar_compounds_df.dropna(subset=["smiles"]).reset_index(drop=True)

        # astype(bool) keeps the mask a boolean indexer even for an empty
        # table. copy() makes the filtered frame independent, so adding a
        # column below does not write into a slice of the unfiltered frame.
        keep = similar_compounds_df["smiles"].apply(self.pass_lipinski).astype(bool)
        similar_compounds_df = similar_compounds_df[keep].copy()

        # Add a molecule column built from the "smiles" column.
        PandasTools.AddMoleculeColumnToFrame(similar_compounds_df, smilesCol="smiles")

        return similar_compounds_df


# Query structure: acetaminophen as a SMILES string.
acetaminophen_smiles = 'CC(=O)NC1=CC=C(C=C1)O'

# Build a searcher for compounds similar to acetaminophen:
# similarity threshold 90, at most 20 records.
searcher = PubChemSimilaritySearcher(acetaminophen_smiles, threshold=90, n_records=20)

# Run the search and write the resulting table to CSV without the index column.
similar_compounds_df = searcher.search_similar_compounds()
similar_compounds_df.to_csv("drug2.csv", index=False)
print("저장 완료: drug2.csv")

Querying for job 2900230659270621711 at URL https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/2900230659270621711/cids/JSON.... 완료
저장 완료: drug2.csv


In [21]:
import requests
import time
from urllib.parse import quote
import pandas as pd
from rdkit.Chem import PandasTools

def query_pubchem_for_similar_compounds(smiles, threshold=60, n_records=400):
    """Submit a PubChem similarity search and return its list key.

    The SMILES is sent as a URL-encoded ``smiles`` query argument. The earlier
    approach replaced "/" with ".", which turns a stereo SMILES into a
    different, disconnected structure.

    Args:
        smiles: Query structure as a SMILES string.
        threshold: Value sent as the ``Threshold`` argument.
        n_records: Value sent as the ``MaxRecords`` argument.

    Returns:
        The ``ListKey`` found under ``Waiting`` in the JSON response.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
    """
    escaped_smiles = quote(smiles, safe="")
    url = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/JSON"
        f"?smiles={escaped_smiles}&Threshold={threshold}&MaxRecords={n_records}"
    )
    # A timeout keeps the call from hanging forever if the server never answers.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    key = r.json()["Waiting"]["ListKey"]
    return key

def check_and_download(key, attempts=30):
    """Poll a PubChem list key until its CID list is available.

    Args:
        key: List key returned by the similarity query.
        attempts: Number of polls before giving up; zero or a negative value
            means no poll is made.

    Returns:
        The value stored under ``IdentifierList`` -> ``CID`` in the response.

    Raises:
        ValueError: If no CID list appeared within ``attempts`` polls.
        requests.HTTPError: If PubChem answers with an error status.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{key}/cids/JSON"
    print(f"작업 {key}을(를) URL {url}에서 조회 중...", end="")

    remaining = attempts
    # "> 0" instead of plain truthiness so a negative count cannot loop forever.
    while remaining > 0:
        # A timeout keeps a single poll from hanging forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        response = r.json()

        # The result is ready once the payload carries IdentifierList/CID.
        if "IdentifierList" in response and "CID" in response["IdentifierList"]:
            # End the progress line so later output starts on a new line.
            print(" 완료")
            return response["IdentifierList"]["CID"]

        remaining -= 1
        print(".", end="")
        time.sleep(10)

    raise ValueError(f"작업 키: {key}에 대한 일치 항목을 찾을 수 없습니다.")

def smiles_from_pubchem_cids(cids):
    """Return one SMILES (or None) per CID, in the order of ``cids``.

    All SMILES property names are requested in a single call and the rows are
    matched to the input by their ``CID`` field. The result therefore always
    has ``len(cids)`` entries. Previously an empty first response produced an
    empty list no matter how many CIDs were passed in.

    Args:
        cids: Iterable of PubChem compound identifiers.

    Returns:
        A list with, for each CID, the first available value among
        CanonicalSMILES, IsomericSMILES, SMILES and ConnectivitySMILES, or
        None when the CID has no row or no SMILES in the response.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
    """
    cids = list(cids)
    if not cids:
        # Nothing to look up; avoid sending a malformed request.
        return []

    url = (
        f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{','.join(map(str, cids))}"
        "/property/CanonicalSMILES,IsomericSMILES,SMILES,ConnectivitySMILES/JSON"
    )
    # A timeout keeps the call from hanging forever if the server never answers.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    props = r.json().get("PropertyTable", {}).get("Properties", [])

    # Preference order: Canonical, Isomeric, SMILES, Connectivity.
    cid_to_smiles = {}
    for item in props:
        cid = item.get("CID")
        if cid is None:
            continue
        cid_to_smiles[cid] = (
            item.get("CanonicalSMILES")
            or item.get("IsomericSMILES")
            or item.get("SMILES")
            or item.get("ConnectivitySMILES")
        )

    return [cid_to_smiles.get(cid) for cid in cids]

# Query structure: acetaminophen as a SMILES string.
acetaminophen_smiles = 'CC(=O)NC1=CC=C(C=C1)O'

# Submit the similarity search, wait for the CID list, then resolve the CIDs to SMILES.
job_key = query_pubchem_for_similar_compounds(
    acetaminophen_smiles,
    n_records=10,
    threshold=90,
)
similar_cids = check_and_download(job_key)
similar_smiles = smiles_from_pubchem_cids(similar_cids)

# Build the result table and drop rows without a SMILES, as a minimal guard
# against RDKit errors in the conversion below.
similar_compounds_df = (
    pd.DataFrame({"smiles": similar_smiles, "CIDs": similar_cids})
    .dropna(subset=["smiles"])
    .reset_index(drop=True)
)

# Add a molecule column built from the "smiles" column.
PandasTools.AddMoleculeColumnToFrame(similar_compounds_df, smilesCol="smiles")

# Write the table to CSV without the index column.
similar_compounds_df.to_csv("drug1.csv", index=False)
print("저장 완료: drug1.csv")

작업 3845458013913523792을(를) URL https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/3845458013913523792/cids/JSON에서 조회 중....저장 완료: drug1.csv
