### 주의 : 본 코드는 책에 대한 학습 및 교육외에 배포를 금지합니다.
### Warning: This code is prohibited from distribution except for learning and educational purposes related to the book.
2장 화학 정보의 탐색: Chemical Space와 빅데이터
- by Keunhong Jeong

![image.png](attachment:image.png)

In [None]:
!pip install rdkit==2023.03.01

In [None]:
import requests

# Fetch the PubChem compound page (the compound ID is the last URL segment).
url = "https://pubchem.ncbi.nlm.nih.gov/compound/12345"
# Bound the wait so a stalled connection cannot hang the notebook cell.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of saving an error page as if it were data.
response.raise_for_status()

# Save the returned HTML to a text file.
with open("pubchem_page.txt", "w", encoding="utf-8") as file:
    file.write(response.text)


In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch the PubChem compound page.
url = "https://pubchem.ncbi.nlm.nih.gov/compound/12345"
# Bound the wait so a stalled connection cannot hang the notebook cell.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than parsing an error page.
response.raise_for_status()

# Parse the returned HTML.
soup = BeautifulSoup(response.text, 'html.parser')

# Find the <th> element whose text is exactly 'Molecular Weight'.
# `string=` is the current spelling of the argument formerly named `text=`.
# NOTE(review): if the page fills in its content client-side, the raw HTML
# will not contain this cell and the "not found" branch is taken — confirm.
mw_label = soup.find('th', string='Molecular Weight')

if mw_label is not None:
    # The value is expected in the <td> that follows the label cell.
    mw_value = mw_label.find_next_sibling('td')
    if mw_value is not None:
        print('Molecular Weight:', mw_value.text.strip())
    else:
        print('Molecular Weight 값을 찾을 수 없습니다.')
else:
    print('Molecular Weight 정보를 찾을 수 없습니다.')


In [None]:
import requests
import json

compound_name = "glucose"

# Ask the PubChem PUG REST API for the compound's properties as JSON.
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/property/MolecularFormula,MolecularWeight,CanonicalSMILES,InChI,InChIKey/JSON"
# Bound the wait so a stalled connection cannot hang the notebook cell.
response = requests.get(url, timeout=30)
# Surface HTTP errors (e.g. unknown name) instead of failing later on a
# missing 'PropertyTable' key.
response.raise_for_status()

# Decode the JSON body into a Python dict.
data = response.json()

# The first entry of the property table holds the requested compound.
properties = data['PropertyTable']['Properties'][0]

# Print the compound's properties.
print('분자식:', properties['MolecularFormula'])
print('분자량:', properties['MolecularWeight'])
# NOTE(review): PubChem reportedly returns this value under
# 'ConnectivitySMILES' in newer responses even when 'CanonicalSMILES' is
# requested — confirm. The lookup accepts either name and still raises
# KeyError if neither is present.
print('Canonical SMILES:', properties['CanonicalSMILES' if 'CanonicalSMILES' in properties else 'ConnectivitySMILES'])
print('InChI:', properties['InChI'])
print('InChIKey:', properties['InChIKey'])



In [None]:
import requests
import json

smiles = "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O"

# Ask the PubChem PUG REST API for the compound's properties as JSON.
# The SMILES is interpolated into the URL path as written; this particular
# string contains no '/', '#' or '?', which would change how the URL is parsed.
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{smiles}/property/MolecularFormula,MolecularWeight,CanonicalSMILES,InChI,InChIKey/JSON"
# Bound the wait so a stalled connection cannot hang the notebook cell.
response = requests.get(url, timeout=30)
# Surface HTTP errors instead of failing later on a missing 'PropertyTable' key.
response.raise_for_status()

# Decode the JSON body into a Python dict.
data = response.json()

# The first entry of the property table holds the requested compound.
properties = data['PropertyTable']['Properties'][0]

# Print the compound's properties.
print('분자식:', properties['MolecularFormula'])
print('분자량:', properties['MolecularWeight'])
# NOTE(review): PubChem reportedly returns this value under
# 'ConnectivitySMILES' in newer responses even when 'CanonicalSMILES' is
# requested — confirm. The lookup accepts either name and still raises
# KeyError if neither is present.
print('Canonical SMILES:', properties['CanonicalSMILES' if 'CanonicalSMILES' in properties else 'ConnectivitySMILES'])
print('InChI:', properties['InChI'])
print('InChIKey:', properties['InChIKey'])


In [None]:
import requests
import json

compound_name = "ethanol"

# Ask the PubChem PUG REST API for the compound's properties as JSON.
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/property/MolecularFormula,MolecularWeight,CanonicalSMILES,InChI,InChIKey/JSON"
# Bound the wait so a stalled connection cannot hang the notebook cell.
response = requests.get(url, timeout=30)
# Surface HTTP errors (e.g. unknown name) instead of failing later on a
# missing 'PropertyTable' key.
response.raise_for_status()

# Decode the JSON body into a Python dict.
data = response.json()

# The first entry of the property table holds the requested compound.
properties = data['PropertyTable']['Properties'][0]

# Print the compound's properties.
print('분자식:', properties['MolecularFormula'])
print('분자량:', properties['MolecularWeight'])
# NOTE(review): PubChem reportedly returns this value under
# 'ConnectivitySMILES' in newer responses even when 'CanonicalSMILES' is
# requested — confirm. The lookup accepts either name and still raises
# KeyError if neither is present.
print('Canonical SMILES:', properties['CanonicalSMILES' if 'CanonicalSMILES' in properties else 'ConnectivitySMILES'])
print('InChI:', properties['InChI'])
print('InChIKey:', properties['InChIKey'])

In [None]:
!pip install pubchempy==1.0.4

In [None]:
import pubchempy as pcp

# Look up glucose by name through PubChemPy and keep the first match.
compound = pcp.get_compounds('glucose', 'name')[0]

# Print the descriptors one per line, in a fixed order.
print(
    *(
        getattr(compound, attribute)
        for attribute in (
            'molecular_formula',
            'molecular_weight',
            'canonical_smiles',
            'isomeric_smiles',
            'inchi',
            'inchikey',
            'iupac_name',
        )
    ),
    sep='\n',
)


In [None]:
import pubchempy as pcp

# Names of the compounds to look up on PubChem.
compounds = ['glucose', 'ethanol', 'aspirin', 'caffeine', 'ibuprofen']

# Property tables keyed by the queried compound name.
results = {}

for compound_name in compounds:
    matches = pcp.get_compounds(compound_name, 'name')

    # A name with no hits is simply left out of the results.
    if not matches:
        continue

    # Only the first hit is used.
    compound = matches[0]
    results[compound_name] = {
        'Molecular Formula': compound.molecular_formula,
        'Molecular Weight': compound.molecular_weight,
        'Canonical SMILES': compound.canonical_smiles,
        'IUPAC Name': compound.iupac_name,
    }

# Report: the compound name, its indented properties, then a blank line.
for compound_name, properties in results.items():
    report_lines = [f'{compound_name}:']
    report_lines.extend(
        f'  {property_name}: {value}'
        for property_name, value in properties.items()
    )
    print('\n'.join(report_lines))
    print()



In [None]:
import pubchempy as pcp
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import pandas as pd
import time

# Resolve acetaminophen to its PubChem CID, fetch the record and build the
# RDKit molecule used as the similarity reference.
acetaminophen_cid = pcp.get_cids('acetaminophen', 'name')[0]
acetaminophen = pcp.Compound.from_cid(acetaminophen_cid)
acetaminophen_mol = Chem.MolFromSmiles(acetaminophen.isomeric_smiles)

# The reference fingerprint never changes, so compute it once instead of on
# every loop iteration.
fp1 = AllChem.GetMorganFingerprint(acetaminophen_mol, 2)

similar_smiles = []

# Scan a window of CIDs around the reference compound. The lower bound is
# clamped to 1: if the reference CID is below 5000 the unclamped window would
# start at or below zero, and each such value would cost a sleep plus a
# request that can only fail.
first_cid = max(1, acetaminophen_cid - 5000)
for cid in range(first_cid, acetaminophen_cid + 5000):
    time.sleep(0.1)  # Insert delay to prevent overloading the PubChem servers
    try:
        compound = pcp.Compound.from_cid(cid)
        mol = Chem.MolFromSmiles(compound.isomeric_smiles)
        if mol is not None:
            fp2 = AllChem.GetMorganFingerprint(mol, 2)
            similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
            if similarity >= 0.2:  # similarity threshold (the book uses 0.7)
                similar_smiles.append(compound.isomeric_smiles)
                if len(similar_smiles) >= 10:  # cap at 10 compounds (the book uses 100)
                    break
    except Exception:
        # Best-effort scan: a CID that cannot be fetched or parsed is skipped
        # so that one bad record does not abort the whole window.
        continue

# Write the collected SMILES strings to a single-column CSV file.
pd.DataFrame(similar_smiles, columns=['SMILES']).to_csv('drug.csv', index=False)

In [None]:
import requests
import time
from urllib.parse import quote
import pandas as pd
from rdkit.Chem import PandasTools

# Submit a similarity search to PubChem.
def query_pubchem_for_similar_compounds(smiles, threshold=60, n_records=400, timeout=30):
    """Start a PubChem similarity search and return its job key.

    Args:
        smiles: Query structure as a SMILES string.
        threshold: Value sent as the ``Threshold`` URL parameter.
        n_records: Value sent as the ``MaxRecords`` URL parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``ListKey`` found under ``Waiting`` in the JSON reply.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
        KeyError: If the reply has no ``Waiting``/``ListKey`` entry.
    """
    # quote() percent-encodes the SMILES but leaves "/" alone; "/" is then
    # swapped for "." so it is not read as a URL path separator.
    # NOTE(review): this rewrites any "/" in the query SMILES — confirm
    # that is acceptable for the structures being searched.
    escaped_smiles = quote(smiles).replace("/", ".")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/{escaped_smiles}/JSON?Threshold={threshold}&MaxRecords={n_records}"
    # Bound the wait so a stalled connection cannot hang the caller.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    key = r.json()["Waiting"]["ListKey"]
    return key

# Poll the job status and download the result once it is ready.
def check_and_download(key, attempts=30, timeout=30):
    """Poll a PubChem list key until its CIDs are available.

    Args:
        key: Job key of a previously submitted search.
        attempts: Maximum number of polls. Zero, negative or ``None`` means
            no poll is made and ``ValueError`` is raised immediately.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The ``CID`` list found under ``IdentifierList`` in the JSON reply.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
        ValueError: If no result appeared within ``attempts`` polls.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{key}/cids/JSON"
    print(f"작업 {key}을(를) URL {url}에서 조회 중...", end="")
    # Count down on a local copy; comparing with "> 0" (rather than relying
    # on truthiness) keeps a negative value from looping forever.
    remaining = attempts or 0
    while remaining > 0:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        response = r.json()
        if "IdentifierList" in response:
            return response["IdentifierList"]["CID"]
        remaining -= 1
        print(".", end="")
        # No point in waiting after the final unsuccessful poll.
        if remaining > 0:
            time.sleep(10)
    raise ValueError(f"작업 키: {key}에 대한 일치 항목을 찾을 수 없습니다.")

# Fetch SMILES strings for a list of PubChem CIDs.
def smiles_from_pubchem_cids(cids, timeout=30):
    """Return the SMILES string PubChem reports for each given CID.

    Args:
        cids: Iterable of PubChem compound IDs.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list with one SMILES string per entry of the returned property
        table; an empty list when ``cids`` is empty.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
        KeyError: If an entry carries neither SMILES key.
    """
    cids = list(cids)
    # An empty ID list would produce a malformed URL; there is nothing to ask.
    if not cids:
        return []
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{','.join(map(str, cids))}/property/CanonicalSMILES/JSON"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # NOTE(review): PubChem reportedly returns this column as
    # 'ConnectivitySMILES' in newer responses — confirm. Either name is accepted.
    return [
        item["CanonicalSMILES"] if "CanonicalSMILES" in item else item["ConnectivitySMILES"]
        for item in r.json()["PropertyTable"]["Properties"]
    ]

# Acetaminophen as a SMILES string; this is the query structure.
acetaminophen_smiles = 'CC(=O)NC1=CC=C(C=C1)O'

# Submit the similarity search to PubChem (threshold 90, at most 100 records)
# and keep the job key it returns.
job_key = query_pubchem_for_similar_compounds(acetaminophen_smiles, threshold=90, n_records=100)

# Poll the job and download the matching CIDs once they are ready.
similar_cids = check_and_download(job_key)

# Look up the SMILES string for each returned CID.
similar_smiles = smiles_from_pubchem_cids(similar_cids)

# Collect the results in a DataFrame, then have RDKit's PandasTools add a
# molecule column built from the "smiles" column.
similar_compounds_df = pd.DataFrame({"smiles": similar_smiles, "CIDs": similar_cids})
PandasTools.AddMoleculeColumnToFrame(similar_compounds_df, smilesCol="smiles")

# Save the DataFrame as a CSV file.
# NOTE(review): the molecule column added above is written out as well —
# confirm its text form is useful in the CSV.
similar_compounds_df.to_csv("drug1.csv", index=False)

In [None]:
import requests
import time
from urllib.parse import quote
import pandas as pd
from rdkit.Chem import PandasTools, Lipinski, MolFromSmiles

class PubChemSimilaritySearcher:
    """Search PubChem for compounds similar to a query and filter them.

    The search is submitted through the PUG REST ``similarity`` endpoint,
    polled via its list key, resolved to SMILES strings and finally reduced
    to the compounds that satisfy all four limits checked by
    ``pass_lipinski``.
    """

    # Seconds to wait for each HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Seconds to sleep between two polls of a pending job.
    POLL_INTERVAL = 10

    def __init__(self, smiles, threshold=60, n_records=400, attempts=30):
        """Store the query SMILES and the search settings.

        Args:
            smiles: Query structure as a SMILES string.
            threshold: Value sent as the ``Threshold`` URL parameter.
            n_records: Value sent as the ``MaxRecords`` URL parameter.
            attempts: Maximum number of polls per ``check_and_download`` call.
        """
        self.smiles = smiles
        self.threshold = threshold
        self.n_records = n_records
        self.attempts = attempts

    def _similarity_url(self):
        """Build the URL that submits the similarity search."""
        # quote() leaves "/" alone; it is swapped for "." so it is not read
        # as a URL path separator.
        # NOTE(review): this rewrites any "/" in the query SMILES — confirm.
        escaped_smiles = quote(self.smiles).replace("/", ".")
        # Adjacent literals are joined without any whitespace, unlike a
        # backslash continuation inside one literal, which would embed the
        # next line's indentation in the URL.
        return (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/"
            f"{escaped_smiles}/JSON?Threshold={self.threshold}&MaxRecords={self.n_records}"
        )

    def query(self):
        """Submit the search and return the job's ``ListKey``.

        Raises:
            requests.HTTPError: If PubChem answers with an error status.
            KeyError: If the reply has no ``Waiting``/``ListKey`` entry.
        """
        r = requests.get(self._similarity_url(), timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()
        return r.json()["Waiting"]["ListKey"]

    def check_and_download(self, key):
        """Poll the job ``key`` until its CID list is available.

        Returns:
            The ``CID`` list found under ``IdentifierList`` in the reply.

        Raises:
            requests.HTTPError: If PubChem answers with an error status.
            ValueError: If no result appeared within ``self.attempts`` polls.
        """
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{key}/cids/JSON"
        print(f"Querying for job {key} at URL {url}...", end="")
        # Count down on a local copy so the configured budget on the instance
        # is not used up and every call starts with the full number of polls.
        remaining = self.attempts or 0
        while remaining > 0:
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
            response = r.json()
            if "IdentifierList" in response:
                return response["IdentifierList"]["CID"]
            remaining -= 1
            print(".", end="")
            # No point in waiting after the final unsuccessful poll.
            if remaining > 0:
                time.sleep(self.POLL_INTERVAL)
        raise ValueError(f"Could not find matches for job key: {key}")

    def smiles_from_pubchem_cids(self, cids):
        """Return the SMILES string PubChem reports for each given CID.

        An empty ``cids`` yields an empty list without contacting PubChem.

        Raises:
            requests.HTTPError: If PubChem answers with an error status.
            KeyError: If an entry carries neither SMILES key.
        """
        cids = list(cids)
        # An empty ID list would produce a malformed URL.
        if not cids:
            return []
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{','.join(map(str, cids))}/property/CanonicalSMILES/JSON"
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()
        # NOTE(review): PubChem reportedly returns this column as
        # 'ConnectivitySMILES' in newer responses — confirm. Either is accepted.
        return [
            item["CanonicalSMILES"] if "CanonicalSMILES" in item else item["ConnectivitySMILES"]
            for item in r.json()["PropertyTable"]["Properties"]
        ]

    def pass_lipinski(self, smiles):
        """Return True when ``smiles`` parses and meets all four limits.

        The limits are: at most 5 H-bond donors, at most 10 H-bond acceptors,
        exact molecular weight at most 500 and Crippen logP at most 5. A
        SMILES that RDKit cannot parse yields False.
        """
        # Import the descriptor module directly instead of reaching it
        # through whatever the Lipinski module happens to import.
        from rdkit.Chem import rdMolDescriptors

        mol = MolFromSmiles(smiles)
        if mol is None:
            return False
        return (
            Lipinski.NumHDonors(mol) <= 5
            and Lipinski.NumHAcceptors(mol) <= 10
            and rdMolDescriptors.CalcExactMolWt(mol) <= 500
            # CalcCrippenDescriptors returns a tuple; index 0 is compared
            # against the logP limit.
            and rdMolDescriptors.CalcCrippenDescriptors(mol)[0] <= 5
        )

    def search_similar_compounds(self):
        """Run the whole pipeline and return the filtered DataFrame.

        Returns:
            A DataFrame with the columns ``smiles`` and ``CIDs`` plus the
            molecule column added by ``PandasTools.AddMoleculeColumnToFrame``,
            restricted to rows that pass ``pass_lipinski``.
        """
        job_key = self.query()
        similar_cids = self.check_and_download(job_key)
        similar_smiles = self.smiles_from_pubchem_cids(similar_cids)
        similar_compounds_df = pd.DataFrame({"smiles": similar_smiles, "CIDs": similar_cids})
        # Cast to bool so that an empty result still yields a boolean mask.
        keep = similar_compounds_df["smiles"].apply(self.pass_lipinski).astype(bool)
        # Copy the selection so the column added next goes into an
        # independent frame rather than a slice of the unfiltered one.
        similar_compounds_df = similar_compounds_df[keep].copy()
        PandasTools.AddMoleculeColumnToFrame(similar_compounds_df, smilesCol="smiles")
        return similar_compounds_df

# Acetaminophen as a SMILES string; this is the query structure.
acetaminophen_smiles = 'CC(=O)NC1=CC=C(C=C1)O'

# Create a searcher for compounds similar to acetaminophen
# (threshold 90, at most 20 records).
searcher = PubChemSimilaritySearcher(acetaminophen_smiles, threshold=90, n_records=20)

# Run the search; the result is the DataFrame returned by the searcher.
similar_compounds_df = searcher.search_similar_compounds()

# Save the DataFrame as a CSV file.
similar_compounds_df.to_csv("drug2.csv", index=False)


In [None]:
import requests
import time
from urllib.parse import quote
import pandas as pd
from rdkit.Chem import PandasTools

# Submit a similarity search to PubChem.
def query_pubchem_for_similar_compounds(smiles, threshold=60, n_records=400, timeout=30):
    """Start a PubChem similarity search and return its job key.

    Args:
        smiles: Query structure as a SMILES string.
        threshold: Value sent as the ``Threshold`` URL parameter.
        n_records: Value sent as the ``MaxRecords`` URL parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``ListKey`` found under ``Waiting`` in the JSON reply.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
        KeyError: If the reply has no ``Waiting``/``ListKey`` entry.
    """
    # quote() percent-encodes the SMILES but leaves "/" alone; "/" is then
    # swapped for "." so it is not read as a URL path separator.
    # NOTE(review): this rewrites any "/" in the query SMILES — confirm
    # that is acceptable for the structures being searched.
    escaped_smiles = quote(smiles).replace("/", ".")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/{escaped_smiles}/JSON?Threshold={threshold}&MaxRecords={n_records}"
    # Bound the wait so a stalled connection cannot hang the caller.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    key = r.json()["Waiting"]["ListKey"]
    return key

# Poll the job status and download the result once it is ready.
def check_and_download(key, attempts=30, timeout=30):
    """Poll a PubChem list key until its CIDs are available.

    Args:
        key: Job key of a previously submitted search.
        attempts: Maximum number of polls. Zero, negative or ``None`` means
            no poll is made and ``ValueError`` is raised immediately.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The ``CID`` list found under ``IdentifierList`` in the JSON reply.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
        ValueError: If no result appeared within ``attempts`` polls.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{key}/cids/JSON"
    print(f"작업 {key}을(를) URL {url}에서 조회 중...", end="")
    # Count down on a local copy; comparing with "> 0" (rather than relying
    # on truthiness) keeps a negative value from looping forever.
    remaining = attempts or 0
    while remaining > 0:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        response = r.json()
        if "IdentifierList" in response:
            return response["IdentifierList"]["CID"]
        remaining -= 1
        print(".", end="")
        # No point in waiting after the final unsuccessful poll.
        if remaining > 0:
            time.sleep(10)
    raise ValueError(f"작업 키: {key}에 대한 일치 항목을 찾을 수 없습니다.")

# Fetch SMILES strings for a list of PubChem CIDs.
def smiles_from_pubchem_cids(cids, timeout=30):
    """Return the SMILES string PubChem reports for each given CID.

    Args:
        cids: Iterable of PubChem compound IDs.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list with one SMILES string per entry of the returned property
        table; an empty list when ``cids`` is empty.

    Raises:
        requests.HTTPError: If PubChem answers with an error status.
        KeyError: If an entry carries neither SMILES key.
    """
    cids = list(cids)
    # An empty ID list would produce a malformed URL; there is nothing to ask.
    if not cids:
        return []
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{','.join(map(str, cids))}/property/CanonicalSMILES/JSON"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # NOTE(review): PubChem reportedly returns this column as
    # 'ConnectivitySMILES' in newer responses — confirm. Either name is accepted.
    return [
        item["CanonicalSMILES"] if "CanonicalSMILES" in item else item["ConnectivitySMILES"]
        for item in r.json()["PropertyTable"]["Properties"]
    ]

# Acetaminophen as a SMILES string; this is the query structure.
acetaminophen_smiles = 'CC(=O)NC1=CC=C(C=C1)O'

# Submit the similarity search to PubChem (threshold 90, at most 10 records)
# and keep the job key it returns.
job_key = query_pubchem_for_similar_compounds(acetaminophen_smiles, threshold=90, n_records=10)

# Poll the job and download the matching CIDs once they are ready.
similar_cids = check_and_download(job_key)

# Look up the SMILES string for each returned CID.
similar_smiles = smiles_from_pubchem_cids(similar_cids)

# Collect the results in a DataFrame, then have RDKit's PandasTools add a
# molecule column built from the "smiles" column.
similar_compounds_df = pd.DataFrame({"smiles": similar_smiles, "CIDs": similar_cids})
PandasTools.AddMoleculeColumnToFrame(similar_compounds_df, smilesCol="smiles")

# Save the DataFrame as a CSV file.
# NOTE(review): an earlier cell of this notebook also writes "drug1.csv";
# running this cell overwrites that file — confirm the name is intended.
similar_compounds_df.to_csv("drug1.csv", index=False)