In [50]:
import json
from urllib.parse import quote

import requests
from tqdm import tqdm

In [51]:
def get_cid(name: str, timeout: float = 30) -> str:
    """Look up a compound by name on PubChem and return its first CID.

    Args:
        name: Compound name to search for. It is percent-encoded before being
            placed in the URL path, so names containing '/', '#', '?' or
            spaces cannot change the structure of the request URL.
        timeout: Seconds to wait for PubChem before ``requests`` raises a
            timeout error instead of blocking forever.

    Returns:
        The first entry of ``IdentifierList.CID`` in the JSON response, or ""
        when the response carries no CID or is not valid JSON.
        Note: the entry is returned exactly as decoded from JSON (PubChem
        CIDs decode as ints), even though the annotation says ``str``.
    """
    encoded_name = quote(name, safe="")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/cids/json"
    response = requests.get(url, timeout=timeout)
    try:
        data = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page): treat as "not found".
        return ""
    if not isinstance(data, dict):
        return ""
    cids = data.get("IdentifierList", {}).get("CID")
    return cids[0] if cids else ""

def get_smiles(cid: str, timeout: float = 30) -> str:
    """Fetch the canonical SMILES string of a PubChem compound.

    Args:
        cid: PubChem compound id. A falsy value (such as the "" that a failed
            name lookup yields) short-circuits to "" without any request.
        timeout: Seconds to wait for PubChem before ``requests`` raises a
            timeout error instead of blocking forever.

    Returns:
        The SMILES string of the first property record, or "" when the CID is
        empty, the response is not JSON, or no SMILES property is present.
    """
    if not cid:
        # Nothing to look up; avoids a pointless request to ".../cid//property/...".
        return ""

    encoded_cid = quote(str(cid), safe="")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{encoded_cid}/property/CanonicalSMILES/json"
    response = requests.get(url, timeout=timeout)
    try:
        data = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page): treat as "not found".
        return ""
    if not isinstance(data, dict):
        return ""

    properties = data.get("PropertyTable", {}).get("Properties")
    if not properties:
        return ""

    record = properties[0]
    # NOTE(review): PubChem may label this property "ConnectivitySMILES" in
    # newer responses; accept either key rather than raising KeyError.
    # Confirm against the current PUG REST property table.
    return record.get("CanonicalSMILES") or record.get("ConnectivitySMILES") or ""
    
def search_similar(smiles: str, threshold: float = 90, max_records: int = 100, timeout: float = 30) -> list:
    """Run a PubChem 2D fast-similarity search and return the matching CIDs.

    Args:
        smiles: Query structure as a SMILES string. It is percent-encoded
            before being placed in the URL path, because SMILES characters
            such as '#' or '/' would otherwise be parsed as URL syntax
            (fragment marker / path separator).
        threshold: Value sent as the ``Threshold`` query argument.
        max_records: Value sent as the ``MaxRecords`` query argument
            (previously fixed at 100).
        timeout: Seconds to wait for PubChem before ``requests`` raises a
            timeout error instead of blocking forever.

    Returns:
        The ``IdentifierList.CID`` list from the JSON response, or [] when the
        SMILES is empty, the response is not JSON, or no CID list is present.
    """
    if not smiles:
        # An empty query cannot match anything; skip the request.
        return []

    encoded_smiles = quote(smiles, safe="")
    url = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastsimilarity_2d/smiles/"
        f"{encoded_smiles}/cids/JSON?Threshold={threshold}&MaxRecords={max_records}"
    )
    # Echo the request URL so the exact query can be reproduced by hand.
    print(url)
    response = requests.get(url, timeout=timeout)
    try:
        data = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page): treat as "no matches".
        return []
    if not isinstance(data, dict):
        return []

    cids = data.get("IdentifierList", {}).get("CID")
    return cids if cids else []
    
def get_kegg_id(name: str, timeout: float = 30) -> str:
    """Search KEGG COMPOUND for a name and return the id of the first hit.

    Args:
        name: Compound name to search for. It is percent-encoded before being
            placed in the URL path so that '/', '#', '?' or '%' in a name
            cannot alter the request URL.
        timeout: Seconds to wait for KEGG before ``requests`` raises a
            timeout error instead of blocking forever.

    Returns:
        The text before the first tab of the response body, stripped of
        surrounding whitespace (for example 'cpd:C07669'), or "" when the
        body is empty or the HTTP status signals an error.
    """
    # NOTE(review): '+' is deliberately left unencoded so the request matches
    # what was sent before; KEGG's find syntax appears to use '+' between
    # keywords. Confirm against the KEGG REST API documentation.
    encoded_name = quote(name, safe="+")
    url = f"https://rest.kegg.jp/find/compound/{encoded_name}"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Do not parse an error body as if it were a search result.
        return ""
    return response.text.split("\t")[0].strip()


In [52]:
# Walk one known drug through the lookup chain:
# name -> PubChem CID -> SMILES -> structurally similar compounds.
cid = get_cid("Glimepiride")
smiles = get_smiles(cid)

# 98 is passed as the similarity threshold for the search.
similar_structures = search_similar(smiles, threshold=98)
print(similar_structures)



https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastsimilarity_2d/smiles/CCC1=C(CN(C1=O)C(=O)NCCC2=CC=C(C=C2)S(=O)(=O)NC(=O)NC3CCC(CC3)C)C/cids/JSON?Threshold=98&MaxRecords=100
[3476, 25235580, 144291124, 142933950, 138524604, 124198231, 118689036, 70809341, 68091311, 68091306, 60040047, 56633356, 56620977, 56618785, 56614415, 25207715, 24744542, 25235584, 25235629, 25235716, 25235759, 25235762, 25235895, 25241307, 117588234]


In [53]:
# Spot-check the KEGG lookup for a single drug name; the bare expression
# makes the notebook display the returned id.
get_kegg_id("Glimepiride")

'cpd:C07669'

In [54]:
# Name of the folder for files generated by this notebook
# (presumably the Neo4j import directory — confirm).
output_folder = "neo4j"

In [55]:
# Enrich every drug record with its PubChem CID, SMILES and KEGG compound id,
# then write the enriched records back out as JSON Lines.
with open('drug_details_4.jsonl', 'r', encoding='utf-8') as f:
    # Materialise the lines so tqdm can show a total; drop blank lines,
    # which json.loads would reject.
    lines = [line for line in f if line.strip()]

records = []
for line in tqdm(lines):
    drug = json.loads(line)
    name = drug["name"]

    drug["cid"] = get_cid(name)
    # Look up the SMILES for THIS drug's CID. The previous code passed the
    # notebook-level `cid` left over from the Glimepiride demo cell, so every
    # record received the same SMILES. No CID means there is nothing to look up.
    drug["SMILES"] = get_smiles(drug["cid"]) if drug["cid"] else ""

    drug["kegg_id"] = get_kegg_id(name)

    records.append(json.dumps(drug))

# Join once instead of growing a string inside the loop; `content` is kept as
# a notebook-level name in case later cells read it.
content = "".join(record + "\n" for record in records)

with open(f'{output_folder}/drug_details_5.jsonl', 'w', encoding='utf-8') as f:
    f.write(content)
       

  0%|          | 0/499 [00:00<?, ?it/s]

100%|██████████| 499/499 [23:36<00:00,  2.84s/it]
