In [14]:
import pandas as pd
df = pd.read_csv('drug_names.csv')

In [15]:
import requests, time
# Function to fetch canonical SMILES
def get_canonical_smiles(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES/TXT"
    r = requests.get(url)
    if r.status_code == 200:
        return r.text.strip()
    else:
        return None

smiles = []
for cid in df["pubchem_id"]:
    smiles.append(get_canonical_smiles(cid))
    time.sleep(0.2)

df["canonical_smiles"] = smiles

In [16]:
df[pd.isna(df["canonical_smiles"])]

Unnamed: 0,pubchem_id,drug_name,canonical_smiles


In [17]:
df.to_csv("drug_smiles.csv", index=False)

In [18]:
#create fingerprints
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# df has columns: pubchem_id, canonical_smiles
drug_smiles = pd.read_csv("drug_smiles.csv")

df = drug_smiles.copy()

out_dir = "drug_fingerprints"
os.makedirs(out_dir, exist_ok=True)

def morgan_bits(smiles, n_bits, radius=2):
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        return np.full(n_bits, np.nan)  # keep shape even if invalid
    fp = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits)
    arr = np.zeros((n_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

sizes = [64, 128, 256, 512, 1024, 2048]

for n in sizes:
    print(f"computing {n}-bit fingerprints")
    # compute and stack into a DataFrame (rows = bit positions, cols = pubchem_id)
    fp_matrix = {}
    for pid, smi in zip(df["pubchem_id"], df["canonical_smiles"]):
        fp_matrix[pid] = morgan_bits(smi, n)
    fp_df = pd.DataFrame(fp_matrix)  # index 0..n-1, columns = pubchem_id
    fp_df.to_csv(f"{out_dir}/pubchem_id_to_demorgan_{n}_map.csv", index=False)


computing 64-bit fingerprints
computing 128-bit fingerprints
computing 256-bit fingerprints




computing 512-bit fingerprints
computing 1024-bit fingerprints
computing 2048-bit fingerprints
