CHEESE API documentation (getting started): https://cheese-docs.deepmedchem.com/getting_started-api/

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
import time
from rdkit import Chem

# Load variables from the .env file one directory up into the process
# environment, then read the CHEESE API token from it.
# CHEESE_API_KEY is None when the variable is not set.
load_dotenv("../.env")
CHEESE_API_KEY = os.environ.get("CHEESE_API_KEY")


In [None]:
import requests


def _query_molecule(smiles, search_type, search_quality, n_neighbors, timeout=300):
    """Run a single CHEESE ``molsearch`` request and return the hits as a table.

    Parameters
    ----------
    smiles : str
        Query molecule as a SMILES string.
    search_type : str
        Forwarded as the ``search_type`` request parameter.
    search_quality : str
        Forwarded as the ``search_quality`` request parameter.
    n_neighbors : int
        Forwarded as the ``n_neighbors`` request parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 300).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``"neighbors"`` list, with columns
        ``query_smiles``, ``query_inchikey``, ``smiles``, ``identifier``,
        ``search_type``, ``score`` and ``database``, sorted by ``score`` in
        descending order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    requests.exceptions.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    KeyError
        If the JSON response lacks one of the expected fields.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # readable message instead of an opaque error from MolToInchiKey.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    inchikey = Chem.MolToInchiKey(mol)

    # NOTE(review): verify=False turns off TLS certificate verification while
    # a bearer token is being sent. It is kept to preserve current behaviour;
    # confirm whether the server certificate now validates and drop it if so.
    response = requests.get(
        "https://api.cheese.themama.ai/molsearch",
        params={
            "search_input": smiles,
            "search_type": search_type,
            "n_neighbors": n_neighbors,
            "search_quality": search_quality,
            "descriptors": False,
            "properties": False,
            "filter_molecules": True,
        },
        headers={"Authorization": f"Bearer {CHEESE_API_KEY}"},
        verify=False,
        timeout=timeout,
    )
    # Surface HTTP errors (bad key, rate limit, server error) directly rather
    # than as a JSON decoding error or a missing "neighbors" key further down.
    response.raise_for_status()
    data = response.json()

    rows = []
    for hit in data["neighbors"]:
        identifier = hit["zinc_id"]
        # The source database is derived from the identifier prefix.
        if identifier.startswith("ZINC"):
            database = "zinc15"
        elif identifier.startswith("ENAMINE"):
            database = "enamine-real"
        else:
            database = None
        # The score is always the hit's "Morgan Tanimoto" value, whatever
        # search_type was requested.
        rows.append([smiles, inchikey, hit["smiles"], identifier, search_type, hit["Morgan Tanimoto"], database])

    columns = ["query_smiles", "query_inchikey", "smiles", "identifier", "search_type", "score", "database"]
    df = pd.DataFrame(rows, columns=columns).sort_values("score", ascending=False).reset_index(drop=True)
    return df


def query_molecule(smiles, search_type="consensus", search_quality="very accurate", n_neighbors=100,
                   max_attempts=10, retry_delay=5):
    """Query CHEESE for one molecule, retrying until a full result set arrives.

    A result is accepted only when it contains exactly ``n_neighbors`` rows.
    Request errors and short results both trigger a retry after a pause.

    Parameters
    ----------
    smiles : str
        Query molecule as a SMILES string.
    search_type : str, optional
        Similarity type forwarded to the API (default ``"consensus"``).
    search_quality : str, optional
        Search quality forwarded to the API (default ``"very accurate"``).
    n_neighbors : int, optional
        Number of neighbours requested and required (default 100).
    max_attempts : int, optional
        Maximum number of requests to make (default 10).
    retry_delay : float, optional
        Seconds to wait between attempts (default 5).

    Returns
    -------
    pandas.DataFrame or None
        The result table, or ``None`` when no attempt produced exactly
        ``n_neighbors`` rows.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            df = _query_molecule(smiles, search_type=search_type, search_quality=search_quality, n_neighbors=n_neighbors)
        except Exception as exc:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit (e.g. interrupting the kernel)
            # still stop the retry loop.
            problem = f"Error ({exc!r})"
        else:
            if df.shape[0] == n_neighbors:
                return df
            problem = f"Got {df.shape[0]} neighbors instead of {n_neighbors}"

        if attempt < max_attempts:
            print(f"{problem}, retrying in {retry_delay} seconds")
            # Pause after short results too, so the API is not hit with
            # back-to-back requests.
            time.sleep(retry_delay)
        else:
            print(f"{problem}, giving up on {smiles} ({search_type}) after {max_attempts} attempts")
    return None


def query_molecule_all_similarities(smiles, search_quality="very accurate", n_neighbors=100):
    """Query CHEESE for one molecule with every supported similarity type.

    Runs ``query_molecule`` once for each of ``consensus``, ``morgan``,
    ``espsim_electrostatic`` and ``espsim_shape`` and stacks the results.

    Parameters
    ----------
    smiles : str
        Query molecule as a SMILES string.
    search_quality : str, optional
        Search quality forwarded to each query (default ``"very accurate"``).
    n_neighbors : int, optional
        Number of neighbours requested per search type (default 100).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-search-type tables with a fresh
        0..n-1 index. Search types whose query returned ``None`` are left out.

    Raises
    ------
    ValueError
        If every search type failed, so there is nothing to concatenate.
    """
    search_types = ["consensus", "morgan", "espsim_electrostatic", "espsim_shape"]
    dfs = []
    failed = []
    for search_type in search_types:
        print(smiles, search_type)
        result = query_molecule(smiles, search_type=search_type, search_quality=search_quality, n_neighbors=n_neighbors)
        # query_molecule returns None once its retries are exhausted.
        # pd.concat would drop such entries without notice, so track them.
        if result is None:
            failed.append(search_type)
        else:
            dfs.append(result)
    if failed:
        # NOTE(review): the returned table is then incomplete; run() still
        # writes it to disk and will treat the molecule as done afterwards.
        print("No results for", smiles, "with search types:", ", ".join(failed))
    df = pd.concat(dfs).reset_index(drop=True)
    return df


def run(smiles):
    """Fetch and cache the CHEESE results for one molecule.

    Results are written to ``../results/cheese/<InChIKey>.csv``. If that file
    already exists the molecule is skipped, so the function can be re-run
    safely after an interruption.

    Parameters
    ----------
    smiles : str
        Query molecule as a SMILES string.

    Returns
    -------
    None
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for unparsable input; skip the molecule
        # rather than aborting a whole batch on one bad entry.
        print("Invalid SMILES, skipping:", smiles)
        return
    inchikey = Chem.MolToInchiKey(mol)

    results_dir = os.path.join("..", "results", "cheese")
    file_name = os.path.join(results_dir, f"{inchikey}.csv")
    if os.path.exists(file_name):
        print("Already done for", inchikey)
        return

    # Create the output directory before the slow API calls, so a missing
    # directory cannot discard finished queries at write time.
    os.makedirs(results_dir, exist_ok=True)
    df = query_molecule_all_similarities(smiles)
    df.to_csv(file_name, index=False)

In [None]:
# Read the molecule table and keep its "smiles" column as a plain Python list.
molecules_path = "../data/all_molecules.csv"
smiles_list = pd.read_csv(molecules_path)["smiles"].tolist()

In [None]:
# Process every molecule in turn. run() returns early for molecules whose
# result file already exists, so this cell can be re-executed to resume.
for smiles in smiles_list:
    run(smiles)

In [13]:
## REMOVE CHEESE QUERIES FOR SARSCOV1 MOLECULES
# Compare the old and the new molecule tables and collect the InChIKeys that
# occur in only one of them. This cell only counts them; nothing is deleted.
import pandas as pd

df1 = pd.read_csv("../data/all_molecules_old.csv")  # old table
df2 = pd.read_csv("../data/all_molecules.csv")  # new table

# Unique InChIKeys of each table
inchikey_set1 = set(df1["inchikey"])
inchikey_set2 = set(df2["inchikey"])

# Keys exclusive to the old table (df1)
only_in_df1 = list(inchikey_set1.difference(inchikey_set2))

# Keys exclusive to the new table (df2)
only_in_df2 = list(inchikey_set2.difference(inchikey_set1))

# Report how many keys are exclusive to each side
print("Values present only in df1:", len(only_in_df1))
print("Values present only in df2:", len(only_in_df2))

Values present only in df1: 0
Values present only in df2: 0
