# Expanded Chemical Space Exploration

Here we analyse the chemical space associated with the CHEESE similarity search results.

In [19]:
import pandas as pd
from tqdm import tqdm
import numpy as np

# Load the CHEESE similarity search results written to the results folder.
df = pd.read_csv("../results/cheese_search.csv")

# Values used further down to slice the results by the `database`,
# `query_category` and `search_type` columns, respectively.
databases = [
    "zinc15",
    "enamine-real",
]
categories = [
    "natural",
    "synthetic",
]
search_types = [
    "consensus",
    "morgan",
    "espsim_shape",
    "espsim_electrostatic",
]

In [24]:
# Get aggregate statistics
#
# For every (database, query category, search type) combination, summarise the
# `score` column of the matching rows and write one row per combination to
# ../results/cheese_search_aggregate.csv. Combinations with no matching rows
# are still emitted (count 0, NaN statistics) so the table layout is stable.

R = []
for database in databases:
    # Each filter is applied once per loop level instead of being recomputed
    # inside the innermost loop.
    df_database = df[df["database"] == database]
    for category in categories:
        df_category = df_database[df_database["query_category"] == category]
        for search_type in search_types:
            df_ = df_category[df_category["search_type"] == search_type]
            scores = df_["score"]
            r = [
                database,
                category,
                search_type,
                df_.shape[0],  # number of rows, including rows with a missing score
                scores.mean(),
                scores.std(),
                scores.median(),
                # Series.quantile skips NaN like the other pandas reducers here
                # and returns NaN for an empty subset; it uses the same default
                # linear interpolation as np.percentile.
                scores.quantile(0.25),
                scores.quantile(0.75),
                scores.min(),
                scores.max(),
            ]
            R.append(r)

dr = pd.DataFrame(
    R,
    columns=[
        "database",
        "category",
        "search_type",
        "counts",
        "mean",
        "std",
        "median",
        "perc_25",
        "perc_75",
        "min",
        "max",
    ],
)

dr.to_csv("../results/cheese_search_aggregate.csv", index=False)