In [1]:
import pandas as pd
import os

# Reference score for each tdcommons benchmark. The comparison cells below
# report these values under the "polaris" column.
benchmarks = {
    "tdcommons/bioavailability-ma": 0.729,
    "tdcommons/hia-hou": 0.989,
    "tdcommons/pgp-broccatelli": 0.954,
    "tdcommons/bbb-martins": 0.933,
    "tdcommons/cyp2c9-veith": 0.895,
    "tdcommons/cyp2d6-veith": 0.801,
    "tdcommons/cyp3a4-veith": 0.919,
    "tdcommons/cyp2c9-substrate-carbonmangels": 0.478,
    "tdcommons/cyp2d6-substrate-carbonmangels": 0.719,
    "tdcommons/cyp3a4-substrate-carbonmangels": 0.696,
    "tdcommons/herg": 0.891,
    "tdcommons/ames": 0.868,
    "tdcommons/dili": 0.933,
}




# LazyQSAR v2

Compare results with and without RDKIT as a base descriptor. The possible options are:

**No RDKIT** (v1):
- Fast: morgan
- Default: chemeleon
- Slow: morgan + chemeleon

**RDKIT** (v2):
- Fast: morgan + rdkit
- Default: chemeleon + rdkit
- Slow: morgan + chemeleon + rdkit

**Only RDKIT** (for testing purposes)
- Fast: rdkit

In [None]:
# Build the LazyQSAR v2 comparison table: one row per benchmark, one column
# for the reference score plus one column per (config, mode) summary file
# found on disk, then write it to <base_dir>/lazyqsar_comparison.csv.
base_dir = os.path.join("..", "results", "lazyqsar_v2")

# Descriptor configuration -> modes whose summary file is looked up.
configs = {
    "rdkit": ["fast", "default", "slow"],
    "no_rdkit": ["fast", "default", "slow"],
    "only_rdkit": ["fast"],
}

# Each row starts with the reference score taken from `benchmarks`.
scores = {name: [reference] for name, reference in benchmarks.items()}
headers = ["polaris"]

for config, modes in configs.items():
    for mode in modes:
        path = os.path.join(base_dir, f"lazyqsar_{config}", mode, "summary_scores.csv")
        if not os.path.exists(path):
            # Report the column that is left out instead of dropping it silently.
            print(f"Skipping {config}_{mode}: {path} not found")
            continue
        headers.append(f"{config}_{mode}")
        # Separate name for the per-file frame so it cannot be confused with
        # the final table `df` built after the loop.
        summary = pd.read_csv(path).set_index("benchmark")
        print(summary.head())
        for name, row in scores.items():
            # A benchmark missing from this summary gets NaN so every row
            # keeps exactly one value per header.
            row.append(summary.loc[name, "score"] if name in summary.index else float("nan"))

df = pd.DataFrame.from_dict(scores, orient="index", columns=headers)
df.index.name = "benchmark"
df = df.reset_index()

# to_csv does not create missing directories; make sure the target exists.
os.makedirs(base_dir, exist_ok=True)
out_path = os.path.join(base_dir, "lazyqsar_comparison.csv")
df.to_csv(out_path, index=False)


# LazyQSAR v2.2

Compare the inclusion of CDDD descriptors in:
- Fast: morgan + rdkit
- Default: chemeleon + rdkit + cddd
- Slow: morgan + chemeleon + rdkit + cddd

And also:
- cddd_only (used as "fast" in the comparison, instead of morgan+rdkit)

In [2]:
# Build the v2 vs v2.2 comparison table: one row per benchmark, one column
# for the reference score plus one column per (version, mode) summary file
# found on disk, then write it under ../results/lazyqsar_v2.2.
dfs = []  # NOTE(review): never populated in this cell; kept in case a later cell references it

# Each row starts with the reference score taken from `benchmarks`.
scores = {name: [reference] for name, reference in benchmarks.items()}
headers = ["polaris"]

for mode in ["fast", "default", "slow"]:
    for version in ["v2", "v2.2"]:
        base_dir = os.path.join("..", "results", f"lazyqsar_{version}")
        if version == "v2":
            # For v2 the summary is read from the lazyqsar_rdkit subfolder.
            path = os.path.join(base_dir, "lazyqsar_rdkit", mode, "summary_scores.csv")
        else:
            path = os.path.join(base_dir, mode, "summary_scores.csv")

        if not os.path.exists(path):
            # Report the column that is left out instead of dropping it silently.
            print(f"Skipping {version}_{mode}: {path} not found")
            continue
        headers.append(f"{version}_{mode}")
        # Separate name for the per-file frame so it cannot be confused with
        # the final table `df` built after the loop.
        summary = pd.read_csv(path).set_index("benchmark")
        print(summary.head())
        for name, row in scores.items():
            # A benchmark missing from this summary gets NaN so every row
            # keeps exactly one value per header.
            row.append(summary.loc[name, "score"] if name in summary.index else float("nan"))

df = pd.DataFrame.from_dict(scores, orient="index", columns=headers)
df.index.name = "benchmark"
df = df.reset_index()

out_path = os.path.join("..", "results", "lazyqsar_v2.2", "lazyqsar_comparison.csv")
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df.to_csv(out_path, index=False)

                                 score
benchmark                             
tdcommons/bioavailability-ma  0.682075
tdcommons/hia-hou             0.996296
tdcommons/pgp-broccatelli     0.921221
tdcommons/bbb-martins              NaN
tdcommons/cyp2c9-veith        0.746172
                                 score
benchmark                             
tdcommons/bioavailability-ma  0.626372
tdcommons/hia-hou             0.823045
tdcommons/pgp-broccatelli     0.902293
tdcommons/bbb-martins         0.859619
tdcommons/cyp2c9-veith        0.758861
                                 score
benchmark                             
tdcommons/bioavailability-ma  0.733289
tdcommons/hia-hou             0.991770
tdcommons/pgp-broccatelli     0.904892
tdcommons/bbb-martins              NaN
tdcommons/cyp2c9-veith        0.751786
                                 score
benchmark                             
tdcommons/bioavailability-ma  0.693715
tdcommons/hia-hou             0.979012
tdcommons/pgp-broccatelli