# Prepare workshop datasets

- eos9yui - Natural product score
- eos6oli - Solubility
- eos4rta - MMV
- eos7yti - OSM
- eos4e41 - ChemProp antibiotic
- eos9ei3 - Synthetic accessibility
- eos3le9 - HepG2 cytotoxicity
- eos4tcc - Bayes hERG
- eos2l0q - Schistosomiasis

In [32]:
import pandas as pd
import os
from ersilia.utils.identifiers.compound import CompoundIdentifier

# Ersilia compound-identifier helper. It is instantiated here but not
# referenced again in the visible cells.
ci = CompoundIdentifier()

def model_filename(source, model_id):
    """Return the relative path of a precalculated model-output CSV.

    The result is ``../data/precalculations/<source>_50k_<model_id>.csv``,
    assembled with the platform's path separator.
    """
    csv_name = "{0}_50k_{1}.csv".format(source, model_id)
    folder = os.path.join("..", "data", "precalculations")
    return os.path.join(folder, csv_name)

def _extreme_keys(df, column, n, both_ends=True):
    """Return the set of ``key`` values found at the extremes of ``column``.

    ``df`` is sorted by ``column`` in descending order. The keys of the first
    ``n`` rows are returned, plus the keys of the last ``n`` rows when
    ``both_ends`` is true.
    """
    ranked = df.sort_values(by=column, ascending=False)
    selected = set(ranked.head(n)["key"])
    if both_ends:
        selected |= set(ranked.tail(n)["key"])
    return selected


def select_compounds(source, n=8000, random_state=None):
    """Select a subset of compounds for ``source`` and write it to CSV.

    A compound is kept in the "core" selection when its key is, at the same
    time:

    * among the top ``n`` rows of the eos4rta table ranked by
      ``NF54_IC50_72h_1uM`` (top only), and
    * among the top ``n`` or bottom ``n`` rows of each of the eos6oli
      (``solubility``), eos4e41 (``50uM_Inhibition``), eos9ei3 (``sa_score``)
      and eos2l0q (``NTS_90perc_10uM``) tables.

    The core selection is then extended with a random sample of keys from the
    eos4rta table, sized at 1.8 times the core selection. The resulting rows
    are written to ``<source>_selected.csv`` in the current working directory
    with the columns ``inchikey`` and ``smiles``. The sizes of the core and of
    the extended selection are printed.

    Parameters
    ----------
    source : str
        Either ``"chembl"`` or ``"coconut"``.
    n : int, optional
        Number of rows taken from each end of every ranking (default 8000).
    random_state : optional
        Forwarded to ``pandas.Series.sample``. Pass a seed to make the random
        extension reproducible; the default ``None`` draws from the global
        random state.

    Raises
    ------
    AssertionError
        If ``source`` is not one of the supported values.
    """
    assert source in ["chembl", "coconut"]

    # The eos4rta table is both the first ranking and the source of the
    # key/SMILES pairs written out at the end, so it is read only once.
    reference = pd.read_csv(model_filename(source, "eos4rta"))
    inchikeys = list(reference["key"])

    keys_sets = [
        _extreme_keys(reference, "NF54_IC50_72h_1uM", n, both_ends=False)
    ]

    for model_id, column in (
        ("eos6oli", "solubility"),
        ("eos4e41", "50uM_Inhibition"),
        ("eos9ei3", "sa_score"),
    ):
        scores = pd.read_csv(model_filename(source, model_id))
        keys_sets.append(_extreme_keys(scores, column, n))

    # The eos2l0q keys are replaced by the eos4rta keys, row by row.
    # NOTE(review): this assumes both files list the same compounds in the
    # same order -- confirm against the precalculation files.
    scores = pd.read_csv(model_filename(source, "eos2l0q"))
    scores["key"] = inchikeys
    keys_sets.append(_extreme_keys(scores, "NTS_90perc_10uM", n))

    keys = set.intersection(*keys_sets)
    print(len(keys))

    compounds = reference[["key", "input"]]
    # Sampling without replacement fails when more rows are requested than
    # exist, so the request is capped at the table length.
    sample_size = min(int(len(keys) * 1.8), len(compounds))
    more_keys = set(
        compounds["key"].sample(sample_size, random_state=random_state)
    )
    keys = keys.union(more_keys)
    print(len(keys))

    selected = compounds[compounds["key"].isin(keys)].reset_index(drop=True)
    selected = selected.rename(columns={"key": "inchikey", "input": "smiles"})
    selected.to_csv("{0}_selected.csv".format(source), index=False)


# Build and save the selection for each supported source; each call prints
# two counts and writes "<source>_selected.csv".
select_compounds("coconut")
select_compounds("chembl")

177
528
125
374


In [1]:
import numpy as np



In [3]:
%matplotlib inline

import matplolib.pyplot as plt

df = pd.read_csv("precalculations/chembl_50k_9yui.csv")
plt.hist()

ModuleNotFoundError: No module named 'matplotlib'

In [4]:
!pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.8.0-cp310-cp310-macosx_11_0_arm64.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.1.1-cp310-cp310-macosx_11_0_arm64.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.4/232.4 kB[0m [31m702.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.42.1-cp310-cp310-macosx_10_9_universal2.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m236.1 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting kiwisolver>=1.0.1 (from matplotlib)
  Downloading kiwisolver-1.4.5-cp310-cp310-macosx_11_0_arm64.whl (66 kB)
[2K     [