# Prepare workshop datasets

- eos9yui - Natural product score
- eos6oli - Solubility
- eos4rta - MMV
- eos7yti - OSM
- eos4e41 - ChemProp antibiotic
- eos9ei3 - Synthetic accessibility
- eos3le9 - HepG2 cytotoxicity
- eos4tcc - Bayes hERG
- eos2l0q - Schistosomiasis

In [None]:
import pandas as pd
import numpy as np
import os

# Base directory for every dataset file read or written by this notebook.
# The path is relative, so it is resolved against the current working directory.
DATAPATH = "../data/m1_datasets"

In [None]:
from ersilia.utils.identifiers.compound import CompoundIdentifier

# Shared compound-identifier helper instance.
# NOTE(review): not referenced by any of the cells visible here -- confirm it is still needed.
ci = CompoundIdentifier()

def model_filename(source, model_id):
    """Return the path of the precalculated predictions CSV for a model.

    The file is located in the ``precalculations`` folder under ``DATAPATH``
    and is named ``<source>_50k_<model_id>.csv``.
    """
    precalc_dir = os.path.join(DATAPATH, "precalculations")
    basename = "{0}_50k_{1}.csv".format(source, model_id)
    return os.path.join(precalc_dir, basename)

def _head_tail_keys(df, column, n):
    """Return the keys of the ``n`` highest and ``n`` lowest rows of ``column``."""
    ranked = df.sort_values(by=column, ascending=False).reset_index(drop=True)
    return set(ranked.head(n)["key"]) | set(ranked.tail(n)["key"])


def select_compounds(source, random_state=None):
    """Select a subset of compounds for ``source`` and write it to disk.

    The selection is built in two steps:

    1. Take the 800 compounds with the highest ``NF54_IC50_72h_1uM`` value
       (model eos4rta) and keep only those that also fall in the head or the
       tail of the ranking of four other models (eos6oli, eos4e41, eos9ei3,
       eos2l0q).
    2. Top up with a random sample of compounds, 1.8 times the size of the
       set obtained in step 1 (capped at the number of available rows).

    The result is written to ``<DATAPATH>/<source>_selected.csv`` with the
    columns ``inchikey`` and ``smiles``. The set sizes after step 1 and after
    step 2 are printed.

    Args:
        source: Either ``"chembl"`` or ``"coconut"``.
        random_state: Optional seed forwarded to ``Series.sample`` so that the
            random top-up is reproducible. ``None`` (the default) keeps the
            sampling unseeded.

    Raises:
        ValueError: If ``source`` is not one of the supported values.
    """
    if source not in ("chembl", "coconut"):
        # Explicit raise instead of assert: asserts are stripped under -O.
        raise ValueError(
            "source must be 'chembl' or 'coconut', got {0!r}".format(source)
        )

    top_n = 800
    # Head/tail size used for the remaining models depends on the source.
    extreme_n = 12000 if source == "coconut" else 17000

    # The eos4rta table supplies the ranking column, the reference key order
    # and the key/input pairs written out at the end, so it is read only once.
    reference = pd.read_csv(model_filename(source, "eos4rta"))
    inchikeys = list(reference["key"])
    top_ranked = (
        reference.sort_values(by="NF54_IC50_72h_1uM", ascending=False)
        .reset_index(drop=True)
        .head(top_n)
    )
    keys = set(top_ranked["key"])

    ranked_models = [
        ("eos6oli", "solubility"),
        ("eos4e41", "50uM_Inhibition"),
        ("eos9ei3", "sa_score"),
        ("eos2l0q", "NTS_90perc_10uM"),
    ]
    for model_id, column in ranked_models:
        df = pd.read_csv(model_filename(source, model_id))
        if model_id == "eos2l0q":
            # The keys of this table are replaced by the eos4rta keys.
            # NOTE(review): this presumes both files list the compounds in
            # the same row order -- confirm.
            df["key"] = inchikeys
        keys = keys.intersection(_head_tail_keys(df, column, extreme_n))
    print(len(keys))

    pairs = reference[["key", "input"]]
    # Clamp the request: sampling more rows than exist raises ValueError.
    n_extra = min(int(len(keys) * 1.8), len(pairs))
    more_keys = set(pairs["key"].sample(n_extra, random_state=random_state))
    keys = keys.union(more_keys)
    print(len(keys))

    selected = pairs[pairs["key"].isin(keys)].reset_index(drop=True)
    selected = selected.rename(columns={"key": "inchikey", "input": "smiles"})
    selected.to_csv(
        os.path.join(DATAPATH, "{0}_selected.csv".format(source)), index=False
    )


# Build the selection file for each compound source, coconut first.
for _source in ("coconut", "chembl"):
    select_compounds(_source)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

# Load the eos9yui precalculations for ChEMBL. The path goes through
# model_filename so the file name carries the same "eos" prefix as every
# other precalculation file used in this notebook.
df = pd.read_csv(model_filename("chembl", "eos9yui"))

# plt.hist() needs the data to plot; calling it without arguments fails.
# NOTE(review): the score column name is not visible from this notebook, so
# the first numeric column is plotted -- replace with the explicit column
# name once confirmed.
scores = df.select_dtypes(include="number").iloc[:, 0].dropna()
plt.hist(scores)
plt.xlabel(scores.name)
plt.ylabel("Number of compounds")

In [None]:
def filter_predictions_by_selected(source, model_id):
    """Write the precalculated predictions of a model for the selected compounds only.

    Reads ``<DATAPATH>/<source>_selected.csv`` (column ``inchikey``) and the
    full precalculation file of ``model_id`` for ``source``, keeps the rows
    whose ``key`` is among the selected InChIKeys, and writes them to
    ``<DATAPATH>/precalculations/<source>_selected_<model_id>_predictions.csv``.

    Args:
        source: Compound source prefix used in the file names (e.g. ``"chembl"``).
        model_id: Model identifier used in the file names (e.g. ``"eos4rta"``).
    """
    # Both inputs live under DATAPATH (that is where select_compounds writes
    # the selection file); resolving them there keeps reads and writes on the
    # same directory tree regardless of the current working directory.
    sel_filename = os.path.join(DATAPATH, "{0}_selected.csv".format(source))
    full_filename = model_filename(source, model_id)

    keys = set(pd.read_csv(sel_filename)["inchikey"])
    df = pd.read_csv(full_filename)
    # NOTE(review): select_compounds overwrites the "key" column of the
    # eos2l0q table before using it -- confirm that file's own "key" column
    # is usable for this filter.
    df = df[df["key"].isin(keys)].reset_index(drop=True)

    out_filename = os.path.join(DATAPATH,"precalculations/{0}_selected_{1}_predictions.csv".format(source, model_id))
    df.to_csv(out_filename, index=False)

In [None]:
# Models whose precalculated predictions are filtered down to the selection.
models = [
    "eos4rta",
    "eos7yti",
    "eos4e41",
    "eos2l0q",
    "eos3le9",
    "eos4tcc",
    "eos6oli",
    "eos9yui",
    "eos9ei3",
]

# Only the ChEMBL source is processed here.
for m in models:
    filter_predictions_by_selected("chembl", m)