In [2]:
import pandas as pd
import sys
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from IPython.display import display, HTML
from scipy.stats import spearmanr
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import zipfile
import random
import gzip
import sys
import h5py
import os

# Pandas display settings: show every column, never wrap wide frames and never
# truncate cell contents when a DataFrame is rendered.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
    ("display.width", 2000),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

# Inject CSS so that rendered DataFrame cells stay on a single line.
NOWRAP_STYLE = """
<style>
.dataframe td, .dataframe th {
    white-space: nowrap !important;
}
</style>
"""
display(HTML(NOWRAP_STYLE))

AttributeError: module 'numpy' has no attribute '__version__'

In [3]:
# Define root directory
# The script-style lookup based on __file__ is kept commented out; the notebook
# uses the current working directory as root instead.
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
# Make ../src importable; this must run before the `from default import ...` below.
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Load pathogen info
# The command-line argument is replaced by a hard-coded pathogen code in this notebook.
# pathogen_code = sys.argv[1]
pathogen_code = 'mtuberculosis'
df = pd.read_csv(os.path.join(CONFIGPATH, 'pathogens.csv'))
# Select the row(s) of pathogens.csv whose "code" equals the requested code
row = df.loc[df["code"].eq(pathogen_code)]
if row.empty: 
    raise SystemExit(f"Unknown code: {pathogen_code}")
# Value of the "pathogen" column for the first matching row
pathogen = row.iloc[0]["pathogen"]

print("Step 14")

# Define output directory
OUTPUT = os.path.join(root, "..", "output")

# Shared columns
# (used as the join key when merging the assay tables)
KEYS = ["assay_id", "activity_type", "unit"]

# Columns to take from datasets table
COLUMNS_DATASETS = ["equal", 'higher', 'lower', "target_type_curated_extra", "dataset_type", "cpds_qt", "min_", "p1", "p25", "p50", "p75", "p99", "max_", "pos_ql", "ratio_ql", "cpds_ql"]

NameError: name 'sys' is not defined

In [96]:
def load_all_gz_csvs_from_zip(zip_path):
    """Load every gzip-compressed CSV stored inside a ZIP archive.

    Parameters
    ----------
    zip_path : str | pathlib.Path
        Location of the ZIP archive to read.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One entry per archive member whose name ends with ``.csv.gz``,
        keyed by the member name exactly as listed in the archive.
        Members with any other suffix are ignored.
    """
    tables = {}
    with zipfile.ZipFile(zip_path, "r") as archive:
        gz_members = [member for member in archive.namelist() if member.endswith(".csv.gz")]
        for member in gz_members:
            with archive.open(member) as handle:
                # Each member is itself gzip-compressed, so pandas must decompress it.
                tables[member] = pd.read_csv(handle, compression="gzip")
    return tables

def get_all_results_from_individual_modeling(INDIVIDUAL_LM, LABELS=['A', 'B', 'C', 'D']):
    """Summarise individual-model results per label.

    For every label, only the rows of ``INDIVIDUAL_LM`` where the boolean
    column named after the label is true are inspected; the AUROC is read
    from the ``"<label>_AVG"`` column.

    Returns
    -------
    RESULTS : dict[str, dict[tuple, list]]
        Per label: (assay_id, activity_type, unit) -> [expert_cutoff, auroc]
        for the row with the highest AUROC, restricted to AUROC > 0.7.
        On ties the first row encountered is kept.
    CONSIDERED_ASSAYS : dict[str, set[tuple]]
        Per label: every (assay_id, activity_type, unit) key seen for that
        label, regardless of AUROC.
    """
    key_and_cutoff_columns = ["assay_id", "activity_type", "unit", "expert_cutoff"]
    RESULTS = {}
    CONSIDERED_ASSAYS = {}
    for LABEL in LABELS:
        best_per_key = {}
        seen_keys = set()
        selected = INDIVIDUAL_LM[INDIVIDUAL_LM[LABEL]][key_and_cutoff_columns + [f"{LABEL}_AVG"]]
        for record in selected.values:
            *key_parts, cutoff, score = record
            key = tuple(key_parts)
            seen_keys.add(key)
            # Rows at or below the threshold (including NaN scores) never enter RESULTS.
            if not score > 0.7:
                continue
            current_best = best_per_key.get(key)
            if current_best is None or score > current_best[1]:
                best_per_key[key] = [cutoff, score]
        RESULTS[LABEL] = best_per_key
        CONSIDERED_ASSAYS[LABEL] = seen_keys
    return RESULTS, CONSIDERED_ASSAYS

def where_considered(key, LABELS, CONSIDERED_ASSAYS):
    """Join with ';' the labels whose considered-set contains `key`; NaN when there are none."""
    hits = [LABEL for LABEL in LABELS if key in CONSIDERED_ASSAYS[LABEL]]
    return ";".join(hits) if hits else np.nan
    
def where_accepted(key, LABELS, ACCEPTED_ASSAYS):
    """Join with ';' the labels whose accepted-collection contains `key`; NaN when there are none."""
    hits = [LABEL for LABEL in LABELS if key in ACCEPTED_ASSAYS[LABEL]]
    return ";".join(hits) if hits else np.nan

def get_filtered_assay_master(assay_df, activity_type, unit, target_type_curated_extra, bao_label, strain):
    """Select the rows of `assay_df` that match the given assay metadata.

    `activity_type`, `target_type_curated_extra` and `bao_label` are matched
    by equality. `unit` and `strain` are matched by equality when they are
    strings; any non-string value (e.g. NaN or None) is interpreted as
    "missing" and matches the rows where that column is NaN.

    Parameters
    ----------
    assay_df : pandas.DataFrame
        Table with at least the columns 'activity_type', 'unit',
        'target_type_curated_extra', 'bao_label' and 'strain'.
    activity_type, target_type_curated_extra, bao_label : str
        Values the corresponding columns must equal.
    unit, strain : str or missing
        Value to match, or a non-string to select rows with a missing value.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `assay_df` (original index preserved).
    """
    def _equals_or_missing(column, value):
        # isinstance (rather than an exact type check) so that str subclasses
        # such as numpy.str_ are treated as real values, not as missing.
        if isinstance(value, str):
            return assay_df[column] == value
        return assay_df[column].isna()

    mask = (
        (assay_df['activity_type'] == activity_type)
        & _equals_or_missing('unit', unit)
        & (assay_df['target_type_curated_extra'] == target_type_curated_extra)
        & (assay_df['bao_label'] == bao_label)
        & _equals_or_missing('strain', strain)
    )
    return assay_df[mask]

def load_expert_cutoffs(CONFIGPATH):
    """
    Load expert cutoffs from a CSV file and return them as a dictionary.

    The CSV is read from:
        {CONFIGPATH}/expert_cutoffs.csv

    and must contain the columns "activity_type", "unit", "target_type",
    "pathogen_code" and "expert_cutoff". The "expert_cutoff" cell holds one
    or more numbers separated by ';'.

    Parameters
    ----------
    CONFIGPATH : str
        Folder that contains ``expert_cutoffs.csv``.

    Returns
    -------
    dict
        Maps (activity_type, unit, target_type, pathogen_code) to a list of
        floats. A missing "expert_cutoff" cell yields an empty list.
    """
    # Load expert cut-offs
    table = pd.read_csv(os.path.join(CONFIGPATH, "expert_cutoffs.csv"))
    columns = ["activity_type", "unit", "target_type", "pathogen_code", "expert_cutoff"]

    EXPERT_CUTOFFS = {}
    for activity_type, unit, target_type, pathogen_code, raw in table[columns].values:
        if pd.isna(raw):
            values = []
        else:
            # pandas parses the column as numeric when every row holds a single
            # cutoff, so go through str() before splitting on ';'.
            values = [float(token) for token in str(raw).split(";") if token.strip()]
        EXPERT_CUTOFFS[(activity_type, unit, target_type, pathogen_code)] = values

    return EXPERT_CUTOFFS

def load_ecfp_all(h5_path):
    """Read every fingerprint from an HDF5 file into an in-memory dictionary.

    Parameters
    ----------
    h5_path : str
        Path to an HDF5 file holding the datasets "SMILES" and "X_morgan".

    Returns
    -------
    dict[str, np.ndarray]
        Maps the string in column 3 of each "SMILES" row to the row of
        "X_morgan" at the same position.
        NOTE(review): the identifiers are presumably compound ChEMBL IDs and
        the rows Morgan count fingerprints - confirm against the file writer.
    """
    with h5py.File(h5_path, "r") as handle:
        identifiers = handle["SMILES"][:, 3].astype(str)
        fingerprints = handle["X_morgan"][:]  # the whole matrix is read into memory

    return dict(zip(identifiers, fingerprints))

def KFoldTrain(X, Y, n_splits=4, n_estimators=100, random_state=42):
    """Cross-validate a random forest with stratified, shuffled folds.

    A fresh RandomForestClassifier is fitted on each training split and
    scored by AUROC on the held-out split, using the predicted probability
    of the second class.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix (n_samples, n_features).
    Y : np.ndarray
        Binary labels (n_samples,).
    n_splits : int
        Number of folds.
    n_estimators : int
        Number of trees in each forest.
    random_state : int
        Seed used both for shuffling the folds and for the forests.

    Returns
    -------
    tuple[float, float]
        (mean AUROC, standard deviation of AUROC) over the folds, each
        rounded to 3 decimals.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_scores = []
    for fit_rows, eval_rows in splitter.split(X, Y):
        forest = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            n_jobs=8,
            random_state=random_state,
        )
        forest.fit(X[fit_rows], Y[fit_rows])
        positive_scores = forest.predict_proba(X[eval_rows])[:, 1]
        fold_scores.append(roc_auc_score(Y[eval_rows], positive_scores))

    mean_auroc = float(np.mean(fold_scores))
    std_auroc = float(np.std(fold_scores))
    return round(mean_auroc, 3), round(std_auroc, 3)

def TrainRF(X, Y, n_estimators=100, random_state=42):
    """Train a RandomForestClassifier on all provided data and return the fitted model.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix (n_samples, n_features).
    Y : np.ndarray
        Labels (n_samples,).
    n_estimators : int
        Number of trees in the random forest.
    random_state : int | None
        Seed for the forest. Defaults to 42, the same default as
        ``KFoldTrain``, so that the fitted model (and any predictions stored
        from it) is reproducible across runs. Pass ``None`` for an unseeded
        forest.

    Returns
    -------
    RandomForestClassifier
        Fitted classifier.
    """
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features="sqrt",
        n_jobs=8,
        random_state=random_state,
    )
    rf.fit(X, Y)
    return rf

# Ratio used when computing how many decoy compounds to add to a dataset
# (the decoy count is derived from positives / RATIO in the merging cell).
RATIO = 0.1

# Set and create path to correlations
# (exist_ok=True makes re-running this cell safe when the folder already exists)
PATH_TO_CORRELATIONS = os.path.join(OUTPUT, pathogen_code, "correlations")
os.makedirs(PATH_TO_CORRELATIONS, exist_ok=True)

In [1]:
# Load assays info
# All four tables are read from the pathogen's output folder.
print("Merging assay metadata")
ASSAYS_CLEANED = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_cleaned.csv"))
ASSAYS_PARAMETERS = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_parameters.csv"))
ASSAYS_DATASETS_ = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_datasets.csv"))
INDIVIDUAL_LM = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "individual_LM.csv"))

# Get assay to quantitative data info to collapse ASSAYS_DATASETS_ (1 row per assay)
# Maps (assay_id, activity_type, unit) -> list of [expert_cutoff, ratio_qt] pairs,
# one pair per row of ASSAYS_DATASETS_ that has this key.
assay_to_qt_info = defaultdict(list)
for assay_id, activity_type, unit, expert_cutoff, ratio_qt in ASSAYS_DATASETS_[['assay_id', 'activity_type', 'unit', 'expert_cutoff', 'ratio_qt']].values:
    assay_to_qt_info[tuple([assay_id, activity_type, unit])].append([expert_cutoff, ratio_qt])

# Unique row per assay
# expert_cutoff and ratio_qt are not among the selected columns, so rows that
# differ only in those two values collapse into one here.
ASSAYS_DATASETS = ASSAYS_DATASETS_[KEYS + COLUMNS_DATASETS].drop_duplicates().reset_index(drop=True)

# Get cutoffs and ratios
# Each entry is the ';'-joined string of all cutoffs (resp. ratios) recorded for the assay key.
cutoffs = [";".join([str(j[0]) for j in assay_to_qt_info[tuple(i)]]) for i in ASSAYS_DATASETS[['assay_id', 'activity_type', 'unit']].values]
ratios = [";".join([str(j[1]) for j in assay_to_qt_info[tuple(i)]]) for i in ASSAYS_DATASETS[['assay_id', 'activity_type', 'unit']].values]
# Only a string that is exactly 'nan' (a single missing value) is turned back into NaN;
# a multi-entry string such as 'nan;nan' is left as is.
cutoffs = [i if i != 'nan' else np.nan for i in cutoffs]
ratios = [i if i != 'nan' else np.nan for i in ratios]

# Store results
# Positional insert: with KEYS + COLUMNS_DATASETS as columns, positions 8 and 9
# fall right after "dataset_type".
ASSAYS_DATASETS.insert(8, 'cutoffs', cutoffs)
ASSAYS_DATASETS.insert(9, 'ratios', ratios)

# Merge everything
# validate="1:1" makes pandas raise if KEYS is not unique on either side of a merge.
ASSAYS_MASTER = ASSAYS_CLEANED.merge(ASSAYS_PARAMETERS,on=KEYS, how="left", validate="1:1")
ASSAYS_MASTER = ASSAYS_MASTER.merge(ASSAYS_DATASETS,on=KEYS, how="left", validate="1:1")

# Build (assay_id, activity_type, unit) -> set of compound ChEMBL IDs from the
# cleaned activity table, then drop the table to free memory.
print("Mapping assays to compounds")
ChEMBL = pd.read_csv(os.path.join(OUTPUT, pathogen_code, f"{pathogen_code}_ChEMBL_cleaned_data.csv.gz"), low_memory=False)
ASSAY_TO_COMPOUNDS = defaultdict(set)
activity_records = ChEMBL[["assay_chembl_id", "activity_type", "unit", "compound_chembl_id"]].values
for record in activity_records:
    ASSAY_TO_COMPOUNDS[tuple(record[:3])].add(record[3])
del ChEMBL

# Read every per-assay dataset (quantitative and qualitative) from its ZIP archive.
print("Loading individual datasets")
datasets_dir = os.path.join(OUTPUT, pathogen_code, "datasets")
qt_zip = os.path.join(datasets_dir, "datasets_qt.zip")
ql_zip = os.path.join(datasets_dir, "datasets_ql.zip")
dfs_qt = load_all_gz_csvs_from_zip(qt_zip)
dfs_ql = load_all_gz_csvs_from_zip(ql_zip)
print("Loaded quantitative:", len(dfs_qt), "datasets")
print("Loaded qualitative:", len(dfs_ql), "datasets")

# Collect, per modeling strategy (A-D), the accepted and the considered assays
LABELS = ['A', 'B', 'C', 'D']
ACCEPTED_ASSAYS, CONSIDERED_ASSAYS = get_all_results_from_individual_modeling(INDIVIDUAL_LM, LABELS)

# For every assay of the master table, record the strategies in which it was
# accepted and those in which it was considered (NaN when there are none).
master_keys = [tuple(key_row) for key_row in ASSAYS_MASTER[["assay_id", "activity_type", "unit"]].values]
col_considered = [where_considered(key, LABELS, CONSIDERED_ASSAYS) for key in master_keys]
col_accepted = [where_accepted(key, LABELS, ACCEPTED_ASSAYS) for key in master_keys]
ASSAYS_MASTER['Accepted'] = col_accepted
ASSAYS_MASTER['Considered'] = col_considered

# Fix the column order of the master table (columns not listed here are dropped)
ALL_COLS = [
    "assay_id", "assay_type", "assay_organism", "target_organism", "organism_curated", "doc_chembl_id",
    "target_type", "target_type_curated", "target_type_curated_extra",
    "target_chembl_id", "target_chembl_id_curated", "target_name_curated",
    "bao_label", "source_label", "strain", "atcc_id", "mutations", "known_drug_resistances", "media",
    "activity_type", "unit", "activities", "nan_values", "cpds", "frac_cs", "direction",
    "act_flag", 'inact_flag', "equal", "higher", "lower", "dataset_type", "cutoffs", "ratios",
    "cpds_qt", "pos_ql", "ratio_ql", "cpds_ql",
    "min_", "p1", "p25", "p50", "p75", "p99", "max_",
    'Accepted', 'Considered',
]
ASSAYS_MASTER = ASSAYS_MASTER[ALL_COLS]

# Keys of the assays accepted by at least one strategy, and the union of their compounds
is_accepted = ASSAYS_MASTER['Accepted'].notna()
accepted_assays = ASSAYS_MASTER[is_accepted][['assay_id', 'activity_type', 'unit']].values
accepted_compounds = {
    compound
    for assay_key in accepted_assays
    for compound in ASSAY_TO_COMPOUNDS[tuple(assay_key)]
}

# Reference set: the 'reference_smiles' column of reference_set.csv.gz
reference_table = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "reference_set.csv.gz"))
REFERENCE_SET = reference_table['reference_smiles'].tolist()

# All compounds tested against the pathogen
compound_counts = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "compound_counts.csv.gz"))
compounds = set(compound_counts['compound_chembl_id'])

# Expert cut-offs keyed by (activity_type, unit, target_type, pathogen_code)
EXPERT_CUTOFFS = load_expert_cutoffs(CONFIGPATH)

# Morgan fingerprints for the whole of ChEMBL
print("Loading ECFPs...")
PATH_TO_ECFPs = os.path.join(DATAPATH, "chembl_processed", "ChEMBL_ECFPs.h5")
ecfps = load_ecfp_all(PATH_TO_ECFPs)

# Fingerprinted compounds that were not tested against the pathogen
DECOYS_CHEMBL = {identifier for identifier in ecfps if identifier not in compounds}

# Fingerprint matrix of the reference set; entries without a fingerprint are skipped.
# NOTE(review): REFERENCE_SET comes from a column named 'reference_smiles' but is
# looked up in `ecfps` with the same keys as the compound IDs above - confirm that
# this column holds the identifiers used as fingerprint keys.
X_REF = np.array([ecfps[reference_id] for reference_id in REFERENCE_SET if reference_id in ecfps])

Merging assay metadata


NameError: name 'pd' is not defined

In [61]:
def to_merge_unique_cpds(df, group_keys, assay_to_compounds):
    """
    Group assays by `group_keys` and summarise each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Assay table containing the columns in `group_keys` plus
        "assay_id", "activity_type" and "unit".
    group_keys : list[str]
        Columns to group by. Groups with missing key values are kept
        (``dropna=False``).
    assay_to_compounds : Mapping[tuple, set]
        Maps (assay_id, activity_type, unit) to a set of compound IDs.
        Keys absent from the mapping contribute no compounds.

    Returns
    -------
    pandas.DataFrame
        One row per group, sorted by `n_cpds_union` in descending order, with
        the group-key columns followed by:
          - n_assays: number of unique assay keys in the group
          - n_cpds_union: number of unique compounds across those assays
          - assay_keys: ';'-separated tuple strings
            "(assay_id, activity_type, unit)" (last column)
        An empty `df` yields an empty table with these same columns.
    """
    summary_columns = ["n_assays", "n_cpds_union", "assay_keys"]

    # Handle empty input explicitly instead of relying on what groupby/apply
    # returns for a frame without groups.
    if len(df) == 0:
        return pd.DataFrame(columns=list(group_keys) + summary_columns)

    def collect_assay_keys(block):
        """Return unique (assay_id, activity_type, unit) keys for this group."""
        keys = sorted({tuple(r) for r in block.values})
        return keys  # list[tuple]

    def union_size(keys):
        """Return size of union of compounds for the given assay keys."""
        u = set()
        for k in keys:
            u |= assay_to_compounds.get(k, set())
        return len(u)

    out = (df.groupby(group_keys, dropna=False)[["assay_id", "activity_type", "unit"]]
             .apply(collect_assay_keys)
             .reset_index(name="assay_keys"))

    out["n_assays"] = out["assay_keys"].apply(len)
    out["n_cpds_union"] = out["assay_keys"].apply(union_size)

    # store the keys as ';'-separated tuple strings
    out["assay_keys"] = out["assay_keys"].apply(lambda ks: ";".join(map(str, ks)))

    # make assay_keys the last column
    cols = [c for c in out.columns if c != "assay_keys"] + ["assay_keys"]
    out = out[cols]

    return out.sort_values("n_cpds_union", ascending=False).reset_index(drop=True)


In [None]:
# Candidate groups for merging: assays not accepted by any strategy, grouped by
# shared metadata, separately for organism-level and single-protein assays.
print("Identifying potential assays to merge")
print("Organisms...")
keys_organism = ["activity_type", "unit", "target_type_curated_extra", "bao_label", "strain"]
organism_mask = ASSAYS_MASTER['Accepted'].isna() & (ASSAYS_MASTER['target_type_curated_extra'] == 'ORGANISM')
FILTERED_ASSAYS_ORGANISM = ASSAYS_MASTER[organism_mask].copy()
TO_MERGE_ORGANISM = to_merge_unique_cpds(FILTERED_ASSAYS_ORGANISM, keys_organism, ASSAY_TO_COMPOUNDS)

print("Single proteins...")
# NOTE(review): keys_single_protein (which adds 'target_chembl_id') is defined but
# the grouping below is done with keys_organism, so single-protein assays are
# grouped across targets. Confirm whether that is intended before changing it:
# the merging cell filters on the keys_organism fields only.
keys_single_protein = ["activity_type", "unit", "target_type_curated_extra", "bao_label", "strain", 'target_chembl_id']  # target name
single_protein_mask = ASSAYS_MASTER['Accepted'].isna() & (ASSAYS_MASTER['target_type_curated_extra'] == 'SINGLE PROTEIN')
FILTERED_ASSAYS_SINGLE_PROTEIN = ASSAYS_MASTER[single_protein_mask].copy()
TO_MERGE_SINGLE_PROTEIN = to_merge_unique_cpds(FILTERED_ASSAYS_SINGLE_PROTEIN, keys_organism, ASSAY_TO_COMPOUNDS)

# Keep only groups worth merging: more than one assay and more than 1000 unique compounds
worth_merging_organism = (TO_MERGE_ORGANISM['n_cpds_union'] > 1000) & (TO_MERGE_ORGANISM['n_assays'] > 1)
TO_MERGE_ORGANISM = TO_MERGE_ORGANISM[worth_merging_organism]
worth_merging_single_protein = (TO_MERGE_SINGLE_PROTEIN['n_cpds_union'] > 1000) & (TO_MERGE_SINGLE_PROTEIN['n_assays'] > 1)
TO_MERGE_SINGLE_PROTEIN = TO_MERGE_SINGLE_PROTEIN[worth_merging_single_protein]

Identifying potential assays to merge
Organisms...
Single proteins...
Loading ECFPs...


In [103]:
MERGED_COMPOUNDS = []
DATA = {"ORGANISM": TO_MERGE_ORGANISM,
        "SINGLE PROTEIN": TO_MERGE_SINGLE_PROTEIN}

# Scope of this run. The defaults reproduce the limits that were hard-coded in
# this cell: only single-protein groups, and only the first candidate group.
TARGET_TYPES_TO_MERGE = ['SINGLE PROTEIN']  # add 'ORGANISM' to process organism-level groups too
MAX_GROUPS_PER_TARGET_TYPE = 1              # set to None to process every candidate group

# Decoys are drawn from a sorted list: iterating a set of strings has no stable
# order across kernels, so sampling from list(set) is not reproducible even
# with a fixed seed. Sorted once here because the pool does not change.
DECOY_POOL = sorted(DECOYS_CHEMBL)


def _slug(value):
    """Render a metadata value for use in a file name (spaces become dashes)."""
    return str(value).replace(" ", "-")


def _model_merged_dataset(data, description, filename, decoy_pool):
    """Evaluate one merged dataset and, if predictive, store reference-set probabilities.

    Parameters
    ----------
    data : pandas.DataFrame
        Merged, de-duplicated dataset with columns 'compound_chembl_id' and 'bin'.
    description : str
        Human-readable description of the merge, used in the progress output.
    filename : str
        Name of the .npz file written under PATH_TO_CORRELATIONS/M when the
        cross-validated mean AUROC exceeds 0.7.
    decoy_pool : list
        Ordered identifiers (present in `ecfps`) from which decoys are sampled
        when more than half of the compounds are positive.

    Side effects: prints progress, extends MERGED_COMPOUNDS and may write a file.
    """
    # Prepare matrices for training
    X = np.array(data['compound_chembl_id'].map(ecfps).to_list())
    Y = np.array(data['bin'].tolist())
    n_pos = sum(Y)

    if not n_pos > 50:
        print(f"Too few positive compounds... ({n_pos})")
        return

    print(f"Merging ... {description}")
    print(f"\tCompounds: {len(X)}", f"Positives: {n_pos} ({round(100 * n_pos / len(Y), 1)}%)")

    if n_pos / len(Y) > 0.5:
        print(f"\tRatio too high: Adding random compounds from ChEMBL as decoys")
        # Same decoy count as before, capped by the number of available decoys
        n_decoys = int(n_pos / RATIO - (len(Y) - 1))
        n_decoys = max(0, min(n_decoys, len(decoy_pool)))
        print(f"\t{n_decoys} added decoys")
        if n_decoys > 0:
            rng = random.Random(42)
            decoy_ids = rng.sample(decoy_pool, n_decoys)
            X_decoys = np.array([ecfps[i] for i in decoy_ids])
            X = np.vstack([X, X_decoys])
            Y = np.concatenate([Y, np.zeros(len(X_decoys), dtype=Y.dtype)])
        print(f"\tCompounds: {len(X)}", f"Positives: {sum(Y)} ({round(100 * sum(Y) / len(Y),3)}%)")

    # 5-fold cross-validation
    average_auroc, stds = KFoldTrain(X, Y, n_splits=5, n_estimators=100)
    print(f"\tMean AUROC: {average_auroc} ± {stds}")
    if average_auroc > 0.7:
        MERGED_COMPOUNDS.extend(data['compound_chembl_id'].tolist())
        RF = TrainRF(X, Y, n_estimators=100)
        y_prob_ref = RF.predict_proba(X_REF)[:, 1]
        os.makedirs(os.path.join(PATH_TO_CORRELATIONS, "M"), exist_ok=True)
        np.savez_compressed(os.path.join(PATH_TO_CORRELATIONS, "M", filename), y_prob_ref=y_prob_ref)


for target_type in TARGET_TYPES_TO_MERGE:

    print(target_type)

    # Copy df
    data_target_type = DATA[target_type].copy()

    # Assay table the candidate groups of this target type were built from
    if target_type == 'ORGANISM':
        filtered_assays = FILTERED_ASSAYS_ORGANISM
    else:
        filtered_assays = FILTERED_ASSAYS_SINGLE_PROTEIN

    # Iterate over candidate groups
    for n_processed, merging in enumerate(data_target_type.itertuples()):

        if MAX_GROUPS_PER_TARGET_TYPE is not None and n_processed >= MAX_GROUPS_PER_TARGET_TYPE:
            break

        # Get data
        activity_type = merging.activity_type
        unit = merging.unit
        target_type_curated_extra = merging.target_type_curated_extra
        bao_label = merging.bao_label
        strain = merging.strain

        # Filter master table
        df = get_filtered_assay_master(filtered_assays, activity_type, unit, target_type_curated_extra, bao_label, strain)

        # Get quantitative and qualitative
        # NOTE(review): 'mixed' assays appear in both subsets, but the qualitative
        # branch below only runs when there is no quantitative assay at all.
        df_quant = df[(df['dataset_type'] == 'quantitative') | (df['dataset_type'] == 'mixed')].reset_index(drop=True)
        df_qual = df[(df['dataset_type'] == 'qualitative') | (df['dataset_type'] == 'mixed')].reset_index(drop=True)

        # Shared file-name prefix; str() keeps a missing label from raising
        filename_prefix = f'{activity_type}_{unit}_{_slug(target_type_curated_extra)}_{_slug(bao_label)}_{_slug(strain)}'

        if len(df_quant) > 0:

            # QUANTITATIVE
            assays = df_quant['assay_id'].tolist()

            # For each expert cut-off
            for expert_cutoff in EXPERT_CUTOFFS[(activity_type, unit, target_type_curated_extra, pathogen_code)]:

                # Concatenate all files/data together
                files = [f"{i}_{activity_type}_{unit}_qt_{expert_cutoff}.csv.gz" for i in assays]
                data = pd.concat([dfs_qt[f].assign(assay_id=a) for a, f in zip(assays, files)], ignore_index=True)
                # One row per compound: the record with the lowest 'value' is kept.
                # NOTE(review): confirm that the lowest value is the right record to
                # keep for activity types where a higher value means more active.
                data = data.sort_values("value", ascending=True).drop_duplicates("compound_chembl_id", keep="first").reset_index(drop=True)

                _model_merged_dataset(
                    data,
                    description=f"Activity type: {activity_type}, Unit: {unit}, Cutoff: {expert_cutoff}, Strain {strain}",
                    filename=f'{filename_prefix}_qt_{expert_cutoff}_ref_probs.npz',
                    decoy_pool=DECOY_POOL,
                )

        elif len(df_qual) > 0:

            # QUALITATIVE
            # Concatenate all files/data together
            assays = df_qual['assay_id'].tolist()
            files = [f"{i}_{activity_type}_{unit}_ql.csv.gz" for i in assays]
            data = pd.concat([dfs_ql[f].assign(assay_id=a) for a, f in zip(assays, files)], ignore_index=True)
            data = data.sort_values("value", ascending=True).drop_duplicates("compound_chembl_id", keep="first").reset_index(drop=True)

            # No expert cut-off applies to qualitative data, so none is reported
            _model_merged_dataset(
                data,
                description=f"Activity type: {activity_type}, Unit: {unit}, Qualitative, Strain {strain}",
                filename=f'{filename_prefix}_ql_ref_probs.npz',
                decoy_pool=DECOY_POOL,
            )

        else:
            raise TypeError("df_quant and df_qual are empty...")

SINGLE PROTEIN
Too few positive compounds... (0)
Too few positive compounds... (0)
Too few positive compounds... (0)


In [108]:
# Show the single-protein merge candidates as this cell's output
TO_MERGE_SINGLE_PROTEIN

Unnamed: 0,activity_type,unit,target_type_curated_extra,bao_label,strain,n_assays,n_cpds_union,assay_keys
0,PERCENTEFFECT,%,SINGLE PROTEIN,assay format,H37Rv,5,68613,"('CHEMBL4649941', 'PERCENTEFFECT', '%');('CHEMBL4649947', 'PERCENTEFFECT', '%');('CHEMBL4649957', 'PERCENTEFFECT', '%');('CHEMBL4649965', 'PERCENTEFFECT', '%');('CHEMBL4649972', 'PERCENTEFFECT', '%')"
1,IC50,umol.L-1,SINGLE PROTEIN,assay format,H37Rv,104,2793,"('CHEMBL1047043', 'IC50', 'umol.L-1');('CHEMBL1047044', 'IC50', 'umol.L-1');('CHEMBL1047045', 'IC50', 'umol.L-1');('CHEMBL1047046', 'IC50', 'umol.L-1');('CHEMBL1071686', 'IC50', 'umol.L-1');('CHEMBL1220932', 'IC50', 'umol.L-1');('CHEMBL1220933', 'IC50', 'umol.L-1');('CHEMBL1274290', 'IC50', 'umol.L-1');('CHEMBL1920120', 'IC50', 'umol.L-1');('CHEMBL1943105', 'IC50', 'umol.L-1');('CHEMBL2033568', 'IC50', 'umol.L-1');('CHEMBL2033572', 'IC50', 'umol.L-1');('CHEMBL2344503', 'IC50', 'umol.L-1');('CHEMBL3090429', 'IC50', 'umol.L-1');('CHEMBL3117257', 'IC50', 'umol.L-1');('CHEMBL3227733', 'IC50', 'umol.L-1');('CHEMBL3238750', 'IC50', 'umol.L-1');('CHEMBL3294030', 'IC50', 'umol.L-1');('CHEMBL3366298', 'IC50', 'umol.L-1');('CHEMBL3367585', 'IC50', 'umol.L-1');('CHEMBL3386588', 'IC50', 'umol.L-1');('CHEMBL3386589', 'IC50', 'umol.L-1');('CHEMBL3386590', 'IC50', 'umol.L-1');('CHEMBL3387253', 'IC50', 'umol.L-1');('CHEMBL3390735', 'IC50', 'umol.L-1');('CHEMBL3405737', 'IC50', 'umol.L-1');('CHEMBL3421148', 'IC50', 'umol.L-1');('CHEMBL3582757', 'IC50', 'umol.L-1');('CHEMBL3610297', 'IC50', 'umol.L-1');('CHEMBL3630322', 'IC50', 'umol.L-1');('CHEMBL3737110', 'IC50', 'umol.L-1');('CHEMBL3745429', 'IC50', 'umol.L-1');('CHEMBL3755401', 'IC50', 'umol.L-1');('CHEMBL3758003', 'IC50', 'umol.L-1');('CHEMBL3769290', 'IC50', 'umol.L-1');('CHEMBL3865749', 'IC50', 'umol.L-1');('CHEMBL3875318', 'IC50', 'umol.L-1');('CHEMBL3875323', 'IC50', 'umol.L-1');('CHEMBL3997430', 'IC50', 'umol.L-1');('CHEMBL4004348', 'IC50', 'umol.L-1');('CHEMBL4020311', 'IC50', 'umol.L-1');('CHEMBL4028447', 'IC50', 'umol.L-1');('CHEMBL4050695', 'IC50', 'umol.L-1');('CHEMBL4136233', 'IC50', 'umol.L-1');('CHEMBL4146152', 'IC50', 'umol.L-1');('CHEMBL4146154', 'IC50', 'umol.L-1');('CHEMBL4201103', 'IC50', 'umol.L-1');('CHEMBL4201104', 'IC50', 'umol.L-1');('CHEMBL4201108', 'IC50', 'umol.L-1');('CHEMBL4201109', 'IC50', 'umol.L-1');('CHEMBL4201110', 'IC50', 
'umol.L-1');('CHEMBL4256093', 'IC50', 'umol.L-1');('CHEMBL4371338', 'IC50', 'umol.L-1');('CHEMBL4383980', 'IC50', 'umol.L-1');('CHEMBL4383981', 'IC50', 'umol.L-1');('CHEMBL4394894', 'IC50', 'umol.L-1');('CHEMBL4434514', 'IC50', 'umol.L-1');('CHEMBL4481836', 'IC50', 'umol.L-1');('CHEMBL4649941', 'IC50', 'umol.L-1');('CHEMBL4649947', 'IC50', 'umol.L-1');('CHEMBL4649957', 'IC50', 'umol.L-1');('CHEMBL4649965', 'IC50', 'umol.L-1');('CHEMBL4649972', 'IC50', 'umol.L-1');('CHEMBL4680629', 'IC50', 'umol.L-1');('CHEMBL4680630', 'IC50', 'umol.L-1');('CHEMBL4831382', 'IC50', 'umol.L-1');('CHEMBL4832952', 'IC50', 'umol.L-1');('CHEMBL5050652', 'IC50', 'umol.L-1');('CHEMBL5104624', 'IC50', 'umol.L-1');('CHEMBL5105160', 'IC50', 'umol.L-1');('CHEMBL5107313', 'IC50', 'umol.L-1');('CHEMBL5139608', 'IC50', 'umol.L-1');('CHEMBL5146984', 'IC50', 'umol.L-1');('CHEMBL5226822', 'IC50', 'umol.L-1');('CHEMBL5226832', 'IC50', 'umol.L-1');('CHEMBL5262988', 'IC50', 'umol.L-1');('CHEMBL5375438', 'IC50', 'umol.L-1');('CHEMBL5379299', 'IC50', 'umol.L-1');('CHEMBL5379301', 'IC50', 'umol.L-1');('CHEMBL5379305', 'IC50', 'umol.L-1');('CHEMBL5379309', 'IC50', 'umol.L-1');('CHEMBL5379329', 'IC50', 'umol.L-1');('CHEMBL5379342', 'IC50', 'umol.L-1');('CHEMBL5379344', 'IC50', 'umol.L-1');('CHEMBL5379350', 'IC50', 'umol.L-1');('CHEMBL5379351', 'IC50', 'umol.L-1');('CHEMBL5379353', 'IC50', 'umol.L-1');('CHEMBL5379354', 'IC50', 'umol.L-1');('CHEMBL5379357', 'IC50', 'umol.L-1');('CHEMBL5379358', 'IC50', 'umol.L-1');('CHEMBL5379359', 'IC50', 'umol.L-1');('CHEMBL5379362', 'IC50', 'umol.L-1');('CHEMBL5379363', 'IC50', 'umol.L-1');('CHEMBL5379365', 'IC50', 'umol.L-1');('CHEMBL5379366', 'IC50', 'umol.L-1');('CHEMBL5379367', 'IC50', 'umol.L-1');('CHEMBL5379368', 'IC50', 'umol.L-1');('CHEMBL5379369', 'IC50', 'umol.L-1');('CHEMBL5379371', 'IC50', 'umol.L-1');('CHEMBL5528588', 'IC50', 'umol.L-1');('CHEMBL5622563', 'IC50', 'umol.L-1');('CHEMBL5622570', 'IC50', 'umol.L-1');('CHEMBL837765', 'IC50', 
'umol.L-1');('CHEMBL867488', 'IC50', 'umol.L-1')"
2,IC50,umol.L-1,SINGLE PROTEIN,assay format,,219,1831,"('CHEMBL1014575', 'IC50', 'umol.L-1');('CHEMBL1047048', 'IC50', 'umol.L-1');('CHEMBL1059883', 'IC50', 'umol.L-1');('CHEMBL1167757', 'IC50', 'umol.L-1');('CHEMBL1273331', 'IC50', 'umol.L-1');('CHEMBL1280245', 'IC50', 'umol.L-1');('CHEMBL1280246', 'IC50', 'umol.L-1');('CHEMBL1280247', 'IC50', 'umol.L-1');('CHEMBL1280248', 'IC50', 'umol.L-1');('CHEMBL1280249', 'IC50', 'umol.L-1');('CHEMBL1280251', 'IC50', 'umol.L-1');('CHEMBL1280252', 'IC50', 'umol.L-1');('CHEMBL1280253', 'IC50', 'umol.L-1');('CHEMBL1280254', 'IC50', 'umol.L-1');('CHEMBL1284665', 'IC50', 'umol.L-1');('CHEMBL1284666', 'IC50', 'umol.L-1');('CHEMBL1640592', 'IC50', 'umol.L-1');('CHEMBL1775824', 'IC50', 'umol.L-1');('CHEMBL1775825', 'IC50', 'umol.L-1');('CHEMBL1803417', 'IC50', 'umol.L-1');('CHEMBL1803584', 'IC50', 'umol.L-1');('CHEMBL1819921', 'IC50', 'umol.L-1');('CHEMBL1819932', 'IC50', 'umol.L-1');('CHEMBL1914227', 'IC50', 'umol.L-1');('CHEMBL1948969', 'IC50', 'umol.L-1');('CHEMBL1949382', 'IC50', 'umol.L-1');('CHEMBL2020901', 'IC50', 'umol.L-1');('CHEMBL2027585', 'IC50', 'umol.L-1');('CHEMBL2027818', 'IC50', 'umol.L-1');('CHEMBL2027819', 'IC50', 'umol.L-1');('CHEMBL2049310', 'IC50', 'umol.L-1');('CHEMBL2049311', 'IC50', 'umol.L-1');('CHEMBL2049312', 'IC50', 'umol.L-1');('CHEMBL2049313', 'IC50', 'umol.L-1');('CHEMBL2049314', 'IC50', 'umol.L-1');('CHEMBL2049315', 'IC50', 'umol.L-1');('CHEMBL2049316', 'IC50', 'umol.L-1');('CHEMBL2049317', 'IC50', 'umol.L-1');('CHEMBL2049924', 'IC50', 'umol.L-1');('CHEMBL2060781', 'IC50', 'umol.L-1');('CHEMBL2060782', 'IC50', 'umol.L-1');('CHEMBL2060783', 'IC50', 'umol.L-1');('CHEMBL2060838', 'IC50', 'umol.L-1');('CHEMBL2060843', 'IC50', 'umol.L-1');('CHEMBL2060844', 'IC50', 'umol.L-1');('CHEMBL2155131', 'IC50', 'umol.L-1');('CHEMBL2163154', 'IC50', 'umol.L-1');('CHEMBL2209291', 'IC50', 'umol.L-1');('CHEMBL2317721', 'IC50', 'umol.L-1');('CHEMBL2317722', 'IC50', 'umol.L-1');('CHEMBL2330175', 'IC50', 
'umol.L-1');('CHEMBL2352780', 'IC50', 'umol.L-1');('CHEMBL2383452', 'IC50', 'umol.L-1');('CHEMBL2383453', 'IC50', 'umol.L-1');('CHEMBL2383454', 'IC50', 'umol.L-1');('CHEMBL2383455', 'IC50', 'umol.L-1');('CHEMBL2389281', 'IC50', 'umol.L-1');('CHEMBL2400180', 'IC50', 'umol.L-1');('CHEMBL2423608', 'IC50', 'umol.L-1');('CHEMBL2429128', 'IC50', 'umol.L-1');('CHEMBL3089491', 'IC50', 'umol.L-1');('CHEMBL3123111', 'IC50', 'umol.L-1');('CHEMBL3136322', 'IC50', 'umol.L-1');('CHEMBL3222350', 'IC50', 'umol.L-1');('CHEMBL3222354', 'IC50', 'umol.L-1');('CHEMBL3222410', 'IC50', 'umol.L-1');('CHEMBL3222880', 'IC50', 'umol.L-1');('CHEMBL3224633', 'IC50', 'umol.L-1');('CHEMBL3224681', 'IC50', 'umol.L-1');('CHEMBL3226681', 'IC50', 'umol.L-1');('CHEMBL3267771', 'IC50', 'umol.L-1');('CHEMBL3269011', 'IC50', 'umol.L-1');('CHEMBL3269012', 'IC50', 'umol.L-1');('CHEMBL3271706', 'IC50', 'umol.L-1');('CHEMBL3294711', 'IC50', 'umol.L-1');('CHEMBL3364655', 'IC50', 'umol.L-1');('CHEMBL3365136', 'IC50', 'umol.L-1');('CHEMBL3372573', 'IC50', 'umol.L-1');('CHEMBL3373189', 'IC50', 'umol.L-1');('CHEMBL3376144', 'IC50', 'umol.L-1');('CHEMBL3376775', 'IC50', 'umol.L-1');('CHEMBL3380680', 'IC50', 'umol.L-1');('CHEMBL3385192', 'IC50', 'umol.L-1');('CHEMBL3389875', 'IC50', 'umol.L-1');('CHEMBL3398982', 'IC50', 'umol.L-1');('CHEMBL3405426', 'IC50', 'umol.L-1');('CHEMBL3411493', 'IC50', 'umol.L-1');('CHEMBL3428753', 'IC50', 'umol.L-1');('CHEMBL3591923', 'IC50', 'umol.L-1');('CHEMBL3591924', 'IC50', 'umol.L-1');('CHEMBL3598586', 'IC50', 'umol.L-1');('CHEMBL3610476', 'IC50', 'umol.L-1');('CHEMBL3742595', 'IC50', 'umol.L-1');('CHEMBL3745400', 'IC50', 'umol.L-1');('CHEMBL3748286', 'IC50', 'umol.L-1');('CHEMBL3758009', 'IC50', 'umol.L-1');('CHEMBL3771811', 'IC50', 'umol.L-1');('CHEMBL3783624', 'IC50', 'umol.L-1');('CHEMBL3790381', 'IC50', 'umol.L-1');('CHEMBL3790390', 'IC50', 'umol.L-1');('CHEMBL3819980', 'IC50', 'umol.L-1');('CHEMBL3830492', 'IC50', 'umol.L-1');('CHEMBL3854458', 'IC50', 
'umol.L-1');('CHEMBL3855802', 'IC50', 'umol.L-1');('CHEMBL3861815', 'IC50', 'umol.L-1');('CHEMBL3862252', 'IC50', 'umol.L-1');('CHEMBL3862339', 'IC50', 'umol.L-1');('CHEMBL3866222', 'IC50', 'umol.L-1');('CHEMBL3870634', 'IC50', 'umol.L-1');('CHEMBL3870635', 'IC50', 'umol.L-1');('CHEMBL3876018', 'IC50', 'umol.L-1');('CHEMBL3997432', 'IC50', 'umol.L-1');('CHEMBL3997433', 'IC50', 'umol.L-1');('CHEMBL4013037', 'IC50', 'umol.L-1');('CHEMBL4024193', 'IC50', 'umol.L-1');('CHEMBL4026338', 'IC50', 'umol.L-1');('CHEMBL4037133', 'IC50', 'umol.L-1');('CHEMBL4038353', 'IC50', 'umol.L-1');('CHEMBL4046584', 'IC50', 'umol.L-1');('CHEMBL4046585', 'IC50', 'umol.L-1');('CHEMBL4050696', 'IC50', 'umol.L-1');('CHEMBL4156198', 'IC50', 'umol.L-1');('CHEMBL4187651', 'IC50', 'umol.L-1');('CHEMBL4187660', 'IC50', 'umol.L-1');('CHEMBL4187899', 'IC50', 'umol.L-1');('CHEMBL4198722', 'IC50', 'umol.L-1');('CHEMBL4222533', 'IC50', 'umol.L-1');('CHEMBL4224118', 'IC50', 'umol.L-1');('CHEMBL4355041', 'IC50', 'umol.L-1');('CHEMBL4364014', 'IC50', 'umol.L-1');('CHEMBL4371340', 'IC50', 'umol.L-1');('CHEMBL4372869', 'IC50', 'umol.L-1');('CHEMBL4376610', 'IC50', 'umol.L-1');('CHEMBL4383585', 'IC50', 'umol.L-1');('CHEMBL4383977', 'IC50', 'umol.L-1');('CHEMBL4384276', 'IC50', 'umol.L-1');('CHEMBL4384698', 'IC50', 'umol.L-1');('CHEMBL4388634', 'IC50', 'umol.L-1');('CHEMBL4404083', 'IC50', 'umol.L-1');('CHEMBL4405868', 'IC50', 'umol.L-1');('CHEMBL4407818', 'IC50', 'umol.L-1');('CHEMBL4415804', 'IC50', 'umol.L-1');('CHEMBL4415805', 'IC50', 'umol.L-1');('CHEMBL4416097', 'IC50', 'umol.L-1');('CHEMBL4481956', 'IC50', 'umol.L-1');('CHEMBL4618720', 'IC50', 'umol.L-1');('CHEMBL4618722', 'IC50', 'umol.L-1');('CHEMBL4618723', 'IC50', 'umol.L-1');('CHEMBL4618724', 'IC50', 'umol.L-1');('CHEMBL4618730', 'IC50', 'umol.L-1');('CHEMBL4618731', 'IC50', 'umol.L-1');('CHEMBL4618732', 'IC50', 'umol.L-1');('CHEMBL4620624', 'IC50', 'umol.L-1');('CHEMBL4626292', 'IC50', 'umol.L-1');('CHEMBL4626322', 'IC50', 
'umol.L-1');('CHEMBL4626323', 'IC50', 'umol.L-1');('CHEMBL4626324', 'IC50', 'umol.L-1');('CHEMBL4626325', 'IC50', 'umol.L-1');('CHEMBL4668639', 'IC50', 'umol.L-1');('CHEMBL4673753', 'IC50', 'umol.L-1');('CHEMBL4680623', 'IC50', 'umol.L-1');('CHEMBL4701666', 'IC50', 'umol.L-1');('CHEMBL4701667', 'IC50', 'umol.L-1');('CHEMBL4724285', 'IC50', 'umol.L-1');('CHEMBL4726089', 'IC50', 'umol.L-1');('CHEMBL4726091', 'IC50', 'umol.L-1');('CHEMBL4770222', 'IC50', 'umol.L-1');('CHEMBL4771034', 'IC50', 'umol.L-1');('CHEMBL4771263', 'IC50', 'umol.L-1');('CHEMBL4775328', 'IC50', 'umol.L-1');('CHEMBL4775330', 'IC50', 'umol.L-1');('CHEMBL5041649', 'IC50', 'umol.L-1');('CHEMBL5041650', 'IC50', 'umol.L-1');('CHEMBL5041652', 'IC50', 'umol.L-1');('CHEMBL5041653', 'IC50', 'umol.L-1');('CHEMBL5043465', 'IC50', 'umol.L-1');('CHEMBL5056049', 'IC50', 'umol.L-1');('CHEMBL5057947', 'IC50', 'umol.L-1');('CHEMBL5098075', 'IC50', 'umol.L-1');('CHEMBL5098079', 'IC50', 'umol.L-1');('CHEMBL5113636', 'IC50', 'umol.L-1');('CHEMBL5113644', 'IC50', 'umol.L-1');('CHEMBL5127350', 'IC50', 'umol.L-1');('CHEMBL5139649', 'IC50', 'umol.L-1');('CHEMBL5139671', 'IC50', 'umol.L-1');('CHEMBL5139675', 'IC50', 'umol.L-1');('CHEMBL5144258', 'IC50', 'umol.L-1');('CHEMBL5144260', 'IC50', 'umol.L-1');('CHEMBL5144261', 'IC50', 'umol.L-1');('CHEMBL5144262', 'IC50', 'umol.L-1');('CHEMBL5144263', 'IC50', 'umol.L-1');('CHEMBL5154107', 'IC50', 'umol.L-1');('CHEMBL5161730', 'IC50', 'umol.L-1');('CHEMBL5161731', 'IC50', 'umol.L-1');('CHEMBL5161732', 'IC50', 'umol.L-1');('CHEMBL5229090', 'IC50', 'umol.L-1');('CHEMBL5229091', 'IC50', 'umol.L-1');('CHEMBL5229096', 'IC50', 'umol.L-1');('CHEMBL5229097', 'IC50', 'umol.L-1');('CHEMBL5237926', 'IC50', 'umol.L-1');('CHEMBL5246226', 'IC50', 'umol.L-1');('CHEMBL5260790', 'IC50', 'umol.L-1');('CHEMBL5262989', 'IC50', 'umol.L-1');('CHEMBL5330238', 'IC50', 'umol.L-1');('CHEMBL5365560', 'IC50', 'umol.L-1');('CHEMBL5372641', 'IC50', 'umol.L-1');('CHEMBL5372642', 'IC50', 
'umol.L-1');('CHEMBL5500454', 'IC50', 'umol.L-1');('CHEMBL5500457', 'IC50', 'umol.L-1');('CHEMBL5610229', 'IC50', 'umol.L-1');('CHEMBL5623086', 'IC50', 'umol.L-1');('CHEMBL5623087', 'IC50', 'umol.L-1');('CHEMBL5623728', 'IC50', 'umol.L-1');('CHEMBL5623737', 'IC50', 'umol.L-1');('CHEMBL648365', 'IC50', 'umol.L-1');('CHEMBL962127', 'IC50', 'umol.L-1');('CHEMBL963808', 'IC50', 'umol.L-1');('CHEMBL966201', 'IC50', 'umol.L-1');('CHEMBL989742', 'IC50', 'umol.L-1')"


In [None]:
# Collapse the merged compounds into a set so duplicates are dropped,
# then display how many unique entries remain.
merged_compounds = {*merged_compounds}
len(merged_compounds)

In [None]:
# Count the merged compounds that do not appear in `accepted_compounds`.
# A generator is summed instead of materialising a temporary list.
sum(1 for compound in merged_compounds if compound not in accepted_compounds)

In [None]:
# Size of the combined pool: every compound that is in the merged set,
# in `accepted_compounds`, or in both (each counted once).
len(merged_compounds | set(accepted_compounds))