In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np
import zipfile
import gzip
import sys
import os

# Notebook-wide pandas display options: no limit on the number of columns,
# at most 50 rows, no limit on cell text width, and no fixed line width.
for _option_name, _option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 50),
    ("display.max_colwidth", None),
    ("display.width", None),
):
    pd.set_option(_option_name, _option_value)

In [2]:
def load_expert_cutoffs(CONFIGPATH):
    """
    Load expert cutoffs from the manual curation CSV and return them as a dictionary.

    The CSV is expected at:
        {CONFIGPATH}/expert_cutoffs.csv

    Each ``expert_cutoff`` cell holds one or more numbers separated by ";"
    (for example "25;50;75"). The column is read as text so that a file in
    which every cell happens to contain a single number is handled exactly
    like a multi-valued one (pandas would otherwise infer a numeric column,
    and numbers cannot be split on ";").

    The returned dictionary maps:
        (activity_type, unit, target_type, pathogen_code) -> list of float cutoffs

    If the same key appears on several rows, the last row wins.

    Parameters
    ----------
    CONFIGPATH : str
        Path to the config folder.

    Returns
    -------
    dict
        Dictionary keyed by (activity_type, unit, target_type, pathogen_code)
        whose values are lists of floats, in the order written in the CSV cell.

    Raises
    ------
    ValueError
        If a ";"-separated token cannot be converted to float.
    """
    key_and_value_columns = [
        "activity_type", "unit", "target_type", "pathogen_code", "expert_cutoff"
    ]

    # Force the cutoff column to text: the values are ";"-separated lists,
    # not plain numbers, even when a cell contains only one value.
    cutoff_table = pd.read_csv(
        os.path.join(CONFIGPATH, "expert_cutoffs.csv"),
        dtype={"expert_cutoff": str},
    )

    return {
        (activity_type, unit, target_type, pathogen_code): [
            float(token) for token in cutoff_text.split(";")
        ]
        for activity_type, unit, target_type, pathogen_code, cutoff_text
        in cutoff_table[key_and_value_columns].to_numpy()
    }

def get_filtered_data(individual_LM_LABEL, assay_id, activity_type, unit):
    """
    Select the rows of a results table that belong to one assay/activity/unit.

    If `unit` is a string (including str subclasses such as ``numpy.str_``),
    rows whose `unit` equals it are kept. Otherwise rows whose `unit` is
    missing (NaN) are kept.

    Parameters
    ----------
    individual_LM_LABEL : pandas.DataFrame
        Table with at least the columns `assay_id`, `activity_type` and `unit`.
    assay_id : str
        Value to match in the `assay_id` column.
    activity_type : str
        Value to match in the `activity_type` column.
    unit : str or None
        Unit to match; any non-string value selects rows with a missing unit.

    Returns
    -------
    pandas.DataFrame
        Matching rows, all columns, with the index reset to 0..n-1.
    """
    unit_column = individual_LM_LABEL['unit']

    # isinstance (not `type(unit) == str`) so that str subclasses coming out of
    # numpy/pandas containers are matched by value instead of being treated
    # as "no unit".
    if isinstance(unit, str):
        unit_matches = unit_column == unit
    else:
        unit_matches = unit_column.isna()

    row_matches = (
        (individual_LM_LABEL['assay_id'] == assay_id)
        & (individual_LM_LABEL['activity_type'] == activity_type)
        & unit_matches
    )
    return individual_LM_LABEL[row_matches].reset_index(drop=True)

def load_data_from_zip(zip_path, filename):
    """Read one gzip-compressed CSV member of a ZIP archive into a DataFrame.

    Parameters
    ----------
    zip_path : str
        Location of the ZIP archive on disk.
    filename : str
        Name of the gzip-compressed CSV member inside the archive.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.
    """
    # Open the archive, then the member, then a text-mode gzip reader on top of
    # it; all three are closed when the block exits.
    with zipfile.ZipFile(zip_path) as archive, \
            archive.open(filename) as member, \
            gzip.open(member, mode="rt") as text_stream:
        return pd.read_csv(text_stream)

def get_assay_data(ChEMBL_pathogen, assay_chembl_id, activity_type, unit, cols):
    """
    Extract assay activity data for a given assay_chembl_id, activity_type, and unit.

    If `unit` is a string (including str subclasses such as ``numpy.str_``),
    the function keeps rows where `unit` matches exactly. Otherwise, it keeps
    rows where `unit` is missing (NaN).

    Parameters
    ----------
    ChEMBL_pathogen : pandas.DataFrame
        DataFrame containing ChEMBL pathogen activity records; must have the
        columns `assay_chembl_id`, `activity_type` and `unit`.
    assay_chembl_id : str
        Assay ChEMBL ID to filter on.
    activity_type : str
        Activity type to filter on (e.g., IC50, MIC).
    unit : str or None
        Unit to filter on; if not a string, NaN units are selected.
    cols : list
        List of columns to return.

    Returns
    -------
    pandas.DataFrame
        Filtered assay activity data with only the requested columns and a
        fresh 0..n-1 index.
    """
    unit_column = ChEMBL_pathogen['unit']

    # isinstance (not `type(unit) == str`) so that str subclasses coming out of
    # numpy/pandas containers are matched by value instead of being treated
    # as "no unit".
    if isinstance(unit, str):
        unit_matches = unit_column == unit
    else:
        unit_matches = unit_column.isna()

    row_matches = (
        (ChEMBL_pathogen['assay_chembl_id'] == assay_chembl_id)
        & (ChEMBL_pathogen['activity_type'] == activity_type)
        & unit_matches
    )

    ASSAY_DATA = ChEMBL_pathogen[row_matches].reset_index(drop=True)[cols]
    return ASSAY_DATA


In [3]:
# Define root directory
# root = os.path.dirname(os.path.abspath(__file__))
# root is hard-coded to the current directory, so every path built from it
# below is relative to the working directory the notebook is run from.
root = "."
# Make ../src importable so that the `default` module can be imported next.
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Load pathogen info
# pathogen_code = sys.argv[1]
pathogen_code = 'mtuberculosis'
df = pd.read_csv(os.path.join(CONFIGPATH, 'pathogens.csv'))
# Keep the row(s) of pathogens.csv whose "code" equals pathogen_code; stop if none.
row = df.loc[df["code"].eq(pathogen_code)]
if row.empty: 
    raise SystemExit(f"Unknown code: {pathogen_code}")
# Take the "pathogen" value from the first matching row.
pathogen = row.iloc[0]["pathogen"]

# Define output directory
OUTPUT = os.path.join(root, "..", "output")

# Load ChEMBL data for pathogen
ChEMBL_pathogen = pd.read_csv(os.path.join(OUTPUT, pathogen_code, f"{pathogen_code}_ChEMBL_cleaned_data.csv.gz"), low_memory=False)

# Get assay to index mapping:
# assay_chembl_id -> list of positional row numbers of that assay in ChEMBL_pathogen
assay_to_idx = defaultdict(list)
for i, assay_id in enumerate(ChEMBL_pathogen["assay_chembl_id"].to_numpy()):
    assay_to_idx[assay_id].append(i)

# Load expert cut-offs
# (dict keyed by (activity_type, unit, target_type, pathogen_code))
EXPERT_CUTOFFS = load_expert_cutoffs(CONFIGPATH)

# Load individual LM data
individual_LM = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "individual_LM.csv"))

In [4]:
# Display the individual_LM table read from individual_LM.csv
individual_LM

Unnamed: 0,assay_id,activity_type,unit,target_type,target_type_curated_extra,cpds,direction,dataset_type,expert_cutoff,pos_qt,ratio_qt,cpds_qt,pos_ql,ratio_ql,cpds_ql,label,avg,std
0,CHEMBL4649948,PERCENTEFFECT,%,UNCHECKED,ORGANISM,86589,1.0,quantitative,25.0,4405.0,0.05087,86589.0,,,,A,0.725,0.006
1,CHEMBL4649948,PERCENTEFFECT,%,UNCHECKED,ORGANISM,86589,1.0,quantitative,50.0,1268.0,0.01464,86589.0,,,,A,0.706,0.006
2,CHEMBL4649948,PERCENTEFFECT,%,UNCHECKED,ORGANISM,86589,1.0,quantitative,75.0,361.0,0.00417,86589.0,,,,A,0.649,0.027
3,CHEMBL4649949,PERCENTEFFECT,%,UNCHECKED,ORGANISM,86575,1.0,quantitative,25.0,10324.0,0.11925,86575.0,,,,A,0.697,0.005
4,CHEMBL4649949,PERCENTEFFECT,%,UNCHECKED,ORGANISM,86575,1.0,quantitative,50.0,2181.0,0.02519,86575.0,,,,A,0.740,0.009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,CHEMBL1738104,CC50,umol.L-1,UNCHECKED,ORGANISM,350,-1.0,qualitative,,,,350.0,350.0,1.00000,350.0,D,0.869,0.018
94,CHEMBL2114860,AC50,umol.L-1,SINGLE PROTEIN,SINGLE PROTEIN,298,-1.0,mixed,,,,298.0,298.0,1.00000,298.0,D,0.885,0.023
95,CHEMBL2354217,IC50,umol.L-1,NO TARGET,DISCARDED,174,-1.0,qualitative,,,,174.0,138.0,0.79310,174.0,D,0.706,0.045
96,CHEMBL2354305,IC50,umol.L-1,SINGLE PROTEIN,SINGLE PROTEIN,174,-1.0,mixed,,,,174.0,149.0,0.85632,174.0,D,0.754,0.030


In [None]:
# NOTE(review): the trailing [:1] restricts processing to label 'A' only;
# presumably a development shortcut. TODO confirm before a full run.
LABELS = ['A', 'B', 'C', 'D'][:1]
# Per-dataset statistics columns of individual_LM
COLS_TO_KEEP = ["dataset_type", "pos_qt", "ratio_qt", "cpds_qt", "pos_ql", "ratio_ql", "cpds_ql"]
# Columns whose combination identifies one assay entry
KEYS = ["assay_id", "activity_type", "unit", "target_type_curated_extra"]

# Accumulators: a list of selected rows and, per label, two sets
SELECTED = []
ORIGINAL_COMPOUNDS = {label: set() for label in LABELS}
SELECTED_COMPOUNDS = {label: set() for label in LABELS}

for LABEL in LABELS:

    # Filter assays considered in label
    individual_LM_LABEL = individual_LM[individual_LM['label'] == LABEL]

    # Unique KEYS combinations as plain tuples, in order of first appearance.
    # drop_duplicates keeps the iteration order stable across kernel sessions
    # (a set of tuples of strings does not) and treats missing values as equal.
    assays_LABEL = list(
        individual_LM_LABEL[KEYS].drop_duplicates().itertuples(index=False, name=None)
    )

    for assay in assays_LABEL:

        # Get assay info (the fourth field comes from target_type_curated_extra)
        assay_id, activity_type, unit, target_type = assay
        key = (assay_id, activity_type, unit)

        # # Get assay data
        # cols = ['compound_chembl_id', 'canonical_smiles', 'activity_type', 'value', 'relation', 'unit', 'text_flag']
        # assay_data = get_assay_data(tmp_df, assay_chembl_id, activity_type, unit, cols)

        # # If assay not considered previously
        # if key not in set([tuple([sel[1], sel[2], sel[3]]) for sel in SELECTED]):

#             if LABEL in ['A', 'C']:
#                 mid_cutoff = EXPERT_CUTOFFS[(activity_type, unit, target_type, pathogen_code)][1]
#             else:
#                 mid_cutoff = np.nan

#             # Filter results for that assay
#             df = get_filtered_data(individual_LM_LABEL, assay_id, activity_type, unit)

#             # Sort by average AUROC
#             df = df.sort_values(f"{LABEL}_AVG", ascending=False).reset_index(drop=True)

#             # Get best auroc and best cutoff
#             best_auroc = df[f"{LABEL}_AVG"].tolist()[0]
#             if LABEL in ['A', 'C']:
#                 best_cutoff = df["expert_cutoff"].tolist()[0]
#             else:
#                 best_cutoff = np.nan

#             # Get mid auroc (if available)
#             if LABEL in ['B', 'D'] or mid_cutoff not in set(df['expert_cutoff']):
#                 mid_auroc = np.nan
#             else:
#                 mid_auroc = df[df['expert_cutoff'] == mid_cutoff][f"{LABEL}_AVG"].tolist()[0]

#             # If the best dataset is modelable
#             if best_auroc > 0.7:

#                 # If difference is quite high, keep best
#                 if (np.isnan(mid_auroc)) or (best_auroc - mid_auroc > 0.1):
                
#                     INFO = df[COLS_TO_KEEP].values.tolist()[0]
#                     if LABEL in ['B', 'D']:
#                         INFO[1], INFO[2] = np.nan, np.nan
#                     SELECTED.append([LABEL, assay_id, activity_type, unit, best_cutoff, best_auroc] + INFO)
                
#                 # Otherwise, keep mid
#                 else:
                    
#                     INFO = df[df['expert_cutoff'] == mid_cutoff][COLS_TO_KEEP].values.tolist()[0]
#                     if LABEL in ['B', 'D']:
#                         INFO[1], INFO[2] = np.nan, np.nan
#                     SELECTED.append([LABEL, assay_id, activity_type, unit, mid_cutoff, mid_auroc] + INFO)

# # To pandas dataframe
# SELECTED = pd.DataFrame(SELECTED, columns=['label', 'assay_id', 'activity_type', 'unit', 'cutoff', 'AUROC', "dataset_type", 
#                                            "pos_qt", "ratio_qt", "cpds_qt", "pos_ql", "ratio_ql", "cpds_ql"])

# # Save 
# SELECTED.to_csv(os.path.join(OUTPUT, pathogen_code, "individual_selected_LM.csv"))

# # Check that only one dataset per assay is selected
# assert len(set([tuple(i) for i in SELECTED[["assay_id", "activity_type", "unit"]].values])) == len(SELECTED)

NameError: name 'tmp_df' is not defined

In [None]:
# (assay_id, activity_type, unit) triple to look up in the following cells
key = ("CHEMBL1614080", "IC90", "umol.L-1")

In [None]:
# Rows of individual_LM_LABEL whose assay_id, activity_type and unit equal `key`
individual_LM_LABEL[
    individual_LM_LABEL['assay_id'].eq(key[0])
    & individual_LM_LABEL['activity_type'].eq(key[1])
    & individual_LM_LABEL['unit'].eq(key[2])
]

In [None]:
# Rows of SELECTED whose assay_id, activity_type and unit equal `key`.
# NOTE(review): this needs SELECTED to be a DataFrame; in the visible notebook
# SELECTED is created as a list and its DataFrame conversion is commented out.
SELECTED[
    SELECTED['assay_id'].eq(key[0])
    & SELECTED['activity_type'].eq(key[1])
    & SELECTED['unit'].eq(key[2])
]

In [None]:
# Get all compounds for pathogen:
# the set of compound_chembl_id values found in compound_counts.csv.gz
compounds = set(
    pd.read_csv(
        os.path.join(OUTPUT, pathogen_code, "compound_counts.csv.gz")
    )['compound_chembl_id']
)

In [None]:
# Work-in-progress loop over the labels; it currently stops after the first one.
for LABEL in LABELS:

    if LABEL in ['A', 'C']:
        pass

    # TODO: unfinished. The original statement `assays = ` had no right-hand
    # side, which is a SyntaxError that prevents the whole cell from running.
    # It is commented out until the intended value is filled in.
    # assays = ...

    break