In [1]:
import pandas as pd
import numpy as np
import sys
import os

# Notebook display settings: no limit on the number of columns or on the
# width of a cell's text, at most 50 rows, and no fixed overall width.
_DISPLAY_OPTIONS = {
    "display.max_columns": None,
    "display.max_rows": 50,
    "display.max_colwidth": None,
    "display.width": None,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

In [2]:
# Project root, and make ../src importable so that `default` can be loaded.
# (When run as a script: root = os.path.dirname(os.path.abspath(__file__)))
root = "."
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Pathogen to process, resolved against the pathogen table in the config folder.
# (When run as a script: pathogen_code = sys.argv[1])
pathogen_code = 'mtuberculosis'
df = pd.read_csv(os.path.join(CONFIGPATH, 'pathogens.csv'))
row = df[df["code"] == pathogen_code]
if len(row) == 0:
    # Stop right away: nothing below can run without a known pathogen code
    raise SystemExit(f"Unknown code: {pathogen_code}")
pathogen = row["pathogen"].iloc[0]

# Folder holding the outputs, one level above the root
OUTPUT = os.path.join(root, "..", "output")

In [3]:
def load_expert_cutoffs(CONFIGPATH):
    """
    Load expert cutoffs from the expert cutoffs CSV and return them as a dictionary.

    The CSV is read from:
        {CONFIGPATH}/expert_cutoffs.csv

    and must contain the columns activity_type, unit, target_type,
    pathogen_code and expert_cutoff. Each expert_cutoff cell holds one or
    more numbers separated by ";" (for example "1;10;100"); a cell holding a
    single number is accepted as well.

    The returned dictionary maps:
        (activity_type, unit, target_type, pathogen_code) -> list of float cutoffs

    If the same key appears in several rows, the last row wins.

    Parameters
    ----------
    CONFIGPATH : str
        Path to the config folder.

    Returns
    -------
    dict
        Dictionary of expert cutoffs keyed by
        (activity_type, unit, target_type, pathogen_code); every value is a
        list of floats in the order given in the CSV.

    Raises
    ------
    ValueError
        If a row has no expert_cutoff value, or a cutoff token is not a number.
    """
    # Load expert cut-offs
    table = pd.read_csv(os.path.join(CONFIGPATH, "expert_cutoffs.csv"))

    key_columns = ["activity_type", "unit", "target_type", "pathogen_code"]

    cutoffs = {}
    for *key, raw_cutoff in table[key_columns + ["expert_cutoff"]].values:
        if pd.isna(raw_cutoff):
            raise ValueError(f"Missing expert_cutoff for {tuple(key)}")
        # pandas parses a column of single numeric cutoffs as numbers rather
        # than text, so go through str() before splitting on ";"
        cutoffs[tuple(key)] = [float(token) for token in str(raw_cutoff).split(";")]

    return cutoffs

In [4]:
# Labels to process, in order, and one empty result dictionary per label
LABELS = ["A", "B", "C", "D"]
ACCEPTED = {label: {} for label in LABELS}

# Columns that identify an assay, and columns copied into each accepted entry
KEYS = ["assay_id", "activity_type", "unit", "target_type_curated_extra"]
COLS_TO_KEEP = [
    "dataset_type",
    "pos_qt", "ratio_qt", "cpds_qt",
    "pos_ql", "ratio_ql", "cpds_ql",
]

# Per-pathogen results table read from the output folder
individual_LM = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "individual_LM.csv"))

# Load expert cut-offs
EXPERT_CUTOFFS = load_expert_cutoffs(CONFIGPATH)

In [6]:
# Select, for every assay, the expert cutoff to keep.
#
# Labels are processed in LABELS order, and an assay accepted under one label
# is not reconsidered under a later one. Each accepted entry is stored as
#   ACCEPTED[LABEL][(assay_id, activity_type, unit)] = [cutoff, auroc] + COLS_TO_KEEP values
#
# The loop runs over every label: a trailing `break` used to stop it after the
# first label, which left the other ACCEPTED dictionaries permanently empty.

# Minimum average AUROC for the best dataset to be considered modelable
MIN_AUROC = 0.7
# AUROC gain over the middle cutoff above which the best cutoff is preferred
MIN_AUROC_GAIN = 0.1

for LABEL in LABELS:

    # Filter assays considered in label (the LABEL column is used as a row mask)
    individual_LM_LABEL = individual_LM[individual_LM[LABEL]]
    assays_LABEL = set(tuple(i) for i in individual_LM_LABEL[KEYS].values)

    for assay in assays_LABEL:

        # Get assay info
        assay_id, activity_type, unit, target_type = assay
        key = (assay_id, activity_type, unit)

        # Skip assays already accepted under any label
        if any(key in ACCEPTED[label] for label in LABELS):
            continue

        # Define middle cutoff (second value of the expert cutoff list)
        mid_cutoff = EXPERT_CUTOFFS[(activity_type, unit, target_type, pathogen_code)][1]

        # Filter results for that assay; a non-string unit is treated as missing,
        # so it has to be matched with isna() rather than ==
        # NOTE(review): rows are not filtered by target type even though it is
        # part of KEYS — confirm that (assay_id, activity_type, unit) maps to a
        # single target type.
        same_assay = (individual_LM_LABEL['assay_id'] == assay_id) & (individual_LM_LABEL['activity_type'] == activity_type)
        if isinstance(unit, str):
            same_unit = individual_LM_LABEL['unit'] == unit
        else:
            same_unit = individual_LM_LABEL['unit'].isna()
        df = individual_LM_LABEL[same_assay & same_unit]

        # Sort by average AUROC, best first
        df = df.sort_values(f"{LABEL}_AVG", ascending=False).reset_index(drop=True)

        # Get best auroc and best cutoff
        best_auroc = df[f"{LABEL}_AVG"].tolist()[0]
        best_cutoff = df["expert_cutoff"].tolist()[0]

        # Get mid auroc (if available)
        mid_rows = df[df['expert_cutoff'] == mid_cutoff]
        if mid_rows.empty:
            mid_auroc = np.nan
        else:
            mid_auroc = mid_rows[f"{LABEL}_AVG"].tolist()[0]

        # If the best dataset is modelable
        if best_auroc > MIN_AUROC:

            # If there is no mid result, or the difference is quite high, keep best
            if np.isnan(mid_auroc) or (best_auroc - mid_auroc > MIN_AUROC_GAIN):

                INFO = df[COLS_TO_KEEP].values.tolist()[0]
                ACCEPTED[LABEL][key] = [best_cutoff, best_auroc] + INFO

            # Otherwise, keep mid
            else:

                INFO = mid_rows[COLS_TO_KEEP].values.tolist()[0]
                ACCEPTED[LABEL][key] = [mid_cutoff, mid_auroc] + INFO