In [1]:
import pandas as pd
import sys
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from IPython.display import display, HTML
from scipy.stats import spearmanr
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import zipfile
import random
import gzip
import sys
import h5py
import os

pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.width", 2000)
pd.set_option("display.max_colwidth", None)

display(HTML("""
<style>
.dataframe td, .dataframe th {
    white-space: nowrap !important;
}
</style>
"""))

In [2]:
import networkx as nx

In [3]:
# Define root directory
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Load pathogen info
# pathogen_code = sys.argv[1]
pathogen_code = 'mtuberculosis'
df = pd.read_csv(os.path.join(CONFIGPATH, 'pathogens.csv'))
row = df.loc[df["code"].eq(pathogen_code)]
if row.empty: 
    raise SystemExit(f"Unknown code: {pathogen_code}")
pathogen = row.iloc[0]["pathogen"]

# Create output directory
OUTPUT = os.path.join(root, "..", "output")

# Shared columns
KEYS = ["assay_id", "activity_type", "unit"]

# Columns to take from datasets table
COLUMNS_DATASETS = ["equal", 'higher', 'lower', "target_type_curated_extra", "dataset_type", "cpds_qt", "min_", "p1", "p25", "p50", "p75", "p99", "max_", "pos_ql", "ratio_ql", "cpds_ql"]

In [4]:
# Load assays info
ASSAYS_CLEANED = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_cleaned.csv"))
# ASSAYS_CLUSTERS = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_clusters.csv"))
ASSAYS_PARAMETERS = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_parameters.csv"))
ASSAYS_DATASETS_ = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_datasets.csv"))
INDIVIDUAL_LM = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "individual_LM.csv"))

# Get assay to quantitative data info
assay_to_qt_info = defaultdict(list)
for assay_id, activity_type, unit, expert_cutoff, ratio_qt in ASSAYS_DATASETS_[['assay_id', 'activity_type', 'unit', 'expert_cutoff', 'ratio_qt']].values:
    assay_to_qt_info[tuple([assay_id, activity_type, unit])].append([expert_cutoff, ratio_qt])

# Unique row per assay
ASSAYS_DATASETS = ASSAYS_DATASETS_[KEYS + COLUMNS_DATASETS].drop_duplicates().reset_index(drop=True)

# Get cutoffs and ratios
cutoffs = [";".join([str(j[0]) for j in assay_to_qt_info[tuple(i)]]) for i in ASSAYS_DATASETS[['assay_id', 'activity_type', 'unit']].values]
ratios = [";".join([str(j[1]) for j in assay_to_qt_info[tuple(i)]]) for i in ASSAYS_DATASETS[['assay_id', 'activity_type', 'unit']].values]
cutoffs = [i if i != 'nan' else np.nan for i in cutoffs]
ratios = [i if i != 'nan' else np.nan for i in ratios]

# Store results
ASSAYS_DATASETS.insert(8, 'cutoffs', cutoffs)
ASSAYS_DATASETS.insert(9, 'ratios', ratios)

# Merge everything
ASSAYS_MASTER = ASSAYS_CLEANED.merge(ASSAYS_PARAMETERS,on=KEYS, how="left", validate="1:1")
ASSAYS_MASTER = ASSAYS_MASTER.merge(ASSAYS_DATASETS,on=KEYS, how="left", validate="1:1")

In [5]:
# Dict mapping assay_id, activity_type and unit to a set of compound ChEMBL IDs
ChEMBL = pd.read_csv(os.path.join(OUTPUT, pathogen_code, f"{pathogen_code}_ChEMBL_cleaned_data.csv.gz"), low_memory=False)
ASSAY_TO_COMPOUNDS = defaultdict(set)
for assay_id, activity_type, unit, compound_chembl_id in ChEMBL[["assay_chembl_id", "activity_type", "unit", "compound_chembl_id"]].values:
    ASSAY_TO_COMPOUNDS[(assay_id, activity_type, unit)].add(compound_chembl_id)
del ChEMBL

In [6]:
def load_all_gz_csvs_from_zip(zip_path: str) -> dict[str, pd.DataFrame]:
    dfs = {}
    with zipfile.ZipFile(zip_path, "r") as z:
        for name in z.namelist():
            if name.endswith(".csv.gz"):
                with z.open(name) as f:
                    dfs[name] = pd.read_csv(f, compression="gzip")
    return dfs

datasets_dir = os.path.join(OUTPUT, pathogen_code, "datasets")

qt_zip = os.path.join(datasets_dir, "datasets_qt.zip")
ql_zip = os.path.join(datasets_dir, "datasets_ql.zip")

dfs_qt = load_all_gz_csvs_from_zip(qt_zip)
dfs_ql = load_all_gz_csvs_from_zip(ql_zip)

print("Loaded qt:", len(dfs_qt), "files")
print("Loaded ql:", len(dfs_ql), "files")

Loaded qt: 26370 files
Loaded ql: 1536 files


In [7]:
len(set([j for i in ASSAY_TO_COMPOUNDS for j in ASSAY_TO_COMPOUNDS[i]]))

137607

In [8]:
len(ASSAYS_CLEANED), len(ASSAYS_PARAMETERS), len(ASSAYS_DATASETS)

(10532, 10532, 10532)

In [9]:
ALL_COLS = ["assay_id", "assay_type", "assay_organism", "target_organism", "organism_curated", "doc_chembl_id", "target_type", "target_type_curated", "target_type_curated_extra", 
          "target_chembl_id", "target_chembl_id_curated", "target_name_curated", "bao_label", "source_label", "strain", "atcc_id", "mutations", "known_drug_resistances", "media",
          "activity_type", "unit", "activities", "nan_values", "cpds", "frac_cs", "direction", "act_flag", 'inact_flag', "equal", "higher", "lower", "dataset_type", "cutoffs", "ratios", 
          "cpds_qt", "pos_ql", "ratio_ql", "cpds_ql", "min_", "p1", "p25", "p50", "p75", "p99", "max_", 'Accepted', 'Considered']

In [10]:
def get_all_results_from_individual_modeling(LABELS):
    RESULTS, CONSIDERED_ASSAYS = {}, {}
    for LABEL in LABELS:
        RESULTS[LABEL] = {}
        CONSIDERED_ASSAYS[LABEL] = set()
        rows = INDIVIDUAL_LM[INDIVIDUAL_LM[LABEL]][["assay_id", "activity_type", "unit", "expert_cutoff", f"{LABEL}_AVG"]].values
        for assay_id, activity_type, unit, expert_cutoff, auroc in rows:
            key = (assay_id, activity_type, unit)
            CONSIDERED_ASSAYS[LABEL].add(key)
            if auroc > 0.7:
                if key not in RESULTS[LABEL]:
                    RESULTS[LABEL][key] = [expert_cutoff, auroc]
                elif auroc > RESULTS[LABEL][key][1]:
                    RESULTS[LABEL][key] = [expert_cutoff, auroc]
    return RESULTS, CONSIDERED_ASSAYS

def where_considered(key, LABELS, CONSIDERED_ASSAYS):
    considered = []
    for LABEL in LABELS:
        if key in CONSIDERED_ASSAYS[LABEL]:
            considered.append(LABEL)
    if len(considered) > 0:
        return ";".join(considered)
    else:
        return np.nan
    
def where_accepted(key, LABELS, ACCEPTED_ASSAYS):
    accepted = []
    for LABEL in LABELS:
        if key in ACCEPTED_ASSAYS[LABEL]:
            accepted.append(LABEL)
    if len(accepted) > 0:
        return ";".join(accepted)
    else:
        return np.nan

# Get results from individual modeling ABCD
LABELS = ['A', 'B', 'C', 'D']
ACCEPTED_ASSAYS, CONSIDERED_ASSAYS = get_all_results_from_individual_modeling(LABELS)

col_accepted, col_considered = [], []
for assay_id, activity_type, unit in ASSAYS_MASTER[["assay_id", "activity_type", "unit"]].values:
    # Get strategies in which this assay is considered and accepted
    key = tuple([assay_id, activity_type, unit])
    col_considered.append(where_considered(key, LABELS, CONSIDERED_ASSAYS))
    col_accepted.append(where_accepted(key, LABELS, ACCEPTED_ASSAYS))
ASSAYS_MASTER['Accepted'] = col_accepted
ASSAYS_MASTER['Considered'] = col_considered
ASSAYS_MASTER = ASSAYS_MASTER[ALL_COLS]

In [11]:
accepted_assays = ASSAYS_MASTER[(ASSAYS_MASTER['Accepted'].isna() == False)][['assay_id', 'activity_type', 'unit']].values
accepted_compounds = set([j for i in accepted_assays for j in ASSAY_TO_COMPOUNDS[tuple(i)]])

In [114]:
len(accepted_compounds)

97590

In [12]:
FILTERED_ASSAYS_MASTER = ASSAYS_MASTER[(ASSAYS_MASTER['Considered'].isna()) & (ASSAYS_MASTER['target_type_curated_extra'] == 'ORGANISM')].copy()

In [16]:
keys = ["activity_type", "unit", "target_type_curated_extra", "bao_label", "strain"]

cpds_grouped = (FILTERED_ASSAYS_MASTER
    .groupby(keys, dropna=False)
    .agg(n_cpds_red=("cpds", "sum"), n_assays=("assay_id", "size"))
    .reset_index()
    .sort_values("n_cpds_red", ascending=False)
)

cpds_grouped["cum_prop"] = cpds_grouped["n_cpds_red"].cumsum() / cpds_grouped["n_cpds_red"].sum()
cpds_grouped = cpds_grouped[cpds_grouped['n_cpds_red'] >= 1000]
cpds_grouped

Unnamed: 0,activity_type,unit,target_type_curated_extra,bao_label,strain,n_cpds_red,n_assays,cum_prop
655,MIC,umol.L-1,ORGANISM,organism-based format,H37Rv,25055,1948,0.321717
1209,MIC,umol.L-1,ORGANISM,organism-based format,,6228,917,0.401687
1329,MIC90,umol.L-1,ORGANISM,organism-based format,H37Rv,4766,284,0.462885
266,INHIBITION,%,ORGANISM,organism-based format,H37Rv,2857,314,0.49957
1420,MIC99,umol.L-1,ORGANISM,organism-based format,H37Rv,2510,134,0.531799
651,MIC,umol.L-1,ORGANISM,organism-based format,H37Ra,2323,218,0.561628
1383,MIC90,umol.L-1,ORGANISM,organism-based format,,1813,157,0.584907
164,GI,%,ORGANISM,organism-based format,H37Rv,1629,178,0.605824
221,IC50,umol.L-1,ORGANISM,organism-based format,H37Rv,1286,115,0.622337
284,INHIBITION,%,ORGANISM,organism-based format,,1269,67,0.638632


In [105]:
def get_filtered_assay_master(activity_type, unit, target_type_curated_extra, bao_label, strain):
    if type(unit) == str:
        df = FILTERED_ASSAYS_MASTER[(FILTERED_ASSAYS_MASTER['activity_type'] == activity_type) & 
                    (FILTERED_ASSAYS_MASTER['unit'] == unit) &
                    (FILTERED_ASSAYS_MASTER['target_type_curated_extra'] == target_type_curated_extra) &
                    (FILTERED_ASSAYS_MASTER['bao_label'] == bao_label) &
                    (FILTERED_ASSAYS_MASTER['strain'] == strain)]
    else:
        df = FILTERED_ASSAYS_MASTER[(FILTERED_ASSAYS_MASTER['activity_type'] == activity_type) & 
                    (FILTERED_ASSAYS_MASTER['unit'].isna()) &
                    (FILTERED_ASSAYS_MASTER['target_type_curated_extra'] == target_type_curated_extra) &
                    (FILTERED_ASSAYS_MASTER['bao_label'] == bao_label) &
                    (FILTERED_ASSAYS_MASTER['strain'] == strain)]
    return df


def load_expert_cutoffs(CONFIGPATH):
    """
    Load expert cutoffs from the manual curation CSV and return them as a dictionary.

    The CSV is expected at:
        {CONFIGPATH}/manual_curation/expert_cutoffs.csv

    The returned dictionary maps:
        (activity_type, unit, target_type, pathogen_code) -> expert_cutoff

    Parameters
    ----------
    CONFIGPATH : str
        Path to the config folder.

    Returns
    -------
    dict
        Dictionary of expert cutoffs keyed by
        (activity_type, unit, target_type, pathogen_code).
    """
    # Load expert cut-offs
    EXPERT_CUTOFFS = pd.read_csv(os.path.join(CONFIGPATH, "expert_cutoffs.csv"))

    EXPERT_CUTOFFS = {
        (a, b, c, d): [float(k) for k in e.split(";")]
        for a, b, c, d, e in EXPERT_CUTOFFS[
            ["activity_type", "unit", "target_type", "pathogen_code", "expert_cutoff"]
        ].values
    }

    return EXPERT_CUTOFFS

def load_ecfp_all(h5_path):
    """Load all ECFP (Morgan count) fingerprints.

    Parameters
    ----------
    h5_path : str
        Path to the HDF5 file containing datasets "SMILES" and "X_morgan".

    Returns
    -------
    dict[str, np.ndarray]
        Mapping {chembl_id: fingerprint (np.int8, shape (nBits,))}.
    """
    with h5py.File(h5_path, "r") as f:
        meta = f["SMILES"][:, 3].astype(str)
        fps  = f["X_morgan"][:]  # Load ALL

    return {cid: fp for cid, fp in zip(meta, fps)}

def KFoldTrain(X, Y, n_splits=4, n_estimators=100, random_state=42):
    """Stratified K-fold training/eval with RandomForest; returns mean AUROC and std.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix (n_samples, n_features).
    Y : np.ndarray
        Binary labels (n_samples,).
    n_splits : int
        Number of folds.
    n_estimators : int
        Number of trees in the random forest.
    random_state : int
        RNG seed (also used for fold shuffling).

    Returns
    -------
    tuple[float, float]
        (mean_auroc, std_auroc) rounded to 3 decimals.
    """
    def init_RF():
        return RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            n_jobs=8,
            random_state=random_state,
        )

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    aurocs = []

    for train_idx, test_idx in skf.split(X, Y):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        rf = init_RF()
        rf.fit(X_train, Y_train)
        y_prob = rf.predict_proba(X_test)[:, 1]
        aurocs.append(roc_auc_score(Y_test, y_prob))

    return round(float(np.mean(aurocs)), 3), round(float(np.std(aurocs)), 3)

def TrainRF(X, Y, n_estimators=100):
    """Train a RandomForestClassifier on all provided data and return the fitted model.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix (n_samples, n_features).
    Y : np.ndarray
        Labels (n_samples,).

    Returns
    -------
    RandomForestClassifier
        Fitted classifier.
    """
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features="sqrt",
        n_jobs=8,
    )
    rf.fit(X, Y)
    return rf


In [104]:
# Load expert cut-offs
EXPERT_CUTOFFS = load_expert_cutoffs(CONFIGPATH)

# Loading Morgan fingerprints
PATH_TO_ECFPs = os.path.join(DATAPATH, "chembl_processed", "ChEMBL_ECFPs.h5")
ecfps = load_ecfp_all(PATH_TO_ECFPs)

In [136]:
merged_compounds = []

for merging in cpds_grouped.itertuples():

    # Get data
    activity_type = merging.activity_type
    unit = merging.unit
    target_type_curated_extra = merging.target_type_curated_extra
    bao_label = merging.bao_label
    strain = merging.strain

    # Filter master table
    df = get_filtered_assay_master(activity_type, unit, target_type_curated_extra, bao_label, strain)

    # Get quantitative and qualitative
    df_quant = df[(df['dataset_type'] == 'quantitative') | (df['dataset_type'] == 'mixed')].reset_index(drop=True)
    df_qual = df[(df['dataset_type'] == 'qualitative') | (df['dataset_type'] == 'mixed')].reset_index(drop=True)

    # Quantitative
    if len(df_quant) > 0 and sum(df_quant['cpds']) > 1000:

        # For each expert cut-off
        for expert_cutoff in EXPERT_CUTOFFS[(activity_type, unit, target_type_curated_extra, pathogen_code)]:
            print(expert_cutoff)
            assays = df_quant['assay_id'].tolist()
            files = [f"{i}_{activity_type}_{unit}_qt_{expert_cutoff}.csv.gz" for i in assays]
            data = [dfs_qt[f].assign(assay_id=a) for a, f in zip(assays, files)]
            data = pd.concat(data, ignore_index=True)
            data = data.sort_values("value", ascending=True).drop_duplicates("compound_chembl_id", keep="first").reset_index(drop=True)
            if len(data) > 1000:
                X = np.array(data['compound_chembl_id'].map(ecfps).to_list())
                Y = np.array(data['bin'].tolist())
                if sum(Y) > 50 and sum(Y) / len(Y):
                    print(f"Merging ... Activity type: {activity_type}, Unit: {unit}, Cutoff: {expert_cutoff}")
                    print(f"\tCompounds: {len(X)}", f"Positives: {sum(Y)} ({round(100 * sum(Y) / len(Y), 1)}%)")
                    # 4Fold Cros Validation
                    average_auroc, stds = KFoldTrain(X, Y, n_splits=5, n_estimators=100)
                    print(f"\tMean AUROC: {average_auroc} ± {stds}")
                    if average_auroc > 0.7:
                        merged_compounds.extend(data['compound_chembl_id'].tolist())

                    # # If performance is good enough, train on full data and predict on reference set
                    # if average_auroc > 0.7:
                    #     RF = TrainRF(X, Y, n_estimators=100)
                    #     y_prob_ref = RF.predict_proba(X_REF)[:, 1]
                    #     os.makedirs(os.path.join(PATH_TO_CORRELATIONS, LABEL), exist_ok=True)
                    #     np.savez_compressed(os.path.join(PATH_TO_CORRELATIONS, LABEL, filename.replace(".csv.gz", "_ref_probs.npz")), y_prob_ref=y_prob_ref)


    if len(df_qual) > 0 and sum(df_qual['cpds']) > 1000:

        # Get assays, files and data
        assays = df_qual['assay_id'].tolist()
        files = [f"{i}_{activity_type}_{unit}_ql.csv.gz" for i in assays]
        data = [dfs_ql[file] for file in files]
        data = pd.concat(data, ignore_index=True)
        data = data.sort_values("bin").drop_duplicates("compound_chembl_id", keep="first").reset_index(drop=True)
        if len(data) > 1000:
            ...
        else:
            print(f"Too few data for {activity_type}, {unit}, {target_type_curated_extra}, {bao_label}, {strain}... ({sum(df['cpds'])} --> {len(data)} compounds) after merging")

5.0
Merging ... Activity type: MIC, Unit: umol.L-1, Cutoff: 5.0
	Compounds: 18155 Positives: 5000 (27.5%)
	Mean AUROC: 0.898 ± 0.004
10.0
Merging ... Activity type: MIC, Unit: umol.L-1, Cutoff: 10.0
	Compounds: 18155 Positives: 6738 (37.1%)
	Mean AUROC: 0.889 ± 0.007
20.0
Merging ... Activity type: MIC, Unit: umol.L-1, Cutoff: 20.0
	Compounds: 18155 Positives: 8915 (49.1%)
	Mean AUROC: 0.89 ± 0.004
5.0
Merging ... Activity type: MIC90, Unit: umol.L-1, Cutoff: 5.0
	Compounds: 3216 Positives: 1391 (43.3%)
	Mean AUROC: 0.929 ± 0.009
10.0
Merging ... Activity type: MIC90, Unit: umol.L-1, Cutoff: 10.0
	Compounds: 3216 Positives: 1625 (50.5%)
	Mean AUROC: 0.938 ± 0.008
20.0
Merging ... Activity type: MIC90, Unit: umol.L-1, Cutoff: 20.0
	Compounds: 3216 Positives: 1943 (60.4%)
	Mean AUROC: 0.937 ± 0.007
25.0
Merging ... Activity type: INHIBITION, Unit: %, Cutoff: 25.0
	Compounds: 2176 Positives: 1620 (74.4%)
	Mean AUROC: 0.863 ± 0.017
50.0
Merging ... Activity type: INHIBITION, Unit: %, Cutof

In [137]:
merged_compounds = set(merged_compounds)
len(merged_compounds)

26134

In [138]:
len([i for i in merged_compounds if i not in accepted_compounds])

25903

In [144]:
len(merged_compounds.union(accepted_compounds))

123493

In [146]:
123493/ 137607

0.8974325434025885

In [None]:
cols = ["strain", "atcc_id", "media", "mutations", "known_drug_resistances"]
df[cols].isna().mean().sort_values(ascending=False)

In [None]:
strain_weight = 0.5
atcc_id_weight = 0.2
media_weight = 0.1
mutations_weight = 0.1
known_drug_resistances_weight = 0.1

# Normalize fields
df["strain"]  = df["strain"].fillna("").astype(str).str.strip().str.lower()
df["atcc_id"] = df["atcc_id"].fillna("").astype(str).str.strip().str.upper()
df["media"] = df["media"].fillna("").astype(str).str.strip().str.upper()
df["mutations"] = df["mutations"].fillna("").astype(str).str.strip().str.upper()
df["known_drug_resistances"] = df["known_drug_resistances"].fillna("").astype(str).str.strip().str.upper()

# Map node to info
df["_node"] = list(zip(df["assay_id"], df["activity_type"], df["unit"]))
node_to_atcc = dict(zip(df["_node"], df["atcc_id"]))
node_to_media = dict(zip(df["_node"], df["media"]))
node_to_mutations = dict(zip(df["_node"], df["mutations"]))
node_to_known_drug_resistances = dict(zip(df["_node"], df["known_drug_resistances"]))

def to_set(s):
    s = "" if pd.isna(s) else str(s).strip().upper()
    if s == "":
        return set()
    return {x.strip() for x in s.split(";") if x.strip()}

node_to_mutations = {n: to_set(m) for n, m in zip(df["_node"], df["mutations"])}
node_to_known_drug_resistances = {n: to_set(m) for n, m in zip(df["_node"], df["known_drug_resistances"])}

G = nx.Graph()

# Add nodes (assay_id, activity_type, unit)
for r in df.itertuples(index=False):
    G.add_node((r.assay_id, r.activity_type, r.unit))

# Add edges: connect assays with the same strain
for strain, sub in df[df["strain"] != ""].groupby("strain"):

    # Unique nodes in this strain
    nodes = list({(r.assay_id, r.activity_type, r.unit) for r in sub.itertuples(index=False)})

    # Connect all pairs having the same strain
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            G.add_edge(nodes[i], nodes[j], weight_strain=strain_weight)

print("Nodes:", G.number_of_nodes(), "Same strain edges:", G.number_of_edges())

# ATCC ID
n_with_atcc_id = (df["atcc_id"] != "").sum()
print("Nodes with ATCC ID:", n_with_atcc_id, "from", len(df), f"({100*n_with_atcc_id/len(df):.2f}%)")
boosted_atcc = 0
for u, v, data in G.edges(data=True):
    a = node_to_atcc[u]
    b = node_to_atcc[v]
    if a != "" and b != "" and a == b:
        data["weight_atcc_id"] = atcc_id_weight
        boosted_atcc += 1
    elif (a == "" and b != "") or (a != "" and b == ""):
        data["weight_atcc_id"] = atcc_id_weight / 2
        boosted_atcc += 1
    else:
        data["weight_atcc_id"] = 0
print(f"ATCC-boosted edges: {boosted_atcc} from {G.number_of_edges()} ({100*round(boosted_atcc/G.number_of_edges(), 4)}%)")

# MEDIA
n_with_media = (df["media"] != "").sum()
print("Nodes with media:", n_with_media, "from", len(df), f"({100*n_with_media/len(df):.2f}%)")
boosted_media = 0
for u, v, data in G.edges(data=True):
    a = node_to_media[u]
    b = node_to_media[v]
    if a != "" and b != "" and a == b:
        data["weight_media"] = media_weight
        boosted_media += 1
    # elif (a == "" and b != "") or (a != "" and b == ""):
    #     data["weight_media"] = media_weight / 2
    #     boosted_media += 1
    else:
        data["weight_media"] = 0
print(f"MEDIA-boosted edges: {boosted_media} from {G.number_of_edges()} ({100*boosted_media/G.number_of_edges():.2f}%)")


boosted_mut = 0
for u, v, data in G.edges(data=True):
    a = node_to_mutations[u]
    b = node_to_mutations[v]

    if len(a) > 0 and len(b) > 0:
        sim = len(a & b) / len(a | b)
        data["weight_mutations"] = mutations_weight * sim
        boosted_mut += 1 if sim > 0 else 0
    # elif len(a) == 0 and len(b) == 0:
    #     data["weight_mutations"] = mutations_weight / 2
    #     boosted_mut += 1
    else:
        data["weight_mutations"] = 0

print(f"MUTATIONS-boosted edges: {boosted_mut} from {G.number_of_edges()} ({100*boosted_mut/G.number_of_edges():.2f}%)")

boosted_kdr = 0
for u, v, data in G.edges(data=True):
    a = node_to_known_drug_resistances[u]
    b = node_to_known_drug_resistances[v]

    if len(a) > 0 and len(b) > 0:
        sim = len(a & b) / len(a | b) 
        data["weight_known_drug_resistances"] = known_drug_resistances_weight * sim
        boosted_kdr += 1 if sim > 0 else 0
    else:
        data["weight_known_drug_resistances"] = 0.0

print(f"KDR-boosted edges: {boosted_kdr} from {G.number_of_edges()} ({100*boosted_kdr/G.number_of_edges():.2f}%)")

In [None]:
# compute final weight for every edge
for _, _, data in G.edges(data=True):
    data["weight"] = (
        data["weight_strain"] +
        data["weight_atcc_id"] +
        data["weight_media"] +
        data["weight_mutations"] +
        data["weight_known_drug_resistances"]
    )


In [None]:
THR = 0.7

G_thr = nx.Graph((u, v, d) for u, v, d in G.edges(data=True) if d.get("weight", 0.0) >= THR)
components = [c for c in nx.connected_components(G_thr) if len(c) > 1]
components = sorted(components, key=len, reverse=True)

print("n_components:", len(components))
print("top sizes:", [len(c) for c in components])

In [None]:
def nunique_nonempty(x):
    x = pd.Series(x).dropna()
    x = x[x != ""]
    return x.nunique()

def is_consistent(group_df):
    return (
        nunique_nonempty(group_df["atcc_id"]) <= 1 and
        nunique_nonempty(group_df["media"])   <= 1
    )

In [None]:
consistent_groups = []
inconsistent_groups = []

for comp in components:
    sub = df[df["_node"].isin(comp)].copy()
    if is_consistent(sub):
        consistent_groups.append(sub)
    else:
        inconsistent_groups.append(sub)

print("consistent groups:", len(consistent_groups))
print("inconsistent groups:", len(inconsistent_groups))
print("top consistent sizes:", sorted([len(g) for g in consistent_groups], reverse=True)[:10])

In [None]:
# pick the biggest consistent group
g = consistent_groups[0]
assay_ids = g["assay_id"].unique()
print("group size:", len(g), "assays:", len(assay_ids))
print("ATCC non-empty uniques:", g.loc[g["atcc_id"]!="","atcc_id"].nunique())
print("media non-empty uniques:", g.loc[g["media"]!="","media"].nunique())


In [None]:
g