In [1]:
import pandas as pd
import sys
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from IPython.display import display, HTML
from scipy.stats import spearmanr
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import zipfile
import random
import gzip
import sys
import h5py
import os

# Pandas display settings: never truncate columns or cell contents, and keep
# wide frames on a single (very long) line instead of wrapping them.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
    ("display.width", 2000),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

# CSS injected into the notebook so rendered DataFrame cells do not wrap.
NOWRAP_TABLE_CSS = """
<style>
.dataframe td, .dataframe th {
    white-space: nowrap !important;
}
</style>
"""
display(HTML(NOWRAP_TABLE_CSS))


In [2]:
import networkx as nx

In [3]:
# Root directory of the notebook; ../src is appended to sys.path so that the
# project-local `default` module can be imported.
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Look up the pathogen name for the selected code (hard-coded in the notebook;
# the commented line shows the command-line variant).
# pathogen_code = sys.argv[1]
pathogen_code = 'mtuberculosis'
df = pd.read_csv(os.path.join(CONFIGPATH, 'pathogens.csv'))
row = df[df["code"] == pathogen_code]
if len(row) == 0:
    raise SystemExit(f"Unknown code: {pathogen_code}")
pathogen = row["pathogen"].iloc[0]

# Output directory path (only the path is built here; nothing is created)
OUTPUT = os.path.join(root, "..", "output")

# Columns shared by every per-assay table; used as the merge key
KEYS = ["assay_id", "activity_type", "unit"]

# Columns to take from each table
COLUMNS_CLEANED = [
    "assay_id", "assay_type", "assay_organism", "doc_chembl_id",
    "target_type", "target_chembl_id", "target_organism",
    "activity_type", "unit", "canonical_unit",
    "activities", "nan_values", "cpds", "direction",
    "activity_comment_counts", "standard_text_count",
]
# COLUMNS_CLUSTERS = ['clusters_0.3', 'clusters_0.6', 'clusters_0.85']
COLUMNS_DATASETS = [
    "equal", 'higher', 'lower',
    "target_type_curated_extra", "dataset_type", "cpds_qt",
    "min_", "p1", "p25", "p50", "p75", "p99", "max_",
    "pos_ql", "ratio_ql", "cpds_ql",
]

In [4]:
# Load the per-pathogen assay tables from the output directory
pathogen_dir = os.path.join(OUTPUT, pathogen_code)
ASSAYS_CLEANED = pd.read_csv(os.path.join(pathogen_dir, "assays_cleaned.csv"))
# ASSAYS_CLUSTERS = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_clusters.csv"))
ASSAYS_PARAMETERS = pd.read_csv(os.path.join(pathogen_dir, "assays_parameters.csv"))
ASSAYS_DATASETS_ = pd.read_csv(os.path.join(pathogen_dir, "assays_datasets.csv"))
INDIVIDUAL_LM = pd.read_csv(os.path.join(pathogen_dir, "individual_LM.csv"))

# Map each (assay_id, activity_type, unit) key to its list of
# [expert_cutoff, ratio_qt] pairs, in file order
assay_to_qt_info = defaultdict(list)
qt_columns = ['assay_id', 'activity_type', 'unit', 'expert_cutoff', 'ratio_qt']
for *key_parts, expert_cutoff, ratio_qt in ASSAYS_DATASETS_[qt_columns].values:
    assay_to_qt_info[tuple(key_parts)].append([expert_cutoff, ratio_qt])

# Collapse to one row per distinct combination of key + dataset columns
ASSAYS_DATASETS = (
    ASSAYS_DATASETS_[KEYS + COLUMNS_DATASETS]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Build ";"-joined cutoff and ratio strings per row; a string that is exactly
# 'nan' (a single missing value) is stored as a real NaN instead.
cutoffs, ratios = [], []
for row_key in ASSAYS_DATASETS[['assay_id', 'activity_type', 'unit']].values:
    qt_pairs = assay_to_qt_info[tuple(row_key)]
    cutoff_text = ";".join(str(pair[0]) for pair in qt_pairs)
    ratio_text = ";".join(str(pair[1]) for pair in qt_pairs)
    cutoffs.append(np.nan if cutoff_text == 'nan' else cutoff_text)
    ratios.append(np.nan if ratio_text == 'nan' else ratio_text)

# Positions 8 and 9 come right after "dataset_type" in KEYS + COLUMNS_DATASETS
ASSAYS_DATASETS.insert(8, 'cutoffs', cutoffs)
ASSAYS_DATASETS.insert(9, 'ratios', ratios)

# Left-join parameters and datasets onto the cleaned assays; validate="1:1"
# makes pandas raise if the key is duplicated on either side.
ASSAYS_MASTER = ASSAYS_CLEANED.merge(
    ASSAYS_PARAMETERS, on=KEYS, how="left", validate="1:1"
)
ASSAYS_MASTER = ASSAYS_MASTER.merge(
    ASSAYS_DATASETS, on=KEYS, how="left", validate="1:1"
)

In [5]:
# Row counts of the three source tables (cleaned, parameters, datasets)
tuple(len(table) for table in (ASSAYS_CLEANED, ASSAYS_PARAMETERS, ASSAYS_DATASETS))

(10532, 10532, 10532)

In [6]:
# Final column order of the master table, grouped by theme.
ALL_COLS = [
    # assay / organism / document identification
    "assay_id", "assay_type", "assay_organism", "target_organism", "organism_curated", "doc_chembl_id",
    # target annotation
    "target_type", "target_type_curated", "target_type_curated_extra",
    "target_chembl_id", "target_chembl_id_curated", "target_name_curated",
    # assay context
    "bao_label", "source_label", "strain", "atcc_id", "mutations", "known_drug_resistances", "media",
    # activity description and counts
    "activity_type", "unit", "activities", "nan_values", "cpds", "frac_cs", "direction",
    "act_flag", 'inact_flag', "equal", "higher", "lower",
    # dataset summary
    "dataset_type", "cutoffs", "ratios", "cpds_qt", "pos_ql", "ratio_ql", "cpds_ql",
    # value distribution
    "min_", "p1", "p25", "p50", "p75", "p99", "max_",
    # columns added from the individual-modeling results
    'Accepted', 'Considered',
]

In [7]:
def get_all_results_from_individual_modeling(LABELS, min_auroc=0.7, individual_lm=None):
    """Collect, per label, the assays that were modeled and those that passed.

    For every label in ``LABELS`` the rows of the individual-modeling table
    whose ``<LABEL>`` column is true are scanned (the column is used directly
    as a boolean mask, so it is assumed to be boolean without missing values).

    Parameters
    ----------
    LABELS : iterable of str
        Label names; each needs a ``<LABEL>`` column and a ``<LABEL>_AVG``
        score column in the table.
    min_auroc : float, default 0.7
        A row only enters the results when its ``<LABEL>_AVG`` score is
        strictly greater than this value.
    individual_lm : pandas.DataFrame, optional
        Table to read. Defaults to the module-level ``INDIVIDUAL_LM``.

    Returns
    -------
    (RESULTS, CONSIDERED_ASSAYS) : tuple of dict
        ``RESULTS[label]`` maps ``(assay_id, activity_type, unit)`` to
        ``[expert_cutoff, score]`` for the highest-scoring row above the
        threshold (the first row wins on ties).
        ``CONSIDERED_ASSAYS[label]`` is the set of all keys selected for that
        label, regardless of score.
    """
    if individual_lm is None:
        individual_lm = INDIVIDUAL_LM

    key_columns = ["assay_id", "activity_type", "unit"]
    RESULTS, CONSIDERED_ASSAYS = {}, {}
    for LABEL in LABELS:
        best_by_key, considered = {}, set()
        selected = individual_lm[individual_lm[LABEL]]
        rows = selected[key_columns + ["expert_cutoff", f"{LABEL}_AVG"]].values
        for assay_id, activity_type, unit, expert_cutoff, auroc in rows:
            key = (assay_id, activity_type, unit)
            considered.add(key)
            # NaN scores fail the comparison and are therefore never accepted.
            if auroc > min_auroc and (key not in best_by_key or auroc > best_by_key[key][1]):
                best_by_key[key] = [expert_cutoff, auroc]
        RESULTS[LABEL] = best_by_key
        CONSIDERED_ASSAYS[LABEL] = considered
    return RESULTS, CONSIDERED_ASSAYS

def where_considered(key, LABELS, CONSIDERED_ASSAYS):
    """Return the labels whose considered-set contains ``key``.

    Labels are visited in the order given by ``LABELS`` and joined with ";".
    When no label contains the key, ``np.nan`` is returned instead of an
    empty string.
    """
    matching_labels = [label for label in LABELS if key in CONSIDERED_ASSAYS[label]]
    return ";".join(matching_labels) if matching_labels else np.nan
    
def where_accepted(key, LABELS, ACCEPTED_ASSAYS):
    """Return the labels under which ``key`` appears in ``ACCEPTED_ASSAYS``.

    Labels are visited in the order given by ``LABELS`` and joined with ";".
    When the key is accepted under no label, ``np.nan`` is returned instead
    of an empty string.
    """
    matching_labels = [label for label in LABELS if key in ACCEPTED_ASSAYS[label]]
    return ";".join(matching_labels) if matching_labels else np.nan

# Gather individual-modeling results for the four labels A, B, C and D
LABELS = ['A', 'B', 'C', 'D']
ACCEPTED_ASSAYS, CONSIDERED_ASSAYS = get_all_results_from_individual_modeling(LABELS)

# For every row of the master table, record the labels in which its
# (assay_id, activity_type, unit) key was considered and accepted
assay_keys = [
    tuple(key_values)
    for key_values in ASSAYS_MASTER[["assay_id", "activity_type", "unit"]].values
]
col_considered = [where_considered(key, LABELS, CONSIDERED_ASSAYS) for key in assay_keys]
col_accepted = [where_accepted(key, LABELS, ACCEPTED_ASSAYS) for key in assay_keys]
ASSAYS_MASTER['Accepted'] = col_accepted
ASSAYS_MASTER['Considered'] = col_considered

In [8]:
# Keep only the columns listed in ALL_COLS, in that order
ASSAYS_MASTER = ASSAYS_MASTER.loc[:, ALL_COLS]

In [9]:
# First 10 ORGANISM-target assays that were not accepted under any label
accepted_missing = ASSAYS_MASTER['Accepted'].isna()
organism_target = ASSAYS_MASTER['target_type_curated_extra'] == 'ORGANISM'
ASSAYS_MASTER[accepted_missing & organism_target].head(10)

Unnamed: 0,assay_id,assay_type,assay_organism,target_organism,organism_curated,doc_chembl_id,target_type,target_type_curated,target_type_curated_extra,target_chembl_id,target_chembl_id_curated,target_name_curated,bao_label,source_label,strain,atcc_id,mutations,known_drug_resistances,media,activity_type,unit,activities,nan_values,cpds,frac_cs,direction,act_flag,inact_flag,equal,higher,lower,dataset_type,cutoffs,ratios,cpds_qt,pos_ql,ratio_ql,cpds_ql,min_,p1,p25,p50,p75,p99,max_,Accepted,Considered
27,CHEMBL2094261,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL2094260,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_MBOX,,,,,,INHIBITION,%,399,0,399,0.00288,1.0,0,0,399,0,0,quantitative,25.0;50.0;75.0,0.128;0.028;0.018,399.0,,,,-14.0,-6.706,6.81,12.0,18.85,99.804,102.0,,
28,CHEMBL2094262,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL2094260,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_MBOX,,,,,,INHIBITION,%,399,0,399,0.00288,1.0,0,0,399,0,0,quantitative,25.0;50.0;75.0,0.286;0.165;0.113,399.0,,,,-15.4,-4.78,7.625,14.8,31.25,98.812,101.0,,
34,CHEMBL4649971,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3988442,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,GATES_LIBRARY,,,,,,IC50,umol.L-1,269,0,200,0.00144,-1.0,0,0,165,104,0,quantitative,5.0;10.0;20.0,0.055;0.12;0.23,200.0,,,,1.315,1.757,21.996,49.864,100.0,100.0,100.0,,
38,CHEMBL5345966,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5344439,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,H37Rv,,,,Sulas semisynthetic medium,MIC99,umol.L-1,172,0,172,0.00124,-1.0,0,0,135,36,1,quantitative,5.0;10.0;20.0,0.25;0.297;0.39,172.0,,,,0.03,0.106,7.0,64.0,250.0,250.0,250.0,,
39,CHEMBL5345967,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5344439,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,H37Rv,,,,Sulas semisynthetic medium,MIC99,umol.L-1,172,0,172,0.00124,-1.0,0,0,134,37,1,quantitative,5.0;10.0;20.0,0.221;0.262;0.355,172.0,,,,0.03,0.106,8.0,125.0,250.0,250.0,250.0,,
42,CHEMBL4388638,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL4387722,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,H37Rv,,,,,INHIBITION,%,146,0,146,0.00105,1.0,0,0,146,0,0,quantitative,25.0;50.0;75.0,0.493;0.336;0.151,146.0,,,,0.0,0.0,0.0,23.0,60.5,92.65,95.0,,
43,CHEMBL2032580,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL2029208,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,H37Ra,,,,,MIC,umol.L-1,136,0,136,0.00098,-1.0,0,0,95,41,0,quantitative,5.0;10.0;20.0,0.169;0.331;0.426,136.0,,,,0.031,0.281,6.254,49.113,207.938,207.938,207.938,,
49,CHEMBL747348,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL1122718,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,,,,,,MIC,umol.L-1,118,0,118,0.00085,-1.0,0,0,58,60,0,quantitative,5.0;10.0;20.0,0.0;0.0;0.0,118.0,,,,32.121,55.683,302.589,1866.134,1866.134,1866.134,1866.134,,
50,CHEMBL3832767,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3832761,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_PBOX,,,,,7H9/ADC/Tw,MIC,umol.L-1,116,0,116,0.00084,-1.0,0,0,107,8,1,quantitative,5.0;10.0;20.0,0.336;0.483;0.707,116.0,,,,0.02,0.02,2.3,12.5,25.0,50.0,50.0,,
53,CHEMBL3832764,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3832761,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_PBOX,,,,,,MIC,umol.L-1,116,0,116,0.00084,-1.0,0,0,110,1,5,quantitative,5.0;10.0;20.0,0.724;0.802;0.914,116.0,,,,0.02,0.02,0.6,1.56,9.4,48.05,50.0,,C


In [10]:
# ORGANISM-target assays that were not considered under any label
not_considered_organism = ASSAYS_MASTER[
    ASSAYS_MASTER['Considered'].isna()
    & (ASSAYS_MASTER['target_type_curated_extra'] == 'ORGANISM')
]

# Sum the per-assay `cpds` column for each (activity_type, unit) pair,
# keeping pairs with missing values, largest total first
cpds_totals = not_considered_organism.groupby(["activity_type", "unit"], dropna=False)["cpds"].sum()
cpds_by_pair = cpds_totals.sort_values(ascending=False).reset_index(name="cpds_sum")

# Running share of the overall total covered by the top pairs
total = cpds_by_pair["cpds_sum"].sum()
cpds_by_pair["cumulative_proportion"] = cpds_by_pair["cpds_sum"].cumsum() / total
cpds_by_pair.head(5)

Unnamed: 0,activity_type,unit,cpds_sum,cumulative_proportion
0,MIC,umol.L-1,44664,0.573505
1,MIC90,umol.L-1,9418,0.694436
2,INHIBITION,%,5482,0.764827
3,MIC99,umol.L-1,4520,0.822866
4,GI,%,2971,0.861015


In [11]:
# Independent copy of the not-considered ORGANISM assays reporting MIC in umol.L-1
unconsidered_mic_mask = (
    ASSAYS_MASTER["Considered"].isna()
    & ASSAYS_MASTER["target_type_curated_extra"].eq("ORGANISM")
    & ASSAYS_MASTER["activity_type"].eq("MIC")
    & ASSAYS_MASTER["unit"].eq("umol.L-1")
)
df = ASSAYS_MASTER.loc[unconsidered_mic_mask].copy()

In [12]:
# Display `df`, the copy of ASSAYS_MASTER restricted to not-considered
# ORGANISM assays with activity_type "MIC" and unit "umol.L-1".
df

Unnamed: 0,assay_id,assay_type,assay_organism,target_organism,organism_curated,doc_chembl_id,target_type,target_type_curated,target_type_curated_extra,target_chembl_id,target_chembl_id_curated,target_name_curated,bao_label,source_label,strain,atcc_id,mutations,known_drug_resistances,media,activity_type,unit,activities,nan_values,cpds,frac_cs,direction,act_flag,inact_flag,equal,higher,lower,dataset_type,cutoffs,ratios,cpds_qt,pos_ql,ratio_ql,cpds_ql,min_,p1,p25,p50,p75,p99,max_,Accepted,Considered
43,CHEMBL2032580,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL2029208,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,H37Ra,,,,,MIC,umol.L-1,136,0,136,0.00098,-1.0,0,0,95,41,0,quantitative,5.0;10.0;20.0,0.169;0.331;0.426,136.0,,,,0.031,0.281,6.254,49.113,207.938,207.938,207.938,,
49,CHEMBL747348,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL1122718,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,,,,,,MIC,umol.L-1,118,0,118,0.00085,-1.0,0,0,58,60,0,quantitative,5.0;10.0;20.0,0.0;0.0;0.0,118.0,,,,32.121,55.683,302.589,1866.134,1866.134,1866.134,1866.134,,
50,CHEMBL3832767,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3832761,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_PBOX,,,,,7H9/ADC/Tw,MIC,umol.L-1,116,0,116,0.00084,-1.0,0,0,107,8,1,quantitative,5.0;10.0;20.0,0.336;0.483;0.707,116.0,,,,0.020,0.020,2.300,12.500,25.000,50.000,50.000,,
54,CHEMBL3832765,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3832761,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_PBOX,,,,,7H9/ADC/Tw,MIC,umol.L-1,116,0,116,0.00084,-1.0,0,0,86,28,2,quantitative,5.0;10.0;20.0,0.319;0.362;0.509,116.0,,,,0.020,0.025,3.130,19.000,50.000,50.000,50.000,,
55,CHEMBL3832766,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3832761,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_PBOX,,,,,7H9/ADC/Tw,MIC,umol.L-1,116,0,116,0.00084,-1.0,0,0,79,36,1,quantitative,5.0;10.0;20.0,0.259;0.362;0.414,116.0,,,,0.020,0.028,4.700,25.000,50.000,50.000,50.000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10500,CHEMBL5229084,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5226349,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,,,,,,MIC,umol.L-1,1,0,1,0.00001,-1.0,0,0,1,0,0,quantitative,5.0;10.0;20.0,1.0;1.0;1.0,1.0,,,,2.510,2.510,2.510,2.510,2.510,2.510,2.510,,
10504,CHEMBL5226839,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5226287,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,H37Rv,,,,,MIC,umol.L-1,1,0,1,0.00001,-1.0,0,0,1,0,0,quantitative,5.0;10.0;20.0,1.0;1.0;1.0,1.0,,,,0.102,0.102,0.102,0.102,0.102,0.102,0.102,,
10507,CHEMBL5226809,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5226287,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,,,,,,MIC,umol.L-1,1,0,1,0.00001,-1.0,0,0,0,0,1,quantitative,5.0;10.0;20.0,1.0;1.0;1.0,1.0,,,,0.024,0.024,0.024,0.024,0.024,0.024,0.024,,
10515,CHEMBL5226819,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5226287,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,,organism-based format,LITERATURE,H37Rv,,,isoniazid,CLSI broth,MIC,umol.L-1,1,0,1,0.00001,-1.0,0,0,1,0,0,quantitative,5.0;10.0;20.0,1.0;1.0;1.0,1.0,,,,0.026,0.026,0.026,0.026,0.026,0.026,0.026,,


In [13]:
# Fraction of missing values in each assay-context column, most-missing first
cols = ["strain", "atcc_id", "media", "mutations", "known_drug_resistances"]
missing_fraction = df[cols].isna().mean()
missing_fraction.sort_values(ascending=False)

mutations                 0.913521
media                     0.892990
atcc_id                   0.838656
known_drug_resistances    0.736416
strain                    0.197014
dtype: float64

In [16]:
# Display `df` once more.
# NOTE(review): same expression as an earlier display cell and the execution
# counts are not sequential — likely a redundant leftover; confirm and remove.
df

Unnamed: 0,assay_id,assay_type,assay_organism,target_organism,organism_curated,doc_chembl_id,target_type,target_type_curated,target_type_curated_extra,target_chembl_id,target_chembl_id_curated,target_name_curated,bao_label,source_label,strain,atcc_id,mutations,known_drug_resistances,media,activity_type,unit,activities,nan_values,cpds,frac_cs,direction,act_flag,inact_flag,equal,higher,lower,dataset_type,cutoffs,ratios,cpds_qt,pos_ql,ratio_ql,cpds_ql,min_,p1,p25,p50,p75,p99,max_,Accepted,Considered
43,CHEMBL2032580,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL2029208,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,H37Ra,,,,,MIC,umol.L-1,136,0,136,0.00098,-1.0,0,0,95,41,0,quantitative,5.0;10.0;20.0,0.169;0.331;0.426,136.0,,,,0.031,0.281,6.254,49.113,207.938,207.938,207.938,,
49,CHEMBL747348,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL1122718,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,,,,,,MIC,umol.L-1,118,0,118,0.00085,-1.0,0,0,58,60,0,quantitative,5.0;10.0;20.0,0.0;0.0;0.0,118.0,,,,32.121,55.683,302.589,1866.134,1866.134,1866.134,1866.134,,
50,CHEMBL3832767,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3832761,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_PBOX,,,,,7H9/ADC/Tw,MIC,umol.L-1,116,0,116,0.00084,-1.0,0,0,107,8,1,quantitative,5.0;10.0;20.0,0.336;0.483;0.707,116.0,,,,0.020,0.020,2.300,12.500,25.000,50.000,50.000,,
54,CHEMBL3832765,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3832761,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_PBOX,,,,,7H9/ADC/Tw,MIC,umol.L-1,116,0,116,0.00084,-1.0,0,0,86,28,2,quantitative,5.0;10.0;20.0,0.319;0.362;0.509,116.0,,,,0.020,0.025,3.130,19.000,50.000,50.000,50.000,,
55,CHEMBL3832766,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL3832761,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,MMV_PBOX,,,,,7H9/ADC/Tw,MIC,umol.L-1,116,0,116,0.00084,-1.0,0,0,79,36,1,quantitative,5.0;10.0;20.0,0.259;0.362;0.414,116.0,,,,0.020,0.028,4.700,25.000,50.000,50.000,50.000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10500,CHEMBL5229084,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5226349,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,,,,,,MIC,umol.L-1,1,0,1,0.00001,-1.0,0,0,1,0,0,quantitative,5.0;10.0;20.0,1.0;1.0;1.0,1.0,,,,2.510,2.510,2.510,2.510,2.510,2.510,2.510,,
10504,CHEMBL5226839,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5226287,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,H37Rv,,,,,MIC,umol.L-1,1,0,1,0.00001,-1.0,0,0,1,0,0,quantitative,5.0;10.0;20.0,1.0;1.0;1.0,1.0,,,,0.102,0.102,0.102,0.102,0.102,0.102,0.102,,
10507,CHEMBL5226809,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5226287,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,Mycobacterium tuberculosis,organism-based format,LITERATURE,,,,,,MIC,umol.L-1,1,0,1,0.00001,-1.0,0,0,0,0,1,quantitative,5.0;10.0;20.0,1.0;1.0;1.0,1.0,,,,0.024,0.024,0.024,0.024,0.024,0.024,0.024,,
10515,CHEMBL5226819,F,Mycobacterium tuberculosis,Mycobacterium tuberculosis,Mycobacterium tuberculosis,CHEMBL5226287,ORGANISM,ORGANISM,ORGANISM,CHEMBL360,,,organism-based format,LITERATURE,H37Rv,,,isoniazid,CLSI broth,MIC,umol.L-1,1,0,1,0.00001,-1.0,0,0,1,0,0,quantitative,5.0;10.0;20.0,1.0;1.0;1.0,1.0,,,,0.026,0.026,0.026,0.026,0.026,0.026,0.026,,


In [21]:
# Normalise strain labels in place: missing values become "" and all labels
# are stripped strings (a later cell filters on `df["strain"] != ""`).
df["strain"] = df["strain"].fillna("").astype(str).str.strip()

# Undirected graph with one node per (assay_id, activity_type, unit) row of df
G = nx.Graph()
G.add_nodes_from(zip(df["assay_id"], df["activity_type"], df["unit"]))

# Connect every pair of distinct nodes that share a non-empty strain label.
# Each strain group becomes a clique, so edge count grows quadratically with
# the size of the group.
for strain, sub in df[df["strain"] != ""].groupby("strain"):
    # De-duplicate while keeping first-appearance order
    nodes = list(dict.fromkeys(zip(sub["assay_id"], sub["activity_type"], sub["unit"])))
    G.add_edges_from(
        (
            (nodes[first], nodes[second])
            for first in range(len(nodes))
            for second in range(first + 1, len(nodes))
        ),
        weight=1.0,
        reason=f"same_strain:{strain}",
    )

print("nodes:", G.number_of_nodes(), "edges:", G.number_of_edges())

nodes: 4822 edges: 2095396


In [30]:
# Print only the first strain group (groups iterate in sorted key order)
rows_with_strain = df[df["strain"] != ""]
for strain_name, strain_rows in rows_with_strain.groupby('strain'):
    print(strain_rows)
    break

           assay_id assay_type              assay_organism             target_organism            organism_curated  doc_chembl_id target_type target_type_curated target_type_curated_extra target_chembl_id target_chembl_id_curated         target_name_curated              bao_label source_label   strain atcc_id   mutations               known_drug_resistances media activity_type      unit  activities  nan_values  cpds  frac_cs  direction  act_flag  inact_flag  equal  higher  lower  dataset_type        cutoffs           ratios  cpds_qt  pos_ql  ratio_ql  cpds_ql   min_     p1    p25    p50  p75     p99    max_ Accepted Considered
5665  CHEMBL1654669          F  Mycobacterium tuberculosis  Mycobacterium tuberculosis  Mycobacterium tuberculosis  CHEMBL1649247    ORGANISM            ORGANISM                  ORGANISM        CHEMBL360                      NaN  Mycobacterium tuberculosis  organism-based format   LITERATURE  00-0715     NaN  A90V;N533T  ofloxacin;moxifloxacin;gatifloxacin  7H11