In [1]:
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import sys
import os
from collections import defaultdict

# Pandas display settings: show every column, keep each frame on a single
# (very wide) line, and never truncate the text inside a cell.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
    ("display.width", 2000),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

# CSS injected into the notebook so rendered DataFrame cells and headers
# are not wrapped onto several lines.
NOWRAP_STYLE = """
<style>
.dataframe td, .dataframe th {
    white-space: nowrap !important;
}
</style>
"""
display(HTML(NOWRAP_STYLE))

In [4]:
# Root directory.
# In the notebook this is the current working directory; the script form of
# this code used os.path.dirname(os.path.abspath(__file__)) instead.
root = "."

# Make ../src importable before pulling the project paths from it
# (presumably `default` lives in that directory -- confirm).
sys.path.append(os.path.join(root, "..", "src"))
from default import DATAPATH, CONFIGPATH

# Pathogen under analysis.
# Hard-coded here; the script form read it from sys.argv[1].
pathogen_code = 'mtuberculosis'

# Look the code up in the pathogen table and stop if it is not listed.
df = pd.read_csv(os.path.join(CONFIGPATH, 'pathogens.csv'))
row = df[df["code"] == pathogen_code]
if len(row) == 0:
    raise SystemExit(f"Unknown code: {pathogen_code}")
pathogen = row["pathogen"].iloc[0]

# Location of the pipeline outputs (only the path is built here; nothing is
# created on disk).
OUTPUT = os.path.join(root, "..", "output")

# Columns shared by the assay tables.
KEYS = ["assay_id", "activity_type", "unit"]

# Columns to take from each table.
COLUMNS_CLEANED = [
    "assay_id", "assay_type", "assay_organism", "doc_chembl_id",
    "target_type", "target_chembl_id", "target_organism",
    "activity_type", "unit", "canonical_unit",
    "activities", "nan_values", "cpds", "direction",
    "activity_comment_counts", "standard_text_count",
]
COLUMNS_CLUSTERS = ["clusters_0.3", "clusters_0.6", "clusters_0.85"]
COLUMNS_DATASETS = [
    "equal", "higher", "lower", "dataset_type", "cpds_qt",
    "min_", "p1", "p25", "p50", "p75", "p99", "max_",
    "pos_ql", "ratio_ql", "cpds_ql",
]

In [8]:
# Load the four per-assay tables written for this pathogen.
ASSAYS_CLEANED = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_cleaned.csv"))
ASSAYS_CLUSTERS = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_clusters.csv"))
ASSAYS_PARAMETERS = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_parameters.csv"))
ASSAYS_DATASETS = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_datasets.csv"))

# Collect, for every (assay_id, activity_type, unit) key, the
# [expert_cutoff, ratio_qt] pairs in file order. This must happen before the
# de-duplication below, which drops those two columns.
assay_to_qt_info = defaultdict(list)
for assay_id, activity_type, unit, expert_cutoff, ratio_qt in ASSAYS_DATASETS[KEYS + ['expert_cutoff', 'ratio_qt']].values:
    assay_to_qt_info[(assay_id, activity_type, unit)].append([expert_cutoff, ratio_qt])

# Unique row per assay: keep only the key and per-assay columns, then drop
# the rows that differed only by expert_cutoff / ratio_qt.
ASSAYS_DATASETS = ASSAYS_DATASETS[KEYS + COLUMNS_DATASETS].drop_duplicates().reset_index(drop=True)

# drop_duplicates() only removes fully identical rows, so check the stated
# invariant explicitly: a key that still appears twice would receive the same
# joined cutoffs/ratios on every one of its rows.
duplicated_keys = ASSAYS_DATASETS.duplicated(subset=KEYS, keep=False)
assert not duplicated_keys.any(), (
    f"{int(duplicated_keys.sum())} rows of assays_datasets.csv share a {KEYS} key "
    "but differ in other columns"
)

# Get cutoffs and ratios: one ';'-joined string per remaining row.
key_rows = [tuple(key) for key in ASSAYS_DATASETS[KEYS].values]
cutoffs = [";".join(str(cutoff) for cutoff, _ in assay_to_qt_info[key]) for key in key_rows]
ratios = [";".join(str(ratio) for _, ratio in assay_to_qt_info[key]) for key in key_rows]

# Store results directly after 'cpds_qt'. With the current KEYS and
# COLUMNS_DATASETS this resolves to positions 8 and 9, but deriving the
# position keeps the placement correct if either list changes.
insert_at = ASSAYS_DATASETS.columns.get_loc("cpds_qt") + 1
ASSAYS_DATASETS.insert(insert_at, 'cutoffs', cutoffs)
ASSAYS_DATASETS.insert(insert_at + 1, 'ratios', ratios)

In [9]:
# Row counts of the four assay tables, in the order they were loaded.
tuple(len(table) for table in (ASSAYS_CLEANED, ASSAYS_CLUSTERS, ASSAYS_PARAMETERS, ASSAYS_DATASETS))

(10532, 10532, 10267, 10532)

In [10]:
# Inspect the de-duplicated per-assay table, including the 'cutoffs' and 'ratios' columns inserted after 'cpds_qt'.
ASSAYS_DATASETS

Unnamed: 0,assay_id,activity_type,unit,equal,higher,lower,dataset_type,cpds_qt,cutoffs,ratios,min_,p1,p25,p50,p75,p99,max_,pos_ql,ratio_ql,cpds_ql
0,CHEMBL4649948,PERCENTEFFECT,%,93555,0,0,quantitative,86589.0,25.0;50.0;75.0,0.051;0.015;0.004,-1122.89,-39.791,-10.300,-1.066,7.879,58.950,120.27,,,
1,CHEMBL4649949,PERCENTEFFECT,%,101515,0,0,quantitative,86575.0,25.0;50.0;75.0,0.119;0.025;0.005,-1111.40,-46.433,-6.945,3.342,14.630,66.173,133.09,,,
2,CHEMBL4649971,PERCENTEFFECT,%,68619,0,0,quantitative,68613.0,25.0;50.0;75.0,0.046;0.014;0.008,-303.60,-47.290,-13.540,-2.846,6.803,62.022,176.21,,,
3,CHEMBL4649972,PERCENTEFFECT,%,68616,0,0,quantitative,68610.0,25.0;50.0;75.0,0.004;0.0;0.0,-4329.36,-46.070,-9.429,-3.115,2.931,20.768,97.51,,,
4,CHEMBL4649941,PERCENTEFFECT,%,67381,0,0,quantitative,66941.0,25.0;50.0;75.0,0.003;0.001;0.0,-254.55,-28.112,-1.211,1.844,4.841,17.390,101.82,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10527,CHEMBL4153752,INHIBITION,%,1,0,0,qualitative,,,,,,,,,,,1.0,1.0,1.0
10528,CHEMBL4153751,INHIBITION,%,1,0,0,qualitative,,,,,,,,,,,1.0,1.0,1.0
10529,CHEMBL4153750,INHIBITION,%,1,0,0,qualitative,,,,,,,,,,,1.0,1.0,1.0
10530,CHEMBL5226835,MIC50,umol.L-1,1,0,0,none,1.0,,,33.00,33.000,33.000,33.000,33.000,33.000,33.00,,,


In [54]:
# NOTE(review): the saved output of this cell (execution count 54) shows a column 'a' and no
# 'cutoffs'/'ratios' columns, which none of the visible cells produce -- presumably stale output
# from an earlier kernel state. Re-run after "Restart & Run All" or remove this duplicate display.
ASSAYS_DATASETS

Unnamed: 0,assay_id,activity_type,unit,equal,higher,lower,dataset_type,a,cpds_qt,min_,p1,p25,p50,p75,p99,max_,pos_ql,ratio_ql,cpds_ql
0,CHEMBL4649948,PERCENTEFFECT,%,93555,0,0,quantitative,,86589.0,-1122.89,-39.791,-10.300,-1.066,7.879,58.950,120.27,,,
1,CHEMBL4649949,PERCENTEFFECT,%,101515,0,0,quantitative,,86575.0,-1111.40,-46.433,-6.945,3.342,14.630,66.173,133.09,,,
2,CHEMBL4649971,PERCENTEFFECT,%,68619,0,0,quantitative,,68613.0,-303.60,-47.290,-13.540,-2.846,6.803,62.022,176.21,,,
3,CHEMBL4649972,PERCENTEFFECT,%,68616,0,0,quantitative,,68610.0,-4329.36,-46.070,-9.429,-3.115,2.931,20.768,97.51,,,
4,CHEMBL4649941,PERCENTEFFECT,%,67381,0,0,quantitative,,66941.0,-254.55,-28.112,-1.211,1.844,4.841,17.390,101.82,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10527,CHEMBL4153752,INHIBITION,%,1,0,0,qualitative,,,,,,,,,,1.0,1.0,1.0
10528,CHEMBL4153751,INHIBITION,%,1,0,0,qualitative,,,,,,,,,,1.0,1.0,1.0
10529,CHEMBL4153750,INHIBITION,%,1,0,0,qualitative,,,,,,,,,,1.0,1.0,1.0
10530,CHEMBL5226835,MIC50,umol.L-1,1,0,0,none,,1.0,33.00,33.000,33.000,33.000,33.000,33.000,33.00,,,
