In [1]:
import pandas as pd
import numpy as np
import json
import os

In [9]:
root = "."

pathogen_code = "mtuberculosis"

# Every input of this cell sits in the same per-pathogen output directory
pathogen_dir = os.path.join(root, "..", "output", pathogen_code)

# Cleaned assays table
assays_cleaned = pd.read_csv(os.path.join(pathogen_dir, "assays_cleaned.csv"))

# Clustering information for the assays
clusters = pd.read_csv(os.path.join(pathogen_dir, "assays_clusters.csv"))

# Directory holding the per-assay parameter files
PATH_TO_PARAMETERS = os.path.join(pathogen_dir, 'parameters')

# Sanity check: both tables must list the same records in the same order.
# A cell counts as matching when the values are equal or both are missing.
shared_cols = ['assay_id', 'activity_type', 'unit', 'activities', 'cpds']
from_clusters = clusters[shared_cols]
from_assays = assays_cleaned[shared_cols]
both_missing = from_clusters.isna() & from_assays.isna()
cells_agree = (from_clusters == from_assays) | both_missing
assert cells_agree.to_numpy().all()

# Copy the cluster labels over, one column per threshold
for cluster_col in ('clusters_0.3', 'clusters_0.6', 'clusters_0.85'):
    assays_cleaned[cluster_col] = clusters[cluster_col]

# Drop the temporaries; `clusters` itself is not deleted and stays in scope
del pathogen_dir, shared_cols, from_clusters, from_assays
del both_missing, cells_agree, cluster_col

In [10]:
# Curated per-assay annotations read from the parameter JSON files, collected
# in the row order of `assays_cleaned` so they can be attached as columns.
# NOTE(review): `ORGNISM_CURATED` is misspelled and `ATCC_ID` is never filled;
# both names are kept unchanged in case later cells reference them.
ORGNISM_CURATED, TARGET_TYPE_CURATED, STRAIN = [], [], []
ATCC_ID, MUTATIONS, KDR, MEDIA = [], [], [], []

# Iterating over assays
for assay_id, activity_type, unit in assays_cleaned[['assay_id', 'activity_type', 'unit']].values:

    # Prepare filename: <assay_id>_<activity_type>_<unit>_parameters.json
    # (values go through str(), so a missing unit read as NaN becomes "nan")
    filename = "_".join([str(assay_id), str(activity_type), str(unit), 'parameters']) + ".json"

    # Read JSON file. The encoding is pinned to UTF-8 so decoding does not
    # depend on the platform's locale default (e.g. cp1252 on Windows).
    with open(os.path.join(PATH_TO_PARAMETERS, filename), "r", encoding="utf-8") as file:
        par = json.load(file)

    # Store results
    ORGNISM_CURATED.append(par['organism'])
    TARGET_TYPE_CURATED.append(par['target_type'])
    STRAIN.append(par['strain'])
    # ATCC_ID.append(par['atcc_id'])
    # 'mutations' and 'known_drug_resistances' are joined with ';' into one
    # string per assay — presumably lists of strings; TODO confirm the schema.
    MUTATIONS.append(";".join(par['mutations']))
    KDR.append(";".join(par['known_drug_resistances']))
    MEDIA.append(par['media'])

# Complete table
assays_cleaned['organism_curated'] = ORGNISM_CURATED
assays_cleaned['target_type_curated'] = TARGET_TYPE_CURATED
assays_cleaned['strain'] = STRAIN
# assays_cleaned['atcc_id'] = ATCC_ID
assays_cleaned['mutations'] = MUTATIONS
assays_cleaned['known_drug_resistances'] = KDR
assays_cleaned['media'] = MEDIA


In [13]:
# Show the assays whose original target type differs from the curated one
assays_cleaned.loc[
    assays_cleaned['target_type'] != assays_cleaned['target_type_curated']
]

Unnamed: 0,assay_id,assay_type,assay_organism,doc_chembl_id,target_type,target_chembl_id,target_organism,activity_type,unit,canonical_unit,...,standard_text_count,clusters_0.3,clusters_0.6,clusters_0.85,organism_curated,target_type_curated,strain,mutations,known_drug_resistances,media
0,CHEMBL4649948,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,PERCENTEFFECT,%,%,...,0,25146,72439,85917,Mycobacterium tuberculosis,ORGANISM,,,,"DPPC, cholesterol, tyloxapol based media"
1,CHEMBL4649949,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,PERCENTEFFECT,%,%,...,0,25056,72123,85853,Mycobacterium tuberculosis,ORGANISM,,,,"7H9, glucose tyloxapol based media"
9,CHEMBL4649949,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,IC50,umol.L-1,umol.L-1,...,0,1393,2340,2458,Mycobacterium tuberculosis,ORGANISM,,,,"7H9, glucose tyloxapol based media"
10,CHEMBL4649948,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,IC50,umol.L-1,umol.L-1,...,0,1401,2360,2457,Mycobacterium tuberculosis,ORGANISM,,,,"DPPC, cholesterol, tyloxapol based media"
67,CHEMBL4384698,B,Mycobacterium tuberculosis,CHEMBL4382257,UNCHECKED,CHEMBL612545,,IC50,umol.L-1,umol.L-1,...,0,1,22,99,Mycobacterium tuberculosis,SINGLE PROTEIN,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10484,CHEMBL5227624,B,Mycobacterium tuberculosis variant bovis,CHEMBL5226308,UNCHECKED,CHEMBL612545,,INHIBITION,%,%,...,0,1,1,1,Mycobacterium tuberculosis variant bovis,SINGLE PROTEIN,BCG,,,
10485,CHEMBL5227625,B,Mycobacterium tuberculosis variant bovis,CHEMBL5226308,UNCHECKED,CHEMBL612545,,INHIBITION,%,%,...,0,1,1,1,Mycobacterium tuberculosis variant bovis,SINGLE PROTEIN,BCG,,,
10491,CHEMBL5229091,B,Mycobacterium tuberculosis,CHEMBL5226349,UNCHECKED,CHEMBL612545,,IC50,umol.L-1,umol.L-1,...,0,1,1,1,Mycobacterium tuberculosis,SINGLE PROTEIN,,,,
10508,CHEMBL5226837,B,Mycobacterium tuberculosis variant bovis,CHEMBL5226287,UNCHECKED,CHEMBL612545,,IC50,umol.L-1,umol.L-1,...,0,1,1,1,Mycobacterium tuberculosis,SINGLE PROTEIN,BCG,,,
