In [2]:
import pandas as pd
import json
import os
import zipfile
from IPython.display import display, HTML

# Pandas display options: show all columns, keep wide frames on a single
# line block, and do not truncate cell contents.
for _option, _value in {
    "display.max_columns": None,
    "display.expand_frame_repr": False,
    "display.width": 2000,
    "display.max_colwidth": None,
}.items():
    pd.set_option(_option, _value)

# Inject CSS so rendered DataFrame cells and headers never wrap their text.
display(HTML("""
<style>
.dataframe td, .dataframe th {
    white-space: nowrap !important;
}
</style>
"""))

In [3]:
# Root directory. The script-style lookup below is kept for reference only
# (`__file__` is typically not available inside a notebook kernel).
# root = os.path.dirname(os.path.abspath(__file__))
root = "."

# Full list of pathogens. Kept under its own name so it is not silently
# discarded by the subset assignment that follows.
ALL_PATHOGENS = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
                 "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
                 "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"]

# Subset of pathogens used in this notebook
pathogens = ["Acinetobacter baumannii", "Mycobacterium tuberculosis", "Klebsiella pneumoniae"]

def get_pathogen_code(pathogen):
    """Build the short lowercase code used to name a pathogen's folder.

    For names with two or more whitespace-separated words, the code is the
    first letter of the first word followed by the whole second word
    (e.g. "Escherichia coli" -> "ecoli"). Names with fewer than two words
    are returned lowercased and otherwise unchanged.
    """
    words = pathogen.split()
    if len(words) <= 1:
        return pathogen.lower()
    first_word, second_word = words[0], words[1]
    return (first_word[0] + second_word).lower()

# Path to the output directory (the path is only built here; nothing is created on disk)
OUTPUT = os.path.join(root, "..", "output")

# For each pathogen
# NOTE(review): the [1:2] slice restricts the run to the second entry of
# `pathogens`. ASSAYS_CLEANED is rebound on every iteration, so only the table
# of the last processed pathogen is available after the loop.
for pathogen in pathogens[1:2]:

    # Get pathogen code
    pathogen_code = get_pathogen_code(pathogen)

    # Define path to parameters
    PATH_TO_PARAMETERS = os.path.join(OUTPUT, pathogen_code, "assay_parameters.zip")

    # Load assays info
    ASSAYS_CLEANED = pd.read_csv(os.path.join(OUTPUT, pathogen_code, "assays_cleaned.csv"))

    # Shared columns. Take an explicit copy: new columns are added to this
    # subset below, and assigning into a bare column selection can raise
    # SettingWithCopyWarning.
    KEYS = ["assay_id", "activity_type", "unit", 'target_type']
    ASSAYS_CLEANED = ASSAYS_CLEANED[KEYS].copy()

    # One list per curated field, filled in the same order as the rows of ASSAYS_CLEANED
    ORGNISM_CURATED, TARGET_TYPE_CURATED, STRAIN, ATCC_ID, MUTATIONS, KDR, MEDIA = [], [], [], [], [], [], []

    # Inside zip file
    with zipfile.ZipFile(PATH_TO_PARAMETERS) as z:

        # Iterating over assays (plain tuples, one per row)
        assay_keys = ASSAYS_CLEANED[['assay_id', 'activity_type', 'unit']]
        for assay_id, activity_type, unit in assay_keys.itertuples(index=False, name=None):

            # Prepare filename: <assay_id>_<activity_type>_<unit>_parameters.json
            filename = "_".join(map(str, (assay_id, activity_type, unit, 'parameters'))) + ".json"

            # Read JSON file inside zip (a missing member raises KeyError)
            with z.open(filename) as file:
                par = json.load(file)

            # Store results; list-valued fields are flattened to ';'-separated strings
            ORGNISM_CURATED.append(par['organism'])
            TARGET_TYPE_CURATED.append(par['target_type_curated'])
            STRAIN.append(par['strain'])
            ATCC_ID.append(par['atcc_id'])
            MUTATIONS.append(";".join(par['mutations']))
            KDR.append(";".join(par['known_drug_resistances']))
            MEDIA.append(par['media'])

    # Complete table
    ASSAYS_CLEANED['organism_curated'] = ORGNISM_CURATED
    ASSAYS_CLEANED['target_type_curated'] = TARGET_TYPE_CURATED
    ASSAYS_CLEANED['strain'] = STRAIN
    ASSAYS_CLEANED['atcc_id'] = ATCC_ID
    ASSAYS_CLEANED['mutations'] = MUTATIONS
    ASSAYS_CLEANED['known_drug_resistances'] = KDR
    ASSAYS_CLEANED['media'] = MEDIA

In [4]:
# Keep only the assays whose strain, ATCC identifier and mutations are all non-empty
ASSAYS_CLEANED[ASSAYS_CLEANED[['strain', 'atcc_id', 'mutations']].ne("").all(axis=1)]

Unnamed: 0,assay_id,activity_type,unit,target_type,organism_curated,target_type_curated,strain,atcc_id,mutations,known_drug_resistances,media
78,CHEMBL3267866,MIC,umol.L-1,ORGANISM,Mycobacterium tuberculosis,ORGANISM,H37Rv,ATCC 27294,S315T,isoniazid,
423,CHEMBL4685498,MIC,umol.L-1,UNCHECKED,Mycobacterium tuberculosis,SINGLE PROTEIN,H37Rv,ATCC 27294,T313A,,
1038,CHEMBL4396761,MIC,umol.L-1,ORGANISM,Mycobacterium tuberculosis,ORGANISM,H37Rv,ATCC 27294,S315T,isoniazid,
1049,CHEMBL4396763,MIC,umol.L-1,ORGANISM,Mycobacterium tuberculosis,ORGANISM,INH_R1,ATCC TIB-67,S315T,isoniazid,
1804,CHEMBL4817889,MIC,umol.L-1,ORGANISM,Mycobacterium tuberculosis,ORGANISM,H37Rv,ATCC 27294,D148G,,
...,...,...,...,...,...,...,...,...,...,...,...
8947,CHEMBL4269769,MIC,umol.L-1,SINGLE PROTEIN,Mycobacterium tuberculosis,SINGLE PROTEIN,H37Rv,ATCC 25618,N364S,,
8948,CHEMBL4269768,MIC,umol.L-1,SINGLE PROTEIN,Mycobacterium tuberculosis,SINGLE PROTEIN,H37Rv,ATCC 25618,Y314C,,
8949,CHEMBL4269767,MIC,umol.L-1,SINGLE PROTEIN,Mycobacterium tuberculosis,SINGLE PROTEIN,H37Rv,ATCC 25618,Y314H,,
9533,CHEMBL4677028,MIC,umol.L-1,UNCHECKED,Mycobacterium tuberculosis,ORGANISM,H37Rv,ATCC 27294,Y156V;G93V;F203L;F203V,xanthorrhizol;triclosan,
