In [55]:
import pandas as pd
import numpy as np
import pickle
import sys
import os

# Define root directory
# root is relative to the current working directory, so the notebook has to be
# started from the directory it lives in for the "../src", "../output" and
# "../config" paths built from it to resolve.
# The script-style alternative below is kept for reference; it relies on
# __file__, which is not defined inside a notebook kernel.
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
# Extend the import path first: the import right below only works once the
# sibling "src" directory is on sys.path (presumably where default.py lives
# -- TODO confirm).
sys.path.append(os.path.join(root, "..", "src"))
from default import CONFIGPATH

# Candidate pathogens, in a fixed order (the selection below is positional)
ALL_PATHOGENS = [
    "Acinetobacter baumannii",
    "Candida albicans",
    "Campylobacter",
    "Escherichia coli",
    "Enterococcus faecium",
    "Enterobacter",
    "Helicobacter pylori",
    "Klebsiella pneumoniae",
    "Mycobacterium tuberculosis",
    "Neisseria gonorrhoeae",
    "Pseudomonas aeruginosa",
    "Plasmodium falciparum",
    "Staphylococcus aureus",
    "Schistosoma mansoni",
    "Streptococcus pneumoniae",
]

# List of pathogens to process: only position 8 (Mycobacterium tuberculosis)
# is selected; widen the slice to process more of the list
pathogens = ALL_PATHOGENS[8:9]

def get_pathogen_code(pathogen):
    """Return the lower-case short code for a pathogen name.

    A name made of two or more whitespace-separated words is abbreviated to
    the first letter of the first word followed by the whole second word
    (any further words are ignored), e.g. "Escherichia coli" -> "ecoli".
    A name with fewer than two words is returned unchanged apart from
    lower-casing, e.g. "Campylobacter" -> "campylobacter".
    """
    words = pathogen.split()
    if len(words) < 2:
        # Nothing to abbreviate: keep the name as given, just lower-cased
        return pathogen.lower()
    genus, species = words[0], words[1]
    return (genus[0] + species).lower()

# Root of the output tree; each pathogen's files sit in a sub-directory named
# after its pathogen code (nothing is created here, the path is only defined)
OUTPUT = os.path.join(root, "..", "output")

# Manually curated activity-type synonyms. The table does not depend on the
# pathogen, so it is read once here rather than on every loop iteration.
synonyms = pd.read_csv(os.path.join(root, "..", "config", "manual_curation", "synonyms.csv"))

# For each pathogen
for pathogen in pathogens:

    # Loading pathogen data
    pathogen_code = get_pathogen_code(pathogen)
    pathogen_dir = os.path.join(OUTPUT, pathogen_code)
    print(f"Loading ChEMBL preprocessed data for {pathogen_code}...")
    ChEMBL_pathogen = pd.read_csv(os.path.join(pathogen_dir, f"{pathogen_code}_ChEMBL_raw_data.csv.gz"), low_memory=False)
    print(f"Number of activities for {pathogen_code}: {len(ChEMBL_pathogen)}")
    print(f"Number of compounds for {pathogen_code}: {len(set(ChEMBL_pathogen['compound_chembl_id']))}")
    ASSAYS_RAW = pd.read_csv(os.path.join(pathogen_dir, 'assays_raw.csv'))
    print(f"Original number of assays: {len(ASSAYS_RAW)}")

    # Converting activity types to their corresponding synonyms.
    # Replacements are applied one after another, in table order, so a value
    # produced by an earlier row can still be rewritten by a later one.
    for activity, syns in zip(synonyms['activity'], synonyms['synonyms']):
        # An empty 'synonyms' cell is read by pandas as NaN (a float), which
        # has no .split(); such a row has nothing to rename, so skip it.
        if not isinstance(syns, str):
            continue
        for syn in syns.split(";"):
            ChEMBL_pathogen.loc[ChEMBL_pathogen['activity_type'] == syn, 'activity_type'] = activity

    # Discard activities with no value nor act/inact flag in activity_comment nor standard_text.
    # Presumably 0 encodes "no flag" in both flag columns after preprocessing -- TODO confirm.
    # NOTE(review): a NaN in either flag column compares as != 0 and therefore keeps the row.
    has_value = ChEMBL_pathogen['value'].notna()
    has_comment_flag = ChEMBL_pathogen['activity_comment'] != 0
    has_text_flag = ChEMBL_pathogen['standard_text'] != 0
    ChEMBL_pathogen = ChEMBL_pathogen[has_value | has_comment_flag | has_text_flag].reset_index(drop=True)

    print("Removing activities with no value nor act/inact flag in activity_comment nor standard_test...")
    print(f"Number of activities for {pathogen_code}: {len(ChEMBL_pathogen)}")
    print(f"Number of compounds for {pathogen_code}: {len(set(ChEMBL_pathogen['compound_chembl_id']))}")

    # Identify canonical unit per activity type
    print("Identifying canonical unit per activity type...")

    # Stop after the first pathogen: the next cell works on the ChEMBL_pathogen
    # and pathogen_code variables left behind by this iteration.
    break

Loading ChEMBL preprocessed data for mtuberculosis...
Number of activities for mtuberculosis: 714221
Number of compounds for mtuberculosis: 132378
Original number of assays: 13587
Removing activities with no value nor act/inact flag in activity_comment nor standard_test...
Number of activities for mtuberculosis: 705718
Number of compounds for mtuberculosis: 130771
Identifying canonical unit per activity type...


In [64]:
# Tally every (activity_type, unit) combination -- missing values included --
# and list the combinations from the most to the least frequent
s = ChEMBL_pathogen.loc[:, ["activity_type", "unit"]]
pair_counts = s.value_counts(dropna=False)
out = pair_counts.reset_index(name="count")
out = out.sort_values(by="count", ascending=False, ignore_index=True)

# For each activity type, flag the row holding its most frequent unit
idx = out.groupby("activity_type")["count"].idxmax()
out["canonical_unit"] = out.index.isin(idx)
print(f"Number of unique activity type - unit pairs: {len(out)}")

# Map each activity type to its canonical unit and annotate the activities
flagged_rows = out.loc[out["canonical_unit"], ["activity_type", "unit"]]
canonical = flagged_rows.set_index("activity_type")
canonical_map = canonical["unit"].to_dict()
ChEMBL_pathogen["canonical_unit"] = ChEMBL_pathogen["activity_type"].map(canonical_map)

# Write the pair summary next to the pathogen's other output files
pairs_path = os.path.join(root, "..", "output", pathogen_code, "activity_type_unit_pairs.csv")
out.to_csv(pairs_path, index=False)

Number of unique activity type - unit pairs: 198
