In [1]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
import os

In [2]:
# Root directory against which the relative "../config/..." paths are resolved.
# The script-style alternative, os.path.dirname(os.path.abspath(__file__)),
# is kept here for reference only; the current directory is used instead.
root = os.curdir

def get_pathogen_code(pathogen):
    """Return a short lowercase code for a pathogen name.

    Names made of two or more whitespace-separated words are abbreviated to
    the first letter of the first word followed by the second word
    (e.g. "Escherichia coli" -> "ecoli"). Any other name is returned
    lowercased and otherwise unchanged (e.g. "Campylobacter" ->
    "campylobacter").
    """
    words = pathogen.split()
    # Zero or one word: nothing to abbreviate, keep the name as given.
    if len(words) <= 1:
        return pathogen.lower()
    first_word, second_word = words[0], words[1]
    return (first_word[0] + second_word).lower()

# Load the preprocessed ChEMBL activities
print("Loading ChEMBL preprocessed data...")
ChEMBL = pd.read_csv(os.path.join(root, "..", "config", "chembl_processed", "activities_preprocessed.csv"), low_memory=False)

# Keep the activities whose target organism or assay organism is one of the pathogens
print("Filtering data for pathogens...")
# List of pathogens
pathogens = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
             "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
             "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"]
# Organism names are compared case-insensitively but must otherwise match exactly.
pathogens_lower = [p.lower() for p in pathogens]
pathogen_data = ChEMBL[ChEMBL['target_organism'].str.lower().isin(pathogens_lower) |
                       ChEMBL['assay_organism'].str.lower().isin(pathogens_lower)].reset_index(drop=True)

# Summary statistics of the pathogen subset
print(f"Number of activities: {len(pathogen_data)}")
print(f"Number of activities with non nan values: {int(pathogen_data['value'].notna().sum())}")
print(f"Number of unique compounds: {len(set(pathogen_data['compound_chembl_id']))}")
print(f"Number of unique assays: {len(set(pathogen_data['assay_chembl_id']))}")
print(f'Assay types: {Counter(pathogen_data[["assay_chembl_id", "assay_type"]].drop_duplicates(subset="assay_chembl_id")["assay_type"])}')
print(f"Number of unique targets: {len(set(pathogen_data['target_chembl_id']))}")
print(f'Target types: {Counter(pathogen_data[["target_chembl_id", "target_type"]].drop_duplicates(subset="target_chembl_id")["target_type"])}')
print(f"Relations: {Counter(pathogen_data['relation'])}")
print(f"Original pChEMBL: {int(pathogen_data['pchembl'].notna().sum())}")
print(f"Calculated pChEMBL: {int(pathogen_data['pchembl_calculated'].notna().sum())}")

# Compare the original pChEMBL with the calculated one on the rows that have both
thr = 0.01
has_both_pchembl = pathogen_data['pchembl'].notna() & pathogen_data['pchembl_calculated'].notna()
pathogen_data_filtered = pathogen_data.loc[has_both_pchembl, ["pchembl", "pchembl_calculated"]].copy()
# The original pChEMBL is clipped to [1, 9] before the comparison
# NOTE(review): presumably the calculated values are bounded to the same range - confirm
pathogen_data_filtered['pchembl'] = pathogen_data_filtered['pchembl'].clip(lower=1, upper=9)
pathogen_data_filtered['difference'] = (pathogen_data_filtered['pchembl'] - pathogen_data_filtered['pchembl_calculated']).abs()
n_compared = len(pathogen_data_filtered)
# Guard against an empty comparison set instead of raising ZeroDivisionError
perc = len(pathogen_data_filtered[pathogen_data_filtered['difference'] < thr]) * 100 / n_compared if n_compared else float("nan")
print(f"Percentage of pChEMBL original vs calculated having difference < {thr}: {perc}")

# Get directions: manually curated value per (activity_type, unit) pair; pairs without a curated value are ignored
curated_directions = pd.read_csv(os.path.join(root, "..", "config", 'manual_curation', 'activity_std_units_curated_manual_curation.csv'))
curated_directions = curated_directions[curated_directions['manual_curation'].notna()]
DIRECTIONS = {(i, j): k for i, j, k in zip(curated_directions['activity_type'], curated_directions['unit'], curated_directions['manual_curation'])}

# Count the activities per (activity_type, unit, target_type), most frequent first.
# dropna=False keeps the combinations where a field (e.g. the unit) is missing.
s = pathogen_data[["activity_type", "unit", 'target_type']]
out = (
    s.value_counts(subset=["activity_type", "unit", "target_type"], dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True))

# Attach the curated direction with a left merge rather than a dict lookup on
# (activity_type, unit) tuples: the unit is often NaN, NaN never compares equal
# to itself, and a tuple key holding NaN only matches when it is the very same
# object. merge matches null keys to each other and keeps the row order of `out`.
# keep='last' mirrors the dict above, where a repeated pair keeps its last value.
direction_lookup = (
    curated_directions[['activity_type', 'unit', 'manual_curation']]
    .drop_duplicates(subset=['activity_type', 'unit'], keep='last')
    .rename(columns={'manual_curation': 'direction'}))
out = out.merge(direction_lookup, on=['activity_type', 'unit'], how='left')

# Split the combinations into those without and with an assigned direction
lacks_direction = out['direction'].isna()
print(f"Number of activities (15 pathogens) having an [act_type, unit] pair without an assigned direction: {out.loc[lacks_direction, 'count'].sum()}")
missing_direction = out[lacks_direction].reset_index(drop=True)
total_count = missing_direction['count'].sum()
missing_direction['cumulative_prop'] = (missing_direction['count'].cumsum() / total_count).round(3)
out = out[~lacks_direction].reset_index(drop=True)
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)

Loading ChEMBL preprocessed data...
Filtering data for pathogens...
Number of activities: 2725613
Number of activities with non nan values: 2610858
Number of unique compounds: 710802
Number of unique assays: 128689
Assay types: Counter({'F': 111241, 'B': 16005, 'A': 1273, 'T': 151, 'U': 14, 'P': 5})
Number of unique targets: 798
Target types: Counter({'SINGLE PROTEIN': 724, 'ORGANISM': 23, 'PROTEIN COMPLEX': 17, 'PROTEIN FAMILY': 8, 'NUCLEIC-ACID': 5, 'MACROMOLECULE': 4, 'SUBCELLULAR': 4, 'PROTEIN COMPLEX GROUP': 2, 'CELL-LINE': 2, 'SMALL MOLECULE': 2, 'UNCHECKED': 1, 'NON-MOLECULAR': 1, 'NO TARGET': 1, 'ADMET': 1, 'PROTEIN NUCLEIC-ACID COMPLEX': 1, 'UNKNOWN': 1, 'LIPID': 1})
Relations: Counter({'=': 2550060, '>': 155126, '<': 20427})
Original pChEMBL: 220565
Calculated pChEMBL: 1117743
Percentage of pChEMBL original vs calculated having difference < 0.01: 100.0
Number of activities (15 pathogens) having an [act_type, unit] pair without an assigned direction: 356198


In [3]:
# Show the (activity_type, unit, target_type) combinations that have no
# assigned direction, ordered from the most to the least frequent.
missing_direction

Unnamed: 0,activity_type,unit,target_type,count,direction,cumulative_prop
0,ZSCORE,,ORGANISM,147589,,0.414
1,ACTIVITY,,ORGANISM,47492,,0.548
2,XC50,umol.L-1,ORGANISM,13551,,0.586
3,MBC,umol.L-1,ORGANISM,12002,,0.619
4,MIC,,ORGANISM,10736,,0.650
...,...,...,...,...,...,...
1207,ACTIVITY,,NO TARGET,1,,1.000
1208,VM,umol.L-1.ml-1,ORGANISM,1,,1.000
1209,KD,10^-1/s,UNCHECKED,1,,1.000
1210,VMAX,RFU/min,SINGLE PROTEIN,1,,1.000


In [6]:
# Unit and target-type labels shared by the cutoff table below
_UMOL = "umol.L-1"
_PERCENT = "%"
_ORGANISM = "ORGANISM"
_SINGLE_PROTEIN = "SINGLE PROTEIN"

# One row per predefined cutoff: (activity_type, unit, target_type, cutoff)
_PREDEFINED_CUTOFF_ROWS = [
    ("MIC", _UMOL, _ORGANISM, 10),
    ("MIC", _UMOL, _SINGLE_PROTEIN, 1),
    ("MIC50", _UMOL, _ORGANISM, 10),
    ("MIC80", _UMOL, _ORGANISM, 10),
    ("MIC90", _UMOL, _ORGANISM, 10),
    ("MIC99", _UMOL, _ORGANISM, 10),
    ("IC50", _UMOL, _ORGANISM, 10),
    ("IC50", _UMOL, _SINGLE_PROTEIN, 1),
    ("AC50", _UMOL, _SINGLE_PROTEIN, 1),
    ("IC90", _UMOL, _ORGANISM, 10),
    ("EC50", _UMOL, _ORGANISM, 10),
    ("EC50", _UMOL, _SINGLE_PROTEIN, 1),
    ("POTENCY", _UMOL, _ORGANISM, 10),
    ("POTENCY", _UMOL, _SINGLE_PROTEIN, 1),
    ("KI", _UMOL, _SINGLE_PROTEIN, 1),
    ("ACTIVITY", _UMOL, _ORGANISM, 10),
    ("INHIBITION", _PERCENT, _ORGANISM, 50),
    ("INHIBITION", _PERCENT, _SINGLE_PROTEIN, 50),
    ("PERCENTEFFECT", _PERCENT, _SINGLE_PROTEIN, 50),
    ("PERCENTEFFECT", _PERCENT, _ORGANISM, 50),
    ("ACTIVITY", _PERCENT, _SINGLE_PROTEIN, 50),
    ("ACTIVITY", _PERCENT, _ORGANISM, 50),
    ("GI", _PERCENT, _ORGANISM, 50),
    ("IZ", "mm", _ORGANISM, 10),
]

# Predefined cutoffs keyed by (activity_type, unit, target_type)
PREDEFINED_CUTOFFS = {
    (activity, unit_label, target): value
    for activity, unit_label, target, value in _PREDEFINED_CUTOFF_ROWS
}

# Build the expert-cutoff table: one row per pathogen for each of the first 50
# (activity_type, unit, target_type) combinations of `out`.
expert_cutoff_rows = []
top_combinations = out[['activity_type', 'unit', 'target_type', 'direction']].values[:50]
for activity_type, unit, target_type, direction in top_combinations:
    # Combinations whose target type is UNCHECKED are left out of the table
    if target_type == 'UNCHECKED':
        continue
    # The cutoff is the same for every pathogen; NaN marks combinations
    # that have no predefined cutoff.
    cutoff = PREDEFINED_CUTOFFS.get((activity_type, unit, target_type), np.nan)
    for pathogen in pathogens:
        expert_cutoff_rows.append([activity_type, unit, target_type, direction, get_pathogen_code(pathogen), cutoff])

# Save the table next to the other manual-curation files
EXPERT_CUTOFFs = pd.DataFrame(
    expert_cutoff_rows,
    columns=['activity_type', 'unit', 'target_type', 'direction', 'pathogen_code', 'expert_cutoff'],
)
EXPERT_CUTOFFs.to_csv(os.path.join(root, "..", "config", "manual_curation", "expert_cutoffs_.csv"), index=False)