In [1]:
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 300
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
import os

# Define root directory
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
sys.path.append(os.path.join(root, "..", "src"))
from default import CONFIGPATH

# ChEMBL_ALL = pd.read_csv(os.path.join(root, "..", "config", "chembl_processed", "activities_all_raw.csv"), low_memory=False)
# # print(len(ChEMBL_ALL[(ChEMBL_ALL['assay_chembl_id'] == "CHEMBL1909088") & (ChEMBL_ALL['standard_type'] == "IC50") & (ChEMBL_ALL['canonical_smiles'].isna())]))

# Loading ChEMBL preprocessed data
print("Loading ChEMBL preprocessed data...")
ChEMBL = pd.read_csv(os.path.join(root, "..", "config", "chembl_processed", "activities_preprocessed.csv"), low_memory=False)

Loading ChEMBL preprocessed data...


In [3]:
print(f"Number of activities: {len(ChEMBL)}")
print(f"Number of activities with non nan values: {len(ChEMBL[ChEMBL['value'].isna() == False])}")
print(f"Number of unique compounds: {len(set(ChEMBL['compound_chembl_id']))}")
print(f"Number of unique assays: {len(set(ChEMBL['assay_chembl_id']))}")
print(f'Assay types: {Counter(ChEMBL[["assay_chembl_id", "assay_type"]].drop_duplicates(subset="assay_chembl_id")["assay_type"])}')
print(f"Number of unique targets: {len(set(ChEMBL['target_chembl_id']))}")
print(f'Target types: {Counter(ChEMBL[["target_chembl_id", "target_type"]].drop_duplicates(subset="target_chembl_id")["target_type"])}')
print(f"Relations: {Counter(ChEMBL['relation'])}")
print(f"Original pChEMBL: {len(ChEMBL[ChEMBL['pchembl'].isna() == False])}")
print(f"Calculated pChEMBL: {len(ChEMBL[ChEMBL['pchembl_calculated'].isna() == False])}")

thr = 0.01
ChEMBL_filtered = ChEMBL[(ChEMBL['pchembl'].isna() == False) & (ChEMBL['pchembl_calculated'].isna() == False)].copy()
ChEMBL_filtered['pchembl'] = ChEMBL_filtered['pchembl'].clip(lower=1, upper=9)
ChEMBL_filtered = ChEMBL_filtered[["pchembl", "pchembl_calculated"]]
ChEMBL_filtered['difference'] = [np.abs(i-j) for i,j in zip(ChEMBL_filtered['pchembl'], ChEMBL_filtered['pchembl_calculated'])]
perc = len(ChEMBL_filtered[ChEMBL_filtered['difference'] < thr]) * 100 / len(ChEMBL_filtered)
print(f"Percentage of pChEMBL original vs calculated having difference < {thr}: {perc}")

Number of activities: 24040987
Number of activities with non nan values: 20718584
Number of unique compounds: 2756399
Number of unique assays: 1884335
Assay types: Counter({'F': 880634, 'B': 595493, 'A': 309873, 'T': 68068, 'P': 26580, 'U': 3687})
Number of unique targets: 17300
Target types: Counter({'SINGLE PROTEIN': 10727, 'ORGANISM': 2381, 'CELL-LINE': 1984, 'PROTEIN COMPLEX': 599, 'PROTEIN-PROTEIN INTERACTION': 595, 'PROTEIN FAMILY': 393, 'TISSUE': 294, 'SELECTIVITY GROUP': 123, 'NUCLEIC-ACID': 56, 'PROTEIN COMPLEX GROUP': 55, 'CHIMERIC PROTEIN': 34, 'UNKNOWN': 16, 'SUBCELLULAR': 12, 'MACROMOLECULE': 7, 'PROTEIN NUCLEIC-ACID COMPLEX': 6, '3D CELL CULTURE': 6, 'SMALL MOLECULE': 4, 'PHENOTYPE': 2, 'NO TARGET': 1, 'ADMET': 1, 'NON-MOLECULAR': 1, 'UNCHECKED': 1, 'LIPID': 1, 'OLIGOSACCHARIDE': 1})
Relations: Counter({'=': 22097792, '>': 1569365, '<': 373830})
Original pChEMBL: 4887128
Calculated pChEMBL: 13712063
Percentage of pChEMBL original vs calculated having difference < 0.01: 10

In [12]:
# a = ChEMBL[(ChEMBL['pchembl_calculated'].isna() == False) & (ChEMBL['pchembl'].isna() == True)]

In [13]:
LABEL = 'activity_type'
print(f"Number of standard activity types: {len(set(ChEMBL[LABEL]))}")
s = ChEMBL[[LABEL]].astype("string").fillna("")
out = (s.value_counts(dropna=False).reset_index(name="count").sort_values("count", ascending=False, ignore_index=True))
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(f"100 types coverage: {out['cumulative_prop'].tolist()[100]}")

Number of standard activity types: 6015
100 types coverage: 0.951


In [14]:
LABEL = 'unit'
print(f"Number of units: {len(set(ChEMBL[LABEL]))}")
s = ChEMBL[[LABEL]].astype("string").fillna("")
out = (s.value_counts(dropna=False).reset_index(name="count").sort_values("count", ascending=False, ignore_index=True))
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(f"100 units coverage: {out['cumulative_prop'].tolist()[100]}")

Number of units: 2502
100 units coverage: 0.997


In [15]:
LABEL = ['activity_type', 'unit']
s = pd.DataFrame([str(i) + " -- " + str(j) for i,j in zip(ChEMBL[LABEL[0]], ChEMBL[LABEL[1]])])
out = (s.value_counts(dropna=False).reset_index(name="count").sort_values("count", ascending=False, ignore_index=True))
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(f"Number of act-unit pairs: {len(out)}")
print(f"100 pairs coverage: {out['cumulative_prop'].tolist()[100]}")

Number of act-unit pairs: 12686
100 pairs coverage: 0.935


In [16]:
s = ChEMBL[["activity_type", "unit"]].astype("string").fillna("")
out = (
s.value_counts(subset=["activity_type", "unit"], dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(f"Number of activity entries having no associated unit: {sum(out[out['unit'] == '']['count'])}")

Number of activity entries having no associated unit: 3329218


In [17]:
ChEMBL[(ChEMBL['unit'].isna()) & (ChEMBL['pchembl'].isna() == False)]

Unnamed: 0,activity_id,assay_id,assay_chembl_id,assay_type,assay_confidence_score,assay_organism,doc_chembl_id,tid,target_type,target_organism,...,MW,pchembl,bao_endpoint,activity_comment,standard_text,value,unit,activity_type,relation,pchembl_calculated


In [34]:
ACT = "ACTIVITY"
ACT = 'IC50'
filtered = ChEMBL[(ChEMBL['activity_type'] == ACT) & (ChEMBL['unit'].isna())]
out = (
filtered.value_counts(subset=["doc_chembl_id"], dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(len(filtered))
out

191326


Unnamed: 0,doc_chembl_id,count,cumulative_prop
0,CHEMBL1909046,106381,0.556
1,CHEMBL1212812,3228,0.573
2,CHEMBL3882724,853,0.577
3,CHEMBL4420070,497,0.580
4,CHEMBL1240340,421,0.582
...,...,...,...
8546,CHEMBL1149704,1,1.000
8547,CHEMBL1141821,1,1.000
8548,CHEMBL5518206,1,1.000
8549,CHEMBL4251657,1,1.000


In [35]:
out = (
filtered.value_counts(subset=["assay_chembl_id"], dropna=False)
    .reset_index(name="count")
    .sort_values("count", ascending=False, ignore_index=True)
)
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
out

Unnamed: 0,assay_chembl_id,count,cumulative_prop
0,CHEMBL1909160,835,0.004
1,CHEMBL1909152,835,0.009
2,CHEMBL1909175,835,0.013
3,CHEMBL1909101,835,0.017
4,CHEMBL1909179,835,0.022
...,...,...,...
17463,CHEMBL1036897,1,1.000
17464,CHEMBL952882,1,1.000
17465,CHEMBL799073,1,1.000
17466,CHEMBL1669852,1,1.000


In [36]:
print(len(filtered[filtered['value'].isna()]))

190636


In [37]:
print(len(filtered[(filtered['activity_comment'] != 0) | (filtered['activity_comment'] != 0)]))

121207


In [38]:
assays_all = set([tuple(i) for i in ChEMBL[['assay_chembl_id', 'activity_type', 'unit']].values])
act_nans = [i for i in assays_all if i[1] == ACT and type(i[2]) != str]
len(assays_all), len(act_nans)

(2022786, 17468)

In [39]:
COUNTING_ASSAYS = {}
for i in tqdm(assays_all):
    if i[0] not in COUNTING_ASSAYS:
        COUNTING_ASSAYS[i[0]] = {}
    if i[1] not in COUNTING_ASSAYS[i[0]]:
        COUNTING_ASSAYS[i[0]][i[1]] = set()
    COUNTING_ASSAYS[i[0]][i[1]].add(i[2])

100%|██████████| 2022786/2022786 [00:05<00:00, 401025.81it/s]


In [40]:
count_degenerated_assays = set()
for a in act_nans:
    if len(COUNTING_ASSAYS[a[0]][a[1]]) > 1:
        count_degenerated_assays.add(a)
len(count_degenerated_assays)

16406

In [None]:
### FILTERING FOR PATHOGENS ###

def get_pathogen_code(pathogen):
    return str(pathogen.split()[0][0] + pathogen.split()[1]).lower() if len(pathogen.split()) > 1 else pathogen.lower()

In [None]:
# List of pathogens
pathogens = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
             "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
             "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"]

RESULTS = {}
# For each pathogen
pathogens = np.array(pathogens)
for pathogen in pathogens:

    print(f"Filtering for pathogen: {pathogen}...")
    pathogen_code = get_pathogen_code(pathogen)
    pathogen_data = ChEMBL[ChEMBL['target_organism'].str.contains(pathogen, case=False, na=False) | 
                    ChEMBL['assay_organism'].str.contains(pathogen, case=False, na=False)].reset_index(drop=True)

    print(f"Number of activities: {len(pathogen_data)}")
    print(f"Number of unique compounds: {len(set(pathogen_data['compound_chembl_id']))}")
    RESULTS[pathogen_code] = [len(pathogen_data), len(set(pathogen_data['compound_chembl_id']))]
    print("\n")

In [None]:
plt.figure(figsize=(8, 4))

x = [i for i in range(len(pathogens))]
inds = np.argsort([RESULTS[get_pathogen_code(i)][1] for i in pathogens])[::-1]
plt.bar(x, [RESULTS[get_pathogen_code(i)][0] for i in pathogens[inds]], zorder=2, ec='k', color="#AA96FA", label='Activities')
plt.bar(x, [RESULTS[get_pathogen_code(i)][1] for i in pathogens[inds]], zorder=2, ec='k', color="#FAD782", label='Compounds')

plt.ylim([10 ** 2, 2*10 ** 6])
plt.yscale('log')
plt.ylabel("Number of")
plt.xticks(x, [get_pathogen_code(i) for i in pathogens[inds]], rotation=45, ha='right')
plt.grid(linestyle='--')
plt.legend(loc='upper right', framealpha=1, edgecolor='k', prop={'size': 9})
plt.show()

In [None]:
# Filtering data for pathogens
pathogen_data = ChEMBL[ChEMBL['target_organism'].str.lower().isin([p.lower() for p in pathogens]) | 
                       ChEMBL['assay_organism'].str.lower().isin([p.lower() for p in pathogens])].reset_index(drop=True)

In [None]:
print(f"Number of activities: {len(pathogen_data)}")
print(f"Number of activities with non nan values: {len(pathogen_data[pathogen_data['value'].isna() == False])}")
print(f"Number of unique compounds: {len(set(pathogen_data['compound_chembl_id']))}")
print(f"Number of unique assays: {len(set(pathogen_data['assay_chembl_id']))}")
print(f'Assay types: {Counter(pathogen_data[["assay_chembl_id", "assay_type"]].drop_duplicates(subset="assay_chembl_id")["assay_type"])}')
print(f"Number of unique targets: {len(set(pathogen_data['target_chembl_id']))}")
print(f'Target types: {Counter(pathogen_data[["target_chembl_id", "target_type"]].drop_duplicates(subset="target_chembl_id")["target_type"])}')
print(f"Relations: {Counter(pathogen_data['relation'])}")
print(f"Original pChEMBL: {len(pathogen_data[pathogen_data['pchembl'].isna() == False])}")
print(f"Calculated pChEMBL: {len(pathogen_data[pathogen_data['pchembl_calculated'].isna() == False])}")

thr = 0.01
pathogen_data_filtered = pathogen_data[(pathogen_data['pchembl'].isna() == False) & (pathogen_data['pchembl_calculated'].isna() == False)].copy()
pathogen_data_filtered['pchembl'] = pathogen_data_filtered['pchembl'].clip(lower=1, upper=9)
pathogen_data_filtered = pathogen_data_filtered[["pchembl", "pchembl_calculated"]]
pathogen_data_filtered['difference'] = [np.abs(i-j) for i,j in zip(pathogen_data_filtered['pchembl'], pathogen_data_filtered['pchembl_calculated'])]
perc = len(pathogen_data_filtered[pathogen_data_filtered['difference'] < thr]) * 100 / len(pathogen_data_filtered)
print(f"Percentage of pChEMBL original vs calculated having difference < {thr}: {perc}")

In [None]:
LABEL = 'activity_type'
print(f"Number of standard activity types: {len(set(pathogen_data[LABEL]))}")
s = pathogen_data[[LABEL]].astype("string").fillna("")
out = (s.value_counts(dropna=False).reset_index(name="count").sort_values("count", ascending=False, ignore_index=True))
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(f"100 types coverage: {out['cumulative_prop'].tolist()[100]}")

In [None]:
LABEL = 'unit'
print(f"Number of units: {len(set(pathogen_data[LABEL]))}")
s = pathogen_data[[LABEL]].astype("string").fillna("")
out = (s.value_counts(dropna=False).reset_index(name="count").sort_values("count", ascending=False, ignore_index=True))
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(f"100 units coverage: {out['cumulative_prop'].tolist()[100]}")

In [None]:
LABEL = ['activity_type', 'unit']
s = pd.DataFrame([str(i) + " -- " + str(j) for i,j in zip(pathogen_data[LABEL[0]], pathogen_data[LABEL[1]])])
out = (s.value_counts(dropna=False).reset_index(name="count").sort_values("count", ascending=False, ignore_index=True))
total_count = out['count'].sum()
out['cumulative_prop'] = (out['count'].cumsum() / total_count).round(3)
print(f"Number of act-unit pairs: {len(out)}")
print(f"100 pairs coverage: {out['cumulative_prop'].tolist()[100]}")

In [None]:
out[:10]

In [None]:
# From Assay ChEMBL ID to number of nans / non-nans
PATHOGEN_ASSAYS = set(pathogen_data['assay_chembl_id'])
PATHOGEN_ASSAYS = {i: {'nans': set(), 'non-nans': set()} for i in PATHOGEN_ASSAYS}

# Collect info
for activity_id, assay_chembl_id, value in tqdm(zip(pathogen_data['activity_id'], pathogen_data['assay_chembl_id'], pathogen_data['value'])):
    if np.isnan(value):
        PATHOGEN_ASSAYS[assay_chembl_id]['nans'].add(activity_id)
    else:
        PATHOGEN_ASSAYS[assay_chembl_id]['non-nans'].add(activity_id)

# Assays with at least one nan
assays_nan_values = Counter(pathogen_data[pathogen_data['value'].isna()]['assay_chembl_id'])
print(f"Number of assays with at least one nan value: {len(assays_nan_values)}")
assert len(assays_nan_values) == len([i for i in PATHOGEN_ASSAYS if len(PATHOGEN_ASSAYS[i]['nans']) > 0])

In [None]:
# Calculate proportion of nans
proportion_nans = {}
for assay in sorted(PATHOGEN_ASSAYS):
    if len(PATHOGEN_ASSAYS[assay]['nans']) > 0:
        proportion_nans[assay] = len(PATHOGEN_ASSAYS[assay]['nans']) / (len(PATHOGEN_ASSAYS[assay]['nans']) + len(PATHOGEN_ASSAYS[assay]['non-nans']))

print(len(proportion_nans))

In [None]:
len([i for i in proportion_nans if (len(PATHOGEN_ASSAYS[i]['nans']) + len (PATHOGEN_ASSAYS[i]['non-nans'])) == 1])

In [None]:
len([i for i in proportion_nans if (len(PATHOGEN_ASSAYS[i]['nans']) + len (PATHOGEN_ASSAYS[i]['non-nans'])) > 1 and proportion_nans[i] > 0.9])

In [None]:
# pathogen_data[pathogen_data['assay_chembl_id'] == "CHEMBL1003929"]