In [1]:
import os
import pandas as pd

# Root data folder; the cells below expect one sub-folder per pathogen code
# containing "<code>_original.csv" and "<code>_processed.csv".
DATAPATH = "../../chembl_data" # change for your path to data
# Folder holding the configuration CSVs read and written by this notebook
# (pathogens.csv, units.csv, st_type_summary*.csv, percentiles.csv).
CONFIGPATH = "../config"

# Get units to standardise
Get all the units used in the assays to standardise them according to UCUM guides.

If your pathogens differ significantly from the example pathogens in config/pathogens.csv, you might want to revise the units manually.

In [10]:
# Collect every distinct unit label reported across all pathogen exports and
# write them to units.csv so they can be mapped to UCUM by hand.
pathogen_list = pd.read_csv(os.path.join(CONFIGPATH, "pathogens.csv"))["pathogen_code"].tolist()

unit_labels = set()
for p in pathogen_list:
    # Only the units column is needed here, so skip parsing the other columns
    # of these large files.
    df = pd.read_csv(
        os.path.join(DATAPATH, f"{p}", f"{p}_original.csv"),
        usecols=["standard_units"],
        low_memory=False,
    )
    # Series.unique() collapses all missing values into a single entry, and the
    # str() conversion happens BEFORE the set de-duplication. De-duplicating raw
    # values first is unreliable: NaN never compares equal to another NaN, so
    # the "nan" label could end up in the output several times.
    unit_labels.update(map(str, df["standard_units"].unique()))

# Sorted list of unique unit labels; missing units appear once as "nan".
allunits = sorted(unit_labels)
df = pd.DataFrame({"units": allunits})
# This file will be processed manually in the UCUM website:
# https://ucum.nlm.nih.gov/ucum-lhc/demo.html
df.to_csv(os.path.join(CONFIGPATH, "units.csv"), index=False)

In [23]:
# For each pathogen, list the ChEMBL assay IDs whose standard_type is
# "Percent Effect" so they can be looked up in the ChEMBL database.
for p in pathogen_list:
    original_csv = os.path.join(DATAPATH, f"{p}", f"{p}_original.csv")
    df = pd.read_csv(original_csv, low_memory=False)
    is_percent_effect = df["standard_type"] == "Percent Effect"
    ch_id = df.loc[is_percent_effect, "assay_chembl_id"].values
    # Stay silent for pathogens that have no matching assay.
    if len(ch_id) == 0:
        continue
    print(p)
    print(set(ch_id))

In [29]:
# For each pathogen, list the ChEMBL assay IDs that report results in the unit
# "mM l-1" so they can be looked up in the ChEMBL database.
for p in pathogen_list:
    original_csv = os.path.join(DATAPATH, f"{p}", f"{p}_original.csv")
    df = pd.read_csv(original_csv, low_memory=False)
    has_unit = df["standard_units"] == "mM l-1"
    ch_id = df.loc[has_unit, "assay_chembl_id"].values
    # Stay silent for pathogens that have no matching assay.
    if len(ch_id) == 0:
        continue
    print(p)
    print(set(ch_id))

saureus
{'CHEMBL731763'}


## Evaluate processed files to select relevant assays

In [90]:
# Read each pathogen's processed table and stack them into one DataFrame.
dfs = []
for p in pathogen_list:
    print(p)  # progress trace: one line per pathogen as it is loaded
    processed_csv = os.path.join(DATAPATH, f"{p}", f"{p}_processed.csv")
    dfs.append(pd.read_csv(processed_csv, low_memory=False))
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
df.shape

calbicans
campylobacter
ecoli
efaecium
enterobacter
hpylori
kpneumoniae
mtuberculosis
ngonorrohoeae
paeruginosa
pfalciparum
saureus
smansoni
spneumoniae


(2036217, 18)

In [91]:
# Label rows without a unit as 'N/A' so they form their own group below
# (groupby drops rows whose key is missing by default).
df['final_unit'] = df['final_unit'].fillna('N/A')
pair_sizes = df.groupby(['standard_type', 'final_unit']).size()
au = pair_sizes.reset_index(name='count')
# Only (standard_type, final_unit) pairs with over 250 results are considered.
au_ = au.loc[au["count"].gt(250)]
summary_csv = os.path.join(CONFIGPATH, "st_type_summary.csv")
au_.to_csv(summary_csv, index=False)

In [97]:
# Load the manually edited summary table. DO NOT OVERWRITE THIS FILE.
# (Presumably a hand-curated version of st_type_summary.csv -- confirm.)
au = pd.read_csv(os.path.join(CONFIGPATH, "st_type_summary_manual.csv"))
# read_csv parses the literal "N/A" (and empty cells) as missing by default, so
# the 'N/A' unit placeholder used in df['final_unit'] comes back as NaN.
# Restore it here; otherwise unit-less rows cannot match on 'final_unit' in the
# later merge and end up without percentiles.
au["final_unit"] = au["final_unit"].fillna("N/A")
au.shape

(124, 7)

In [98]:
# Percentiles can only be computed on rows that have a final_value,
# so keep just those rows.
has_value = df["final_value"].notna()
df_ = df[has_value]
df_.shape

(1864605, 18)

In [99]:
# Distribution of final_value for every (standard_type, final_unit) pair:
# row count plus the listed percentiles; mean/std/min/max are discarded.
values_by_pair = df_.groupby(['standard_type', 'final_unit'])['final_value']
df_perc = values_by_pair.describe(percentiles=[0.05, 0.1, 0.2, 0.25, 0.50, 0.75, 0.8, 0.9, 0.95])
df_perc = df_perc.drop(columns=['mean', 'std', 'min', 'max']).reset_index()

# Alphabetical, case-insensitive ordering of the percentile table.
df_perc = df_perc.sort_values(
    by=['standard_type', 'final_unit'],
    key=lambda col: col.str.lower(),
)

# Key columns followed by the percentile labels produced by describe().
cols = [
    'standard_type', 'final_unit',
    '5%', '10%', '20%', '25%', '50%', '75%', '80%', '90%', '95%',
]
# Left join: every row of `au` is kept, and pairs that have no percentiles
# get NaN in the percentile columns.
merged_df = au.merge(df_perc[cols], on=['standard_type', 'final_unit'], how='left')
percentiles_csv = os.path.join(CONFIGPATH, "percentiles.csv")
merged_df.to_csv(percentiles_csv, index=False)