In [3]:
import os
import pandas as pd

# Root data folder; the cells below read its "pathogen_original" and "pathogen_processed" subfolders
DATAPATH = "../data"
# Folder holding the configuration CSVs this notebook reads and writes
# (units.csv, st_type_summary.csv, st_type_summary_manual.csv, cutoff_config.csv)
CONFIGPATH = "../config"

# Get units to standardise
Collect all the units used in the assays so that they can be standardised according to the UCUM guidelines.

In [9]:
# Collect every distinct value of `standard_units` found in the original pathogen
# files and save them, in a stable order, to CONFIGPATH/units.csv.
pathogen_dir = os.path.join(DATAPATH, "pathogen_original")
file_list = os.listdir(pathogen_dir)

unit_columns = []
for filename in file_list:
    # Only the units column is needed here, so do not parse the other columns.
    units = pd.read_csv(
        os.path.join(pathogen_dir, filename),
        usecols=["standard_units"],
        low_memory=False,
    )["standard_units"]
    unit_columns.append(units.drop_duplicates())

if unit_columns:
    # Sorting makes the output file identical from run to run (iterating a set of
    # strings has no stable order); a missing unit, if present, is kept as the last row.
    allunits = (
        pd.concat(unit_columns, ignore_index=True)
        .drop_duplicates()
        .sort_values(na_position="last")
        .tolist()
    )
else:
    allunits = []

df = pd.DataFrame({"units": allunits})
df.to_csv(os.path.join(CONFIGPATH, "units.csv"), index=False) #this file will be processed manually in the UCUM website

In [58]:
# Helper cell: for each downloaded pathogen file, print the ChEMBL assay IDs of the
# rows whose standard_type equals the literal below (edit it to inspect other types)
pathogen_dir = os.path.join(DATAPATH, "pathogen_original")
file_list = os.listdir(pathogen_dir)
for filename in file_list:
    df = pd.read_csv(os.path.join(pathogen_dir, filename), low_memory=False)
    print(filename)
    is_selected_type = df["standard_type"] == "Percent Effect"
    print(df.loc[is_selected_type, "assay_chembl_id"])

mtuberculosis.csv
35203     CHEMBL4649971
35204     CHEMBL4649961
35268     CHEMBL4649971
35365     CHEMBL4649971
35366     CHEMBL4649961
              ...      
474920    CHEMBL4649972
474921    CHEMBL4649972
474922    CHEMBL4649972
474923    CHEMBL4649972
474924    CHEMBL4649972
Name: assay_chembl_id, Length: 398254, dtype: object
enterobacter.csv
Series([], Name: assay_chembl_id, dtype: object)
saureus.csv
Series([], Name: assay_chembl_id, dtype: object)
paeruginosa.csv
Series([], Name: assay_chembl_id, dtype: object)
abaumannii.csv
Series([], Name: assay_chembl_id, dtype: object)
kpneumoniae.csv
Series([], Name: assay_chembl_id, dtype: object)
efaecium.csv
Series([], Name: assay_chembl_id, dtype: object)
pfalciparum.csv
12603     CHEMBL4649943
12604     CHEMBL4649943
12605     CHEMBL4649943
12609     CHEMBL4649943
12610     CHEMBL4649943
              ...      
499887    CHEMBL4649945
499890    CHEMBL4649945
499893    CHEMBL4649945
499896    CHEMBL4649945
499899    CHEMBL4649964
Na

### Create functions to automatically convert units

In [1]:
# Proof of concept: a formula stored as text can be turned into a callable with eval
formula_text = "lambda x, y: x*y/1000"
f = eval(formula_text)
f(3,2)

In [18]:
s = 'standard_value/(molecular_weight/1000)/1000'

def parse_function(s):
    """Turn a textual conversion formula into a callable.

    The formula is a Python expression written with the placeholder names
    ``standard_value`` and, optionally, ``molecular_weight``.

    Parameters
    ----------
    s : str
        Formula text, e.g. ``'standard_value/(molecular_weight/1000)/1000'``.
        Any non-string value (``None``, a float NaN, ...) is treated as
        "no formula".

    Returns
    -------
    callable or None
        ``None`` when ``s`` is not a string or does not mention
        ``standard_value``. Otherwise a function ``f(x, y)`` when the formula
        mentions ``molecular_weight`` (``x`` replaces ``standard_value`` and
        ``y`` replaces ``molecular_weight``), or ``f(x)`` when it does not.
    """
    # Guard against non-string input: the `in` test below would raise TypeError on it.
    if not isinstance(s, str) or 'standard_value' not in s:
        return None
    if "molecular_weight" in s:
        p = "lambda x,y: "
    else:
        p = "lambda x: "
    s = s.replace("molecular_weight", "y")
    s = s.replace("standard_value", "x")
    s = p + s
    # NOTE: eval executes arbitrary Python; only pass formulas from a trusted source.
    return eval(s)

f = parse_function(s)
f(1,2)

## Evaluate processed files to select relevant assays

In [100]:
# Read every processed pathogen table and stack them into a single frame
processed_dir = os.path.join(DATAPATH, "pathogen_processed")
file_list = os.listdir(processed_dir)
dfs = [
    pd.read_csv(os.path.join(processed_dir, filename), low_memory=False)
    for filename in file_list
]
df = pd.concat(dfs, ignore_index=True)

In [101]:
# Label missing units explicitly before grouping
df['final_units'] = df['final_units'].fillna('N/A')

# Number of results for each (standard_type, final_units) pair
au = (
    df.groupby(['standard_type', 'final_units'])
    .size()
    .reset_index(name='count')
)

#only assays with over 250 results will be considered
au_ = au.loc[au["count"] > 250]
au_.to_csv(os.path.join(CONFIGPATH, "st_type_summary.csv"), index=False)

In [108]:
# Load the manually curated summary and keep the rows whose `use` flag is not 0.
au = pd.read_csv(os.path.join(CONFIGPATH, "st_type_summary_manual.csv")) #DO NOT OVERWRITE THIS FILE
# Explicit copy: later cells add columns to `au`, which on a filtered slice
# triggers pandas' SettingWithCopyWarning.
au = au.loc[au["use"] != 0].copy()

In [110]:
# Percentile profile of final_value for every (standard_type, final_units) pair,
# with the most populated pairs first
percentile_levels = [0.05, 0.1, 0.2, 0.25, 0.50, 0.75, 0.8, 0.9, 0.95]
df_perc = (
    df.groupby(['standard_type', 'final_units'])['final_value']
    .describe(percentiles=percentile_levels)
    .drop(columns=['mean', 'std', 'min', 'max'])
    .sort_values(['count'], ascending=False)
    .reset_index()
)

# Keep only the pairs with more than 250 values
df_perc = df_perc.loc[df_perc["count"] > 250]

# Display in case-insensitive alphabetical order (the sorted copy is shown, not stored)
df_perc.sort_values(
    by=['standard_type', 'final_units'],
    key=lambda col: col.str.lower(),
)

Unnamed: 0,standard_type,final_units,count,5%,10%,20%,25%,50%,75%,80%,90%,95%
44,% Control,%,324.0,71.12,77.0,83.0,85.0,94.9,100.0,100.0,100.0,100.0
31,AbsAC1000_uM,umol,505.0,1.0442,1.6982,2.6062,3.206,5.244,6.425,6.846,8.7844,11.188
34,AbsAC35,umol,443.0,0.4246,1.144,2.384,3.46,9.9,21.065,24.324,34.296,38.829
9,AC50,umol,5870.0,0.74049,1.5918,3.2044,4.02225,12.135,80.5375,118.3,380.0,380.0
8,Activity,%,6103.0,-2.3,0.0102,5.0,8.8,48.61,88.0,92.6,99.9,100.0
23,Activity,log10CFU,954.0,-0.718,0.3,1.0,1.2325,2.71,4.91,5.432,6.92,8.7105
26,Activity,log10CFU/ml,619.0,-1.754,-1.1,0.1,0.51,2.5,4.0,4.552,6.0,7.512
38,Activity,,389.0,7.6e-08,0.02,0.24,0.39,2.0,23.0,28.0,360.0,48798.332
21,Activity,umol,1073.0,0.07696989,0.542137,2.80664,4.252804,26.9,197.840374,259.391672,755.929984,3429.84163
7,EC50,umol,6735.0,0.012,0.04648,0.217,0.357,3.1,15.0,20.0,100.0,130.797613


In [111]:
# Attach the percentile columns of df_perc to au.
# The rows are matched on (standard_type, final_units) rather than on the row index:
# au keeps the row order of the manual CSV, while df_perc was sorted by count and
# filtered, so the two indices do not refer to the same assays.
pct_cols = ["5%", "10%", "20%", "25%", "50%", "75%", "80%", "90%", "95%"]
key_cols = ["standard_type", "final_units"]

pct_lookup = df_perc.set_index(key_cols)[pct_cols]

# NOTE(review): missing units are labelled 'N/A' in df_perc, but pandas.read_csv
# parses 'N/A' as NaN by default, so au presumably holds NaN there - the lookup
# key is normalised without modifying au["final_units"] itself. Confirm.
au_keys = pd.MultiIndex.from_arrays(
    [au["standard_type"], au["final_units"].fillna("N/A")],
    names=key_cols,
)

# Pairs that are absent from df_perc get NaN percentiles.
matched = pct_lookup.reindex(au_keys)
for col in pct_cols:
    au[col] = matched[col].to_numpy()

In [113]:
# Cut-offs per (standard_type, final_units) pair, stored as [low_cut, high_cut].
# Threshold values are set at approximately the 10% and 50% percentiles of the
# distribution, unless the values are heavily skewed.
thresh = {
    ("AC50", "umol"):[1, 10],
    ("Activity", "%"):[50,90],
    ("Activity", "log10CFU"):[4, 10],
    ("Activity", "log10CFU/ml"):[1.5, 10],
    ("EC50", "umol"):[1.5, 10],
    ("ED50", "ug.mg-1"):[2, 10],
    ("ED50", "umol"):[1, 4],
    # NOTE(review): this pair is in the opposite order to the other "%" entries
    # ([50, 90]); confirm whether [90, 50] is intended.
    ("GI", "%"):[90, 50],
    ("IC50", "umol"):[2.5,10],
    ("IC90", "umol"):[2.5,10],
    ("Inhibition", "%"):[50,90],
    ("IZ", "mm"):[2,5],
    ("Ki", "umol"):[2.5,10],
    ("MBC", "umol"):[1,2.5],
    ("MBC99.9", "umol"):[1,2.5],
    ("MBEC", "umol"):[1,2.5],
    ("MEC", "umol"):[1,5],
    ("MIC", "umol"):[1,5],
    ("MIC100", "umol"):[1,2.5],
    ("MIC50", "umol"):[1,2.5],
    ("MIC80", "umol"):[1,5],
    ("MIC90", "umol"):[1,5],
    ("MIC95", "umol"):[1,5],
    ("MIC99", "umol"):[1,5],
    ("MIC=>80", "umol"):[1,5],
    ("MIC=>90", "umol"):[1,5],
    ("MIC>90", "umol"):[1,5],
    ("MIC>99", "umol"):[1,5],
    ("Percent Effect", "%"):[50,90],
    ("Potency", "umol"):[1,5],
}

In [114]:
# Look up the [low_cut, high_cut] pair of every row of au in `thresh`.
# Rows whose (standard_type, final_units) pair has no entry get NaN in both columns.
# Both columns are always created as float64, so their dtype does not depend on
# which row happens to come first, and they exist even when au has no rows.
no_cut = (float("nan"), float("nan"))
cut_pairs = [
    thresh.get((standard_type, final_units), no_cut)
    for standard_type, final_units in zip(au["standard_type"], au["final_units"])
]
au["low_cut"] = pd.Series([pair[0] for pair in cut_pairs], index=au.index, dtype="float64")
au["high_cut"] = pd.Series([pair[1] for pair in cut_pairs], index=au.index, dtype="float64")

In [116]:
# Remove the manual-annotation columns, then save the final cut-off configuration
annotation_cols = ["comments", "use", "cut-off"]
au = au.drop(columns=annotation_cols)
au.to_csv(os.path.join(CONFIGPATH, "cutoff_config.csv"), index=False)