This notebook takes the broad assay category model predictions for ChEMBL 35 and reformats them into a cleaner output file for inspection, further use, and presentation.

In [26]:
import ast

import numpy as np
import pandas as pd
# Turn off pandas' chained-assignment warning (SettingWithCopyWarning) for this session.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Load the tab-separated prediction results (read_table uses '\t' as its separator).
results = pd.read_table('chembl_35_broad_results.txt')

In [3]:
# Preview the first five rows of the raw results
results.head(5)

Unnamed: 0.1,Unnamed: 0,assay_id,year,description,assay_type,cats
0,0,89,1994.0,Evaluated for its activity to inhibit rat live...,B,{'Radioligand binding (BAO_0002776)': 0.000140...
1,1,235,2003.0,Inhibitory activity against inosine 5'-inosine...,B,{'Radioligand binding (BAO_0002776)': 4.709044...
2,2,667,2003.0,5-hydroxytryptamine 1A receptor antagonism was...,F,{'Radioligand binding (BAO_0002776)': 0.749271...
3,3,758,2001.0,Binding affinity towards human 5-hydroxytrypta...,B,{'Radioligand binding (BAO_0002776)': 0.001449...
4,4,1155,1992.0,Compound was evaluated for in vitro binding af...,B,{'Radioligand binding (BAO_0002776)': 0.999997...


In [4]:
# This cell takes a few minutes
# Parse each stringified dict in 'cats' back into a real dict (stored in 'cats_df').
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in the input file.
results['cats_df'] = results['cats'].apply(ast.literal_eval)

In [5]:
# Preview the first five rows, now including the parsed 'cats_df' column
results.head(5)

Unnamed: 0.1,Unnamed: 0,assay_id,year,description,assay_type,cats,cats_df
0,0,89,1994.0,Evaluated for its activity to inhibit rat live...,B,{'Radioligand binding (BAO_0002776)': 0.000140...,{'Radioligand binding (BAO_0002776)': 0.000140...
1,1,235,2003.0,Inhibitory activity against inosine 5'-inosine...,B,{'Radioligand binding (BAO_0002776)': 4.709044...,{'Radioligand binding (BAO_0002776)': 4.709044...
2,2,667,2003.0,5-hydroxytryptamine 1A receptor antagonism was...,F,{'Radioligand binding (BAO_0002776)': 0.749271...,{'Radioligand binding (BAO_0002776)': 0.749271...
3,3,758,2001.0,Binding affinity towards human 5-hydroxytrypta...,B,{'Radioligand binding (BAO_0002776)': 0.001449...,{'Radioligand binding (BAO_0002776)': 0.001449...
4,4,1155,1992.0,Compound was evaluated for in vitro binding af...,B,{'Radioligand binding (BAO_0002776)': 0.999997...,{'Radioligand binding (BAO_0002776)': 0.999997...


In [6]:
def find_cat_prediction(x, column_name):
    """Return the value stored under ``column_name`` in ``x``.

    ``x`` is indexed with ``column_name`` and the looked-up value is
    returned unchanged; a missing key raises whatever ``x[column_name]``
    raises.
    """
    return x[column_name]

In [7]:
# Category labels, taken from the score dict of the first row
keys = results['cats_df'].iloc[0].keys()
keys

dict_keys(['Radioligand binding (BAO_0002776)', 'Binding (BAO_0002989)', 'Protein activity (BAO_0013016)', 'in vivo method (BAO_0040021)', 'Cell phenotype (BAO_0002542)', 'Nucleic acid binding', 'Antimicrobial activity'])

In [8]:
# Show each category label on its own line
for category_label in keys:
    print(category_label)

Radioligand binding (BAO_0002776)
Binding (BAO_0002989)
Protein activity (BAO_0013016)
in vivo method (BAO_0040021)
Cell phenotype (BAO_0002542)
Nucleic acid binding
Antimicrobial activity


In [9]:
# Add one column per category label, filled from each row's score dict.
# The label is handed to the helper through `args` rather than a closure.
for column_name in keys:
    results[column_name] = results['cats_df'].apply(find_cat_prediction, args=(column_name,))

In [10]:
# Preview the first five rows with the per-category score columns added
results.head(5)

Unnamed: 0.1,Unnamed: 0,assay_id,year,description,assay_type,cats,cats_df,Radioligand binding (BAO_0002776),Binding (BAO_0002989),Protein activity (BAO_0013016),in vivo method (BAO_0040021),Cell phenotype (BAO_0002542),Nucleic acid binding,Antimicrobial activity
0,0,89,1994.0,Evaluated for its activity to inhibit rat live...,B,{'Radioligand binding (BAO_0002776)': 0.000140...,{'Radioligand binding (BAO_0002776)': 0.000140...,0.000141,0.020719,0.820549,0.00034,0.000403,0.000183,0.004031
1,1,235,2003.0,Inhibitory activity against inosine 5'-inosine...,B,{'Radioligand binding (BAO_0002776)': 4.709044...,{'Radioligand binding (BAO_0002776)': 4.709044...,4.7e-05,0.000101,0.760096,3.4e-05,0.36885,1.2e-05,0.003083
2,2,667,2003.0,5-hydroxytryptamine 1A receptor antagonism was...,F,{'Radioligand binding (BAO_0002776)': 0.749271...,{'Radioligand binding (BAO_0002776)': 0.749271...,0.749272,0.003896,0.000237,0.000165,0.005068,0.005397,7.2e-05
3,3,758,2001.0,Binding affinity towards human 5-hydroxytrypta...,B,{'Radioligand binding (BAO_0002776)': 0.001449...,{'Radioligand binding (BAO_0002776)': 0.001449...,0.00145,0.999711,7e-06,6.3e-05,1.3e-05,0.000615,0.000136
4,4,1155,1992.0,Compound was evaluated for in vitro binding af...,B,{'Radioligand binding (BAO_0002776)': 0.999997...,{'Radioligand binding (BAO_0002776)': 0.999997...,0.999998,1e-05,3e-05,0.001088,5e-06,3.1e-05,0.000692


In [11]:
# Winning category per row: the label with the highest score, and that score itself
results['predicted_cat'] = results['cats_df'].apply(lambda scores: max(scores, key=scores.get))
results['prediction_score'] = results['cats_df'].apply(lambda scores: max(scores.values()))

In [12]:
# Runner-up score: sort each row's scores ascending and take the one before the last
results['second_largest_score'] = results['cats_df'].map(lambda scores: sorted(scores.values())[-2])

In [20]:
# Keep only the text before the first ' (' (i.e. drop a trailing '(BAO_...)' suffix)
results['predicted_category'] = results['predicted_cat'].apply(lambda label: label.partition(' (')[0])

In [40]:
# Distinct predicted labels, each shown at its first occurrence
results['predicted_cat'].drop_duplicates(keep='first')

0         Protein activity (BAO_0013016)
2      Radioligand binding (BAO_0002776)
3                  Binding (BAO_0002989)
21          Cell phenotype (BAO_0002542)
33          in vivo method (BAO_0040021)
41                Antimicrobial activity
147                 Nucleic acid binding
Name: predicted_cat, dtype: object

In [41]:
# Pull the identifier out of labels shaped like 'Name (BAO_xxxxxxx)'; labels
# without that suffix get NaN. The guard tests for the same ' (' delimiter that
# the split uses, so a label containing '(' without a preceding space can no
# longer raise IndexError.
results['predicted_bao_id'] = results['predicted_cat'].apply(
    lambda label: label.split(' (')[1].rstrip(')') if ' (' in label else np.nan
)

In [42]:
# BAO identifier -> preferred term used in the output file
bao_term_dict = {
    'BAO_0013016': 'functional target-based',
    'BAO_0002989': 'binding assay',
    'BAO_0002776': 'radioligand binding assay',
    'BAO_0002542': 'cell phenotype',
    'BAO_0040021': 'in vivo assay method',
}

In [43]:
# Fill in bao preferred terms
# Rows without a BAO id hold a missing value; detect it with pd.isna rather than
# an identity test against np.nan, which only matches that exact float object
# and would send any other NaN into the dict lookup (KeyError).
# An id that is present but absent from bao_term_dict still raises KeyError.
results['predicted_bao_term'] = results['predicted_bao_id'].apply(
    lambda bao_id: np.nan if pd.isna(bao_id) else bao_term_dict[bao_id]
)

In [44]:
# One row per distinct (category, BAO term, BAO id) combination, to check the mapping by eye
results.loc[:, ['predicted_category', 'predicted_bao_term', 'predicted_bao_id']].drop_duplicates()

Unnamed: 0,predicted_category,predicted_bao_term,predicted_bao_id
0,Protein activity,functional target-based,BAO_0013016
2,Radioligand binding,radioligand binding assay,BAO_0002776
3,Binding,binding assay,BAO_0002989
21,Cell phenotype,cell phenotype,BAO_0002542
33,in vivo method,in vivo assay method,BAO_0040021
41,Antimicrobial activity,,
147,Nucleic acid binding,,


In [45]:
# List every column currently on the frame, as a reference for choosing what to export
results.columns

Index(['Unnamed: 0', 'assay_id', 'year', 'description', 'assay_type', 'cats',
       'cats_df', 'Radioligand binding (BAO_0002776)', 'Binding (BAO_0002989)',
       'Protein activity (BAO_0013016)', 'in vivo method (BAO_0040021)',
       'Cell phenotype (BAO_0002542)', 'Nucleic acid binding',
       'Antimicrobial activity', 'predicted_cat', 'prediction_score',
       'second_largest_score', 'predicted_category', 'predicted_bao_id',
       'predicted_bao_term'],
      dtype='object')

In [46]:
# Columns written to the processed output file, in output order
chosen_columns = [
    # assay metadata
    'assay_id',
    'year',
    'description',
    'assay_type',
    # top prediction and its BAO annotation
    'predicted_category',
    'prediction_score',
    'predicted_bao_id',
    'predicted_bao_term',
    # per-category scores
    'Radioligand binding (BAO_0002776)',
    'Binding (BAO_0002989)',
    'Protein activity (BAO_0013016)',
    'in vivo method (BAO_0040021)',
    'Cell phenotype (BAO_0002542)',
    'Nucleic acid binding',
    'Antimicrobial activity',
]

In [48]:
# Preview the first rows of the frame restricted to the export columns
results.loc[:, chosen_columns].head()

Unnamed: 0,assay_id,year,description,assay_type,predicted_category,prediction_score,predicted_bao_id,predicted_bao_term,Radioligand binding (BAO_0002776),Binding (BAO_0002989),Protein activity (BAO_0013016),in vivo method (BAO_0040021),Cell phenotype (BAO_0002542),Nucleic acid binding,Antimicrobial activity
0,89,1994.0,Evaluated for its activity to inhibit rat live...,B,Protein activity,0.820549,BAO_0013016,functional target-based,0.000141,0.020719,0.820549,0.00034,0.000403,0.000183,0.004031
1,235,2003.0,Inhibitory activity against inosine 5'-inosine...,B,Protein activity,0.760096,BAO_0013016,functional target-based,4.7e-05,0.000101,0.760096,3.4e-05,0.36885,1.2e-05,0.003083
2,667,2003.0,5-hydroxytryptamine 1A receptor antagonism was...,F,Radioligand binding,0.749272,BAO_0002776,radioligand binding assay,0.749272,0.003896,0.000237,0.000165,0.005068,0.005397,7.2e-05
3,758,2001.0,Binding affinity towards human 5-hydroxytrypta...,B,Binding,0.999711,BAO_0002989,binding assay,0.00145,0.999711,7e-06,6.3e-05,1.3e-05,0.000615,0.000136
4,1155,1992.0,Compound was evaluated for in vitro binding af...,B,Radioligand binding,0.999998,BAO_0002776,radioligand binding assay,0.999998,1e-05,3e-05,0.001088,5e-06,3.1e-05,0.000692


In [49]:
# Write the selected columns as a tab-separated file, without the row index
results.loc[:, chosen_columns].to_csv('chembl_35_broad_results_processed.txt', sep='\t', index=False)