In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import glob

In [2]:
# Project root; every data/report directory used in this notebook hangs off it.
# All directory strings keep their trailing slash because later cells build
# file paths by plain string concatenation (e.g. interim_dir + 'file.csv').
project_dir = '/home/grace/Documents/python/acute_oral/'
data_dir = project_dir + 'data/'

raw_dat_dir = data_dir + 'raw/'
processed_dat_dir = data_dir + 'processed/'
interim_dat_dir = data_dir + 'interim/'
figures_dir = project_dir + 'reports/figures/'
test_dir = data_dir + 'external/TEST/'
times_dir = data_dir + 'external/TIMES/'
# Same location as interim_dat_dir; both names are used by later cells.
interim_dir = interim_dat_dir

### Extract examples from overlap set of chemicals for the 2 models

#### Import relevant dataset of predictions from the 2 models

In [3]:
# Load the combined prediction table for the two models from the interim folder.
acute_pred_path = interim_dir + 'relevant_acute_pred.csv'
relevant_acute_pred = pd.read_csv(acute_pred_path)

In [4]:
# Overlap set: rows where BOTH 'known_pLD50_minus_TEST' and
# 'known_pLD50_minus_TIMES' are populated, i.e. chemicals that have a value
# for each of the two models.
has_test_diff = relevant_acute_pred['known_pLD50_minus_TEST'].notna()
has_times_diff = relevant_acute_pred['known_pLD50_minus_TIMES'].notna()
mask2 = has_test_diff & has_times_diff
tt_overlap = relevant_acute_pred.loc[mask2]


In [5]:
# (rows, columns) of the overlap set.
tt_overlap.shape

(274, 21)

In [6]:
# Spot-check one chemical (CAS 97-86-9) in the overlap set.
tt_overlap.loc[tt_overlap['CAS'] == '97-86-9']

Unnamed: 0.1,Unnamed: 0,CAS,known_LD50_mgkg,TEST_pLD50,chem_name,predicted_toxicological_category,known_LD50_TIMES,TIMES_LD50_mgkg,found_by,DTXSID,...,average_mass,qsar_ready_smiles,smiles,known_pLD50,TIMES_pLD50,TEST_LD50_mgkg,known_pLD50_minus_TEST,known_pLD50_minus_TIMES,TIMES,TEST
8840,8840,97-86-9,9590.0,1.61,DTXSID3025461,Methacrylic acid esters,no data,5280.0,CAS-RN,DTXSID3025461,...,142.198,CC(C)COC(=O)C(C)=C,CC(C)COC(=O)C(C)=C,1.171075,1.43026,3490.546984,-0.438925,-0.259185,Below_CI,Below_CI


In [7]:
# List the columns available in the overlap set.
tt_overlap.columns

Index(['Unnamed: 0', 'CAS', 'known_LD50_mgkg', 'TEST_pLD50', 'chem_name',
       'predicted_toxicological_category', 'known_LD50_TIMES',
       'TIMES_LD50_mgkg', 'found_by', 'DTXSID', 'name', 'average_mass',
       'qsar_ready_smiles', 'smiles', 'known_pLD50', 'TIMES_pLD50',
       'TEST_LD50_mgkg', 'known_pLD50_minus_TEST', 'known_pLD50_minus_TIMES',
       'TIMES', 'TEST'],
      dtype='object')

#### Pull out the identifiers

In [8]:
# Distinct CAS numbers of the overlap chemicals, as a plain Python list.
ids = tt_overlap['CAS'].unique().tolist()

### Read in the TxP fingerprints for all the chemicals and subset them to the identifiers (`ids`) from the overlap set

In [9]:
# Tab-separated ToxPrint fingerprint table for all chemicals.
toxprints_path = raw_dat_dir + 'Acute_SMILES_Toxprints.tsv'
txps = pd.read_csv(toxprints_path, sep='\t')

In [10]:
# Keep only fingerprint rows whose M_NAME matches one of the overlap-set ids.
in_overlap = txps['M_NAME'].isin(ids)
tt_txps = txps.loc[in_overlap]

In [11]:
# Reshape wide -> long: one row per (M_NAME, fingerprint column) pair, with the
# column name in 'variable' and its cell in 'value'.
# NOTE(review): taking columns[1:] assumes M_NAME is the first column - confirm.
fingerprint_cols = tt_txps.columns.tolist()[1:]
txps_df = tt_txps.melt(id_vars=['M_NAME'], value_vars=fingerprint_cols)

In [12]:
# Turn every 0 into NaN so that the dropna in the following cell removes
# those rows.
txps_df = txps_df.replace(to_replace=0, value=np.nan)
    

In [13]:
# Drop every row containing at least one NaN (the default how='any').
txps_df = txps_df.dropna()

In [14]:
# Order rows by identifier, ascending (the sort_values default).
txps_df = txps_df.sort_values('M_NAME')

#### Extract the TxPs for a specific example chemical to explore the derivation of the confidence metric

In [15]:
# Names of the fingerprint columns retained for the example chemical 132-32-1.
is_example = txps_df['M_NAME'] == '132-32-1'
txp_chem = txps_df.loc[is_example, 'variable'].tolist()

In [16]:
# Show the ToxPrint names collected for the example chemical.
txp_chem

['bond:CN_amine_pri-NH2_aromatic',
 'chain:alkaneLinear_ethyl_C2(H_gt_1)',
 'bond:CN_amine_aromatic_generic',
 'bond:CN_amine_pri-NH2_generic',
 'ring:hetero_[5]_N_pyrrole_generic',
 'ring:hetero_[5_6]_N_indole',
 'ring:hetero_[6_5_6]_N_carbazole',
 'ring:hetero_[5]_Z_1-Z',
 'bond:CN_amine_aliphatic_generic',
 'ring:hetero_[5_6]_Z_generic',
 'ring:aromatic_benzene',
 'ring:hetero_[5]_N_pyrrole']

#### Read in probabilities of TxPs as computed in notebook 03

In [17]:
# Per-ToxPrint probability table from the interim folder.
pivot_probs_path = interim_dat_dir + 'all_pivot_probs.csv'
pivot_probs = pd.read_csv(pivot_probs_path)

#### For the TxPs of the specific example chemical, identify the relevant probabilities using the pivot_probs df

In [18]:
# Probability rows for exactly the ToxPrints listed in txp_chem.
is_example_txp = pivot_probs['TxP'].isin(txp_chem)
example_prob = pivot_probs.loc[is_example_txp]


#### Show the set of probabilities for the ToxPrints from both models, TEST and TIMES

In [19]:
# 'TEST' column of the probability rows selected for the example chemical.
example_prob['TEST']

68     0.344326
72     0.155189
74     0.060136
75     0.116392
256    0.454898
350    0.624636
373    0.024248
374    0.068865
385    0.124151
396    0.019399
402    0.099903
430    0.002910
Name: TEST, dtype: float64

In [20]:
# 'TIMES' column of the probability rows selected for the example chemical.
example_prob['TIMES']

68     0.323718
72     0.195513
74     0.115385
75     0.166667
256    0.375000
350    0.628205
373    0.025641
374    0.032051
385    0.073718
396    0.025641
402    0.214744
430    0.016026
Name: TIMES, dtype: float64

#### Compute the joint probability of all those TxPs for each model. The individual probabilities are treated as independent, so the joint probability is the product of the individual probabilities. The metric is simply the ratio of the two products (TEST product divided by TIMES product). This is reflected in Table 9 of the manuscript as an example.

In [21]:
# Ratio of the two products: product of the TEST probabilities divided by the
# product of the TIMES probabilities.
example_prob['TEST'].prod() / example_prob['TIMES'].prod()

0.08105434954259275

#### Providing another worked example as shown in Table 9 of the manuscript

In [22]:
# Names of the fingerprint columns retained for the second example, 28782-19-6.
is_example2 = txps_df['M_NAME'] == '28782-19-6'
txp_chem2 = txps_df.loc[is_example2, 'variable'].tolist()

In [23]:
# Show the ToxPrint names collected for the second example chemical.
txp_chem2 

['ring:hetero_[6_6]_O_benzopyrone_(1_4-)',
 'group:ligand_path_4_bidentate_aminoethanol',
 'bond:C=O_carbonyl_ab-unsaturated_generic',
 'bond:CN_amine_aliphatic_generic',
 'chain:alkeneCyclic_ethene_generic',
 'chain:aromaticAlkane_Ph-C1_acyclic_generic',
 'bond:CC(=O)C_ketone_alkene_generic',
 'bond:C(=O)O_carboxylicEster_aromatic',
 'bond:CN_amine_ter-N_generic',
 'ring:aromatic_benzene',
 'ring:hetero_[6]_N_pyridine_generic',
 'bond:C=O_carbonyl_generic',
 'bond:CC(=O)C_ketone_alkene_cyclic_2-en-1-one_generic',
 'chain:alkaneLinear_ethyl_C2(H_gt_1)',
 'bond:CN_amine_alicyclic_generic',
 'ring:hetero_[6]_O_pyran_generic',
 'bond:COC_ether_aliphatic__aromatic',
 'ring:aromatic_phenyl',
 'ring:hetero_[6_6]_Z_generic',
 'ring:hetero_[6]_Z_generic',
 'bond:CC(=O)C_ketone_generic',
 'bond:CN_amine_ter-N_aliphatic',
 'chain:alkaneCyclic_ethyl_C2_(connect_noZ)',
 'bond:CC(=O)C_ketone_alkane_cyclic',
 'bond:CC(=O)C_ketone_alkene_cyclic_2-en-1-one',
 'ring:hetero_[6_6]_O_benzopyran',
 'chain:

In [24]:
# Probability rows for exactly the ToxPrints listed in txp_chem2.
is_example2_txp = pivot_probs['TxP'].isin(txp_chem2)
example_prob2 = pivot_probs.loc[is_example2_txp]

In [25]:
# Print the product of the probabilities per model: TEST first, then TIMES.
test_product = example_prob2['TEST'].prod()
times_product = example_prob2['TIMES'].prod()
print(test_product)
print(times_product)

1.7210941764174107e-32
1.2193327876404212e-34


In [26]:
# Ratio of the two products for the second example: TEST product divided by
# TIMES product.
example_prob2['TEST'].prod() / example_prob2['TIMES'].prod()

141.15048769810969