#### Creation of genotoxicity datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
raw_dat_dir = '/home/grace/Documents/python/genetox/data/raw/'
processed_dat_dir = '/home/grace/Documents/python/genetox/data/processed/'
interim_dat_dir = '/home/grace/Documents/python/genetox/data/interim/'
figures_dir = '/home/grace/Documents/python/genetox/reports/figures/'


In [4]:
raw_genetox = pd.read_excel(raw_dat_dir+'combined_genetox_standard_2021-02-05.xlsx')

In [6]:
raw_genetox.shape

(54805, 31)

In [704]:
#raw_genetox.assay_type.unique()

In [479]:
#raw_genetox[raw_genetox['dtxsid'] == 'DTXSID0024896']

#### Download the DTXSIDs in order to extract SMILES and QSAR_READY_SMILES from the EPA CompTox Chemicals Dashboard

There is 5000 limit for downloads hence performed 2 searches to extract all the SMILES information with the intention of concatenating the 2 datasets together

In [11]:
dashboard_smi_1 = pd.read_csv('/home/grace/Downloads/CompToxChemicalsDashboard-Batch-Search_2021-02-08_21 09 01.csv')

In [12]:
dashboard_smi_2 = pd.read_csv('/home/grace/Downloads/CompToxChemicalsDashboard-Batch-Search_2021-02-08_21 14 30.csv')

In [13]:
dashboard_smi_1.shape

(4999, 23)

In [15]:
dashboard_smi_2.shape

(4301, 23)

In [717]:
4999+4301

9300

In [19]:
genetox_smi = pd.concat([dashboard_smi_1[['DTXSID', 'PREFERRED_NAME', 'CASRN', 'SMILES',
       'QSAR_READY_SMILES']], dashboard_smi_2[['DTXSID', 'PREFERRED_NAME', 'CASRN', 'SMILES',
       'QSAR_READY_SMILES']]], axis = 0)

In [20]:
genetox_smi.shape

(9300, 5)

In [23]:
genetox_smi = genetox_smi.iloc[1:, :]

How many substances have QSAR READY SMILES that lend them for predictions - 1788 missing SMILES and 7511 have SMILES

In [29]:
gentox_wsmi = genetox_smi[genetox_smi['QSAR_READY_SMILES'] != '-']

In [30]:
gentox_wsmi.to_csv(raw_dat_dir+'genetox_DTXSID_wsmi.csv')

In [32]:
gentox_wsmi.shape


(7511, 5)

#### Processing TEST predictions. Newest version 5.1 was used to generate consensus Ames mutagenicity predictions

In [256]:
TEST_out = pd.read_csv(interim_dat_dir+'TEST_genetox_out.csv')

In [257]:
TEST_out.shape

(7511, 9)

In [258]:
TEST_out.head()

Unnamed: 0,Index,ID,Query,SmilesRan,Error,Exp_Value,Pred_Value,Exp_Result,Pred_Result
0,1,1086-00-6,1086-00-6,ClCC1=CC=C2C=CC3=CC=CC=4C=CC1=C2C34,,1.0,0.79,Mutagenicity Positive,Mutagenicity Positive
1,2,131657-78-8,131657-78-8,O=[N+]([O-])C1=CC(Cl)=C(O)C(=C1)NCC,,,0.53,,Mutagenicity Positive
2,3,13345-26-1,13345-26-1,OC=1C=CC2=C(C1)C=C3C=CC4=CC=CC=5C=CC2=C3C45,,1.0,1.03,Mutagenicity Positive,Mutagenicity Positive
3,4,13501-76-3,13501-76-3,ClCCC[Si](OCC)(OCC)C,,,0.08,,Mutagenicity Negative
4,5,66376-36-1,66376-36-1,O=P(O)(O)C(O)(CCCN)P(=O)(O)O,,,0.38,,Mutagenicity Negative


#### Annoyingly the ID which was specified as DTXSID extracted the CASRN!

Perhaps a way to link is on the basis of the INCHI Key for the SMILES run.

In [263]:
TEST_out.ID

0                           1086-00-6
1                         131657-78-8
2                          13345-26-1
3                          13501-76-3
4                          66376-36-1
                    ...              
7506    C_MQNUWAMSQUIUHD-UHFFFAOYSA-N
7507    C_OMRMWDQPXBZWGS-UHFFFAOYSA-N
7508                     1135682-32-4
7509    C_JUUQHENXZBKKIF-UHFFFAOYSA-N
7510                       48075-52-1
Name: ID, Length: 7511, dtype: object

In [270]:
TEST_out['Query'] = TEST_out['Query'].apply(lambda x: x.replace('C_', '') )

In [480]:
TEST_out.head()

Unnamed: 0,Index,ID,Query,SmilesRan,Error,Exp_Value,Pred_Value,Exp_Result,Pred_Result,ids
0,1,1086-00-6,1086-00-6,ClCC1=CC=C2C=CC3=CC=CC=4C=CC1=C2C34,,1.0,0.79,Mutagenicity Positive,Mutagenicity Positive,DTXSID00148597
1,2,131657-78-8,131657-78-8,O=[N+]([O-])C1=CC(Cl)=C(O)C(=C1)NCC,,,0.53,,Mutagenicity Positive,DTXSID00157169
2,3,13345-26-1,13345-26-1,OC=1C=CC2=C(C1)C=C3C=CC4=CC=CC=5C=CC2=C3C45,,1.0,1.03,Mutagenicity Positive,Mutagenicity Positive,DTXSID00158116
3,4,13501-76-3,13501-76-3,ClCCC[Si](OCC)(OCC)C,,,0.08,,Mutagenicity Negative,DTXSID00159204
4,5,66376-36-1,66376-36-1,O=P(O)(O)C(O)(CCCN)P(=O)(O)O,,,0.38,,Mutagenicity Negative,missing_id


In [275]:
TEST_inchi = pd.read_csv(interim_dat_dir+'test_inchi.csv')

In [306]:
genetox_wsmi_inchi = pd.merge(gentox_wsmi, TEST_inchi, on = 'DTXSID', how = 'left')

In [307]:
genetox_wsmi_inchi = genetox_wsmi_inchi[['DTXSID', 'PREFERRED_NAME', 'CASRN', 'SMILES', 'QSAR_READY_SMILES_x','QSAR_READY_SMILES (RDKit Mol) (InChI Key)']]

In [308]:
genetox_wsmi_inchi.columns = ['DTXSID', 'PREFERRED_NAME', 'CASRN', 'SMILES', 'QSAR_READY_SMILES',
       'InChI Key']

In [297]:
#genetox_wsmi_inchi.set_index('DTXSID', inplace = True)

In [438]:
genetox_wsmi_inchi['combined'] = genetox_wsmi_inchi.apply(lambda x: list([x['CASRN'], x['InChI Key']]), axis = 1)

In [440]:
#genetox_wsmi_inchi

In [441]:
genetox_dict = {k:v for k,v in zip(genetox_wsmi_inchi['DTXSID'], genetox_wsmi_inchi['combined'])}


In [443]:
#genetox_dict

In [444]:
def get_key(val):
    for k,v in genetox_dict.items():
        if val in v:
            return k
    return 'missing_id'
    
    
    

In [467]:
TEST_inchi2 = pd.read_csv(interim_dat_dir+'test_inchi_out.csv')

In [468]:
TEST_inchi2

Unnamed: 0,ID,Query,SmilesRan,Error,Exp_Value,Pred_Value,Exp_Result,Pred_Result,QSAR_READY_SMILES (RDKit Mol),QSAR_READY_SMILES (RDKit Mol) (Kekulized),QSAR_READY_SMILES (RDKit Mol) (InChI Code),QSAR_READY_SMILES (RDKit Mol) (InChI Key)
0,1086-00-6,1086-00-6,ClCC1=CC=C2C=CC3=CC=CC=4C=CC1=C2C34,,1.0,0.79,Mutagenicity Positive,Mutagenicity Positive,ClCC1=CC=C2C=CC3=CC=CC=4C=CC1=C2C34,ClCC1=CC=C2C=CC3=CC=CC4=CC=C1C2=C34,InChI=1S/C17H11Cl/c18-10-14-7-6-13-5-4-11-2-1-...,MVNXSXOJYGSNQZ-UHFFFAOYSA-N
1,131657-78-8,131657-78-8,O=[N+]([O-])C1=CC(Cl)=C(O)C(=C1)NCC,,,0.53,,Mutagenicity Positive,O=[N+]([O-])C1=CC(Cl)=C(O)C(=C1)NCC,CCNC1=CC([N+](=O)[O-])=CC(Cl)=C1O,InChI=1S/C8H9ClN2O3/c1-2-10-7-4-5(11(13)14)3-6...,CDFNUSAXZDSXKF-UHFFFAOYSA-N
2,13345-26-1,13345-26-1,OC=1C=CC2=C(C1)C=C3C=CC4=CC=CC=5C=CC2=C3C45,,1.0,1.03,Mutagenicity Positive,Mutagenicity Positive,OC=1C=CC2=C(C1)C=C3C=CC4=CC=CC=5C=CC2=C3C45,OC1=CC=C2C(=C1)C=C1C=CC3=CC=CC4=CC=C2C1=C34,InChI=1S/C20H12O/c21-16-7-9-17-15(11-16)10-14-...,OWEZWMDNHHTXJU-UHFFFAOYSA-N
3,13501-76-3,13501-76-3,ClCCC[Si](OCC)(OCC)C,,,0.08,,Mutagenicity Negative,ClCCC[Si](OCC)(OCC)C,CCO[Si](C)(CCCCl)OCC,"InChI=1S/C8H19ClO2Si/c1-4-10-12(3,11-5-2)8-6-7...",KEZMLECYELSZDC-UHFFFAOYSA-N
4,66376-36-1,66376-36-1,O=P(O)(O)C(O)(CCCN)P(=O)(O)O,,,0.38,,Mutagenicity Negative,O=P(O)(O)C(O)(CCCN)P(=O)(O)O,NCCCC(O)(P(=O)(O)O)P(=O)(O)O,"InChI=1S/C4H13NO7P2/c5-3-1-2-4(6,13(7,8)9)14(1...",OGSPWJRAVKPPFI-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...
7506,C_MQNUWAMSQUIUHD-UHFFFAOYSA-N,C_MQNUWAMSQUIUHD-UHFFFAOYSA-N,O=S(=O)(O)C1=CC=CC=2C1=CC(N=NC=3C=C(OC)C(=CC3C...,,,0.23,,Mutagenicity Negative,O=S(=O)(O)C1=CC=CC=2C1=CC(N=NC=3C=C(OC)C(=CC3C...,COC1=C(NC2=NC(SCCCS(=O)(=O)O)=NC(SCCCS(=O)(=O)...,InChI=1S/C27H30N6O13S6/c1-16-12-21(28-25-29-26...,MQNUWAMSQUIUHD-ULIFNZDWSA-N
7507,C_OMRMWDQPXBZWGS-UHFFFAOYSA-N,C_OMRMWDQPXBZWGS-UHFFFAOYSA-N,O=C(O)C=1C=CC=CC1N=NC2=C(O)C=CC=3C=CC=CC32,,,1.00,,Mutagenicity Positive,O=C(O)C=1C=CC=CC1N=NC2=C(O)C=CC=3C=CC=CC32,O=C(O)C1=CC=CC=C1N=NC1=C(O)C=CC2=CC=CC=C21,InChI=1S/C17H12N2O3/c20-15-10-9-11-5-1-2-6-12(...,OMRMWDQPXBZWGS-VHEBQXMUSA-N
7508,1135682-32-4,1135682-32-4,O=S(=O)(O)C=1C=2C=CC=CC2C=C(C1CCCCCCCCC)CCCCCCCCC,,,-0.01,,Mutagenicity Negative,O=S(=O)(O)C=1C=2C=CC=CC2C=C(C1CCCCCCCCC)CCCCCCCCC,CCCCCCCCCC1=C(CCCCCCCCC)C(S(=O)(=O)O)=C2C=CC=C...,InChI=1S/C28H44O3S/c1-3-5-7-9-11-13-15-19-24-2...,WDNQRCVBPNOTNV-UHFFFAOYSA-N
7509,C_JUUQHENXZBKKIF-UHFFFAOYSA-N,C_JUUQHENXZBKKIF-UHFFFAOYSA-N,O=S(=O)(O)C1=CC(=C(Cl)C=C1N=NC2=C(O)C=CC=3C=CC...,,,0.46,,Mutagenicity Negative,O=S(=O)(O)C1=CC(=C(Cl)C=C1N=NC2=C(O)C=CC=3C=CC...,CC1=C(Cl)C=C(N=NC2=C(O)C=CC3=CC=CC=C32)C(S(=O)...,"InChI=1S/C17H13ClN2O4S/c1-10-8-16(25(22,23)24)...",JUUQHENXZBKKIF-FMQUCBEESA-N


In [470]:
ids2 = [get_key(e) for e in TEST_inchi2['QSAR_READY_SMILES (RDKit Mol) (InChI Key)'] ]

In [473]:
len(ids2)

7511

In [474]:
TEST_inchi2['ids'] = ids2

In [476]:
TEST_inchi2.to_csv(interim_dat_dir+'TEST_predictions.csv')

Well this turned out to be a complete disaster! as the Inchi Keys were not necessarily corresponding to the CAS identifiers. Checked ~100 manually - conflicts very based on structural isomers or salt forms. Have run the supplied CAS number and matched them to DTXSIDs. This is reported as a new column for the DTXSID for the QSAR READY form of structure processed.

In [481]:
TEST_preds = pd.read_csv(interim_dat_dir+'TEST_predictions.csv')

In [489]:
TEST_preds.head()

Unnamed: 0.1,Unnamed: 0,ID,Query,SmilesRan,Error,Exp_Value,Pred_Value,Exp_Result,Pred_Result,QSAR_READY_SMILES (RDKit Mol),QSAR_READY_SMILES (RDKit Mol) (Kekulized),DTXSID_of_chemical_structure_predicted,DTXSID_of_genetox_chemical
0,0,1086-00-6,1086-00-6,ClCC1=CC=C2C=CC3=CC=CC=4C=CC1=C2C34,,1.0,0.79,Mutagenicity Positive,Mutagenicity Positive,ClCC1=CC=C2C=CC3=CC=CC=4C=CC1=C2C34,ClCC1=CC=C2C=CC3=CC=CC4=CC=C1C2=C34,DTXSID00148597,DTXSID00148597
1,1,131657-78-8,131657-78-8,O=[N+]([O-])C1=CC(Cl)=C(O)C(=C1)NCC,,,0.53,,Mutagenicity Positive,O=[N+]([O-])C1=CC(Cl)=C(O)C(=C1)NCC,CCNC1=CC([N+](=O)[O-])=CC(Cl)=C1O,DTXSID00157169,DTXSID00157169
2,2,13345-26-1,13345-26-1,OC=1C=CC2=C(C1)C=C3C=CC4=CC=CC=5C=CC2=C3C45,,1.0,1.03,Mutagenicity Positive,Mutagenicity Positive,OC=1C=CC2=C(C1)C=C3C=CC4=CC=CC=5C=CC2=C3C45,OC1=CC=C2C(=C1)C=C1C=CC3=CC=CC4=CC=C2C1=C34,DTXSID00158116,DTXSID00158116
3,3,13501-76-3,13501-76-3,ClCCC[Si](OCC)(OCC)C,,,0.08,,Mutagenicity Negative,ClCCC[Si](OCC)(OCC)C,CCO[Si](C)(CCCCl)OCC,DTXSID00159204,DTXSID00159204
4,4,66376-36-1,66376-36-1,O=P(O)(O)C(O)(CCCN)P(=O)(O)O,,,0.38,,Mutagenicity Negative,O=P(O)(O)C(O)(CCCN)P(=O)(O)O,NCCCC(O)(P(=O)(O)O)P(=O)(O)O,DTXSID5022568,DTXSID00160734


In [491]:
my_id = []
for x,y in zip(TEST_preds['DTXSID_of_genetox_chemical'], TEST_preds['DTXSID_of_chemical_structure_predicted']):
    if y == '-':
        my_id.append(x)
    else:
        my_id.append(y)

In [493]:
TEST_preds['id']  = my_id

Trying through a loop and then through the apply command

In [497]:
TEST_preds['ids'] = TEST_preds.apply(lambda x: x['DTXSID_of_genetox_chemical'] if x['DTXSID_of_chemical_structure_predicted'] == '-' else x['DTXSID_of_chemical_structure_predicted'], axis = 1)

In [499]:
TEST_preds.drop(['DTXSID_of_chemical_structure_predicted', 'id'], axis = 1, inplace = True)

In [501]:
TEST_preds.rename(columns = {'ids': 'DTXSID_of_chemical_structure_predicted'}, inplace = True)

In [506]:
TEST_preds_df = TEST_preds[['DTXSID_of_genetox_chemical','DTXSID_of_chemical_structure_predicted',  'Error', 'Exp_Value',
       'Pred_Value', 'Exp_Result', 'Pred_Result','ID', 'Query', 'SmilesRan',
       'QSAR_READY_SMILES (RDKit Mol)',
       'QSAR_READY_SMILES (RDKit Mol) (Kekulized)']]

In [538]:
TEST_preds_df['TEST_prediction'] = TEST_preds_df.apply(lambda x: 1 if x['Pred_Value'] >= 0.5 else (np.nan if x['Error'] in ['FindPaths', 'Molecule contains unsupported element',
       'Only one nonhydrogen atom'] else 0 ),axis = 1 )

In [539]:
TEST_preds_df = TEST_preds_df[['DTXSID_of_genetox_chemical', 'DTXSID_of_chemical_structure_predicted',
       'Error', 'Exp_Value', 'Pred_Value', 'Exp_Result', 'Pred_Result','TEST_prediction', 'ID',
       'Query', 'SmilesRan', 'QSAR_READY_SMILES (RDKit Mol)',
       'QSAR_READY_SMILES (RDKit Mol) (Kekulized)' ]]

In [540]:
TEST_preds_df[~TEST_preds_df['Error'].isnull()]

Unnamed: 0,DTXSID_of_genetox_chemical,DTXSID_of_chemical_structure_predicted,Error,Exp_Value,Pred_Value,Exp_Result,Pred_Result,TEST_prediction,ID,Query,SmilesRan,QSAR_READY_SMILES (RDKit Mol),QSAR_READY_SMILES (RDKit Mol) (Kekulized)
37,DTXSID00195884,DTXSID00195884,FindPaths,,,,,,436-40-8,436-40-8,O=C1C(OCCC)=C(C(=O)C(OCCC)=C1N2CC2)N3CC3,O=C1C(OCCC)=C(C(=O)C(OCCC)=C1N2CC2)N3CC3,CCCOC1=C(N2CC2)C(=O)C(OCCC)=C(N2CC2)C1=O
69,DTXSID00203771,DTXSID00203771,FindPaths,,,,,,55258-21-4,55258-21-4,O=C(OCCCCCCCCCCCCCCCCCC)CCC(NC(=O)CCCCCCCCCCC)...,O=C(OCCCCCCCCCCCCCCCCCC)CCC(NC(=O)CCCCCCCCCCC)...,CCCCCCCCCCCCCCCCCCOC(=O)CCC(NC(=O)CCCCCCCCCCC)...
83,DTXSID0020571,DTXSID0020571,FindPaths,,,,,,NOCAS_859206,NOCAS_859206,O=C1OC(CC)C(O)(C)C(O)C(C(=O)C(C)CC(O)(C)C(OC2O...,O=C1OC(CC)C(O)(C)C(O)C(C(=O)C(C)CC(O)(C)C(OC2O...,CCC1OC(=O)C(C)C(OC2CC(C)(OC)C(O)C(C)O2)C(C)C(O...
94,DTXSID0020656,DTXSID0020656,FindPaths,,,,,,NOCAS_859101,NOCAS_859101,O=C(O)C1C2C3(OC(=O)C2(C)C(O)C=C3)C4CCC5(O)C(=C...,O=C(O)C1C2C3(OC(=O)C2(C)C(O)C=C3)C4CCC5(O)C(=C...,C=C1CC23CC1(O)CCC2C12C=CC(O)C(C)(C(=O)O1)C2C3C...
102,DTXSID0020787,DTXSID0020787,FindPaths,,,,,,NOCAS_860271,NOCAS_860271,O=C1C2=C(O)C=C(C(O)=C2C(=O)C34C1C(=O)C5C(O)C4C...,O=C1C2=C(O)C=C(C(O)=C2C(=O)C34C1C(=O)C5C(O)C4C...,CC1=CC(O)=C2C(=O)C3C(=O)C4C(O)C5C6C(O)C(C(=O)C...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7463,DTXSID90884098,DTXSID90884098,FindPaths,,,,,,NOCAS_863432,NOCAS_863432,C=1C=CC(=CC1)C=2C(=CC=C3N(C=4C=CC=CC4C3(C)C)C)...,C=1C=CC(=CC1)C=2C(=CC=C3N(C=4C=CC=CC4C3(C)C)C)...,CN1C2=CC=CC=C2C(C)(C)C1=CC=C1C2=CC=CC=C2[N+](C...
7492,DTXSID90889245,DTXSID90889245,FindPaths,,,,,,932742-30-8,932742-30-8,O=C(OCC(C=NCC1(C)CC(N=CC(C)(C)COC(=O)CCCCCCCCC...,O=C(OCC(C=NCC1(C)CC(N=CC(C)(C)COC(=O)CCCCCCCCC...,CCCCCCCCCCCC(=O)OCC(C)(C)C=NCC1(C)CC(N=CC(C)(C...
7499,DTXSID90889467,DTXSID4064634,FindPaths,,,,,,7342-13-4,7342-13-4,O=S(=O)(O)C1=CC(=CC=C1C=CC2=CC=C(C=C2S(=O)(=O)...,O=S(=O)(O)C1=CC(=CC=C1C=CC2=CC=C(C=C2S(=O)(=O)...,COC1=NC(NC2=CC=CC=C2)=NC(NC2=CC=C(C=CC3=CC=C(N...
7502,DTXSID90890058,DTXSID90890058,FindPaths,,,,,,C_YEMFTKQHJDLOQR-UHFFFAOYSA-N,C_YEMFTKQHJDLOQR-UHFFFAOYSA-N,O=C(NC1=CC=C(N=NC2=CC=C(N=NC3=CC=C(C=C3)S(=O)(...,O=C(NC1=CC=C(N=NC2=CC=C(N=NC3=CC=C(C=C3)S(=O)(...,CC(O)=NC1=CC(NC(=O)C2=CC=C(C(=O)NC3=CC=C(N=NC4...


In [719]:
TEST_preds_df.TEST_prediction.value_counts(dropna = False)

0.0    5286
1.0    1961
NaN     264
Name: TEST_prediction, dtype: int64

In [543]:
TEST_preds_df.to_csv(processed_dat_dir+'TEST_consensus_Ames_predictions_100221.csv')

#### Processing and reviewing the predictions from the OECD Toolbox. Version 4.4.1 was used given that the dataset was being augmented. It made sense to use the most up-to-date version of the tool.

In [577]:
OECD_TB = pd.read_excel(interim_dat_dir+'gentox_TB_out.xlsx')

In [578]:
OECD_TB.shape

(6955, 17)

In [583]:
OECD_TB.columns

Index(['#', 'CAS Number', 'Chemical name(s)', 'SMILES', 'Molecular formula',
       'Predefined substance type', 'Additional Ids', 'Composition',
       'CAS-SMILES relation',
       'Carcinogenicity (genotox and nongenotox) alerts by ISS',
       'DNA alerts for AMES, CA and MNT by OASIS', 'DNA binding by OASIS',
       'DNA binding by OECD', 'Oncologic Primary Classification',
       'Protein binding alerts for Chromosomal aberration by OASIS',
       'in vitro mutagenicity (Ames test) alerts by ISS',
       'in vivo mutagenicity (Micronucleus) alerts by ISS'],
      dtype='object')

In [584]:
OECD_TB = OECD_TB[['Chemical name(s)', 'SMILES', 'Molecular formula',
       'Predefined substance type', 'Additional Ids', 'Composition',
       'CAS-SMILES relation',
       'Carcinogenicity (genotox and nongenotox) alerts by ISS',
       'DNA alerts for AMES, CA and MNT by OASIS', 'DNA binding by OASIS',
       'DNA binding by OECD', 'Oncologic Primary Classification',
       'Protein binding alerts for Chromosomal aberration by OASIS',
       'in vitro mutagenicity (Ames test) alerts by ISS',
       'in vivo mutagenicity (Micronucleus) alerts by ISS']]

In [595]:
OECD_TB = OECD_TB[~OECD_TB['Chemical name(s)'].isnull()]

In [596]:
OECD_TB['Chemical name(s)']

0                   DTXSID00148597
1                   DTXSID00157169
2                   DTXSID00158116
3                   DTXSID00159204
4       Alendronate;DTXSID00160734
                   ...            
6947                DTXSID90890477
6948                DTXSID90890619
6949                DTXSID90890811
6950                DTXSID90891080
6951                DTXSID90891888
Name: Chemical name(s), Length: 6952, dtype: object

In [597]:
import re
p = re.compile(r'DTXSID\d{1,}')

In [598]:
dtxsid = [m.group(0) for l in OECD_TB['Chemical name(s)'] for m in [p.search(l)] if m]



In [605]:
OECD_TB['DTXSID'] =dtxsid

In [600]:
missing_dtxsids = OECD_TB[~OECD_TB['Chemical name(s)'].str.contains('DTX')]

In [602]:
missing_dtxsids

Unnamed: 0,Chemical name(s),SMILES,Molecular formula,Predefined substance type,Additional Ids,Composition,CAS-SMILES relation,Carcinogenicity (genotox and nongenotox) alerts by ISS,"DNA alerts for AMES, CA and MNT by OASIS",DNA binding by OASIS,DNA binding by OECD,Oncologic Primary Classification,Protein binding alerts for Chromosomal aberration by OASIS,in vitro mutagenicity (Ames test) alerts by ISS,in vivo mutagenicity (Micronucleus) alerts by ISS,DTXSID


In [608]:
OECD_TB = OECD_TB[['DTXSID','Carcinogenicity (genotox and nongenotox) alerts by ISS',
       'DNA alerts for AMES, CA and MNT by OASIS', 'DNA binding by OASIS',
       'DNA binding by OECD', 'Oncologic Primary Classification',
       'Protein binding alerts for Chromosomal aberration by OASIS',
       'in vitro mutagenicity (Ames test) alerts by ISS',
       'in vivo mutagenicity (Micronucleus) alerts by ISS']]


In [624]:
OECD_TB.to_csv(processed_dat_dir+'OECD_TB_100221.csv')

In [618]:
tag = {'Carcinogenicity (genotox and nongenotox) alerts by ISS' : 'Carc_ISS',
       'DNA alerts for AMES, CA and MNT by OASIS' : 'DNA_Ames_OASIS', 'DNA binding by OASIS' : 'DNA_binding_OASIS',
       'DNA binding by OECD' : 'DNA_binding_OECD' , 'Oncologic Primary Classification': 'Oncologic',
       'Protein binding alerts for Chromosomal aberration by OASIS' : 'Protein_binding_CA_OASIS',
       'in vitro mutagenicity (Ames test) alerts by ISS' :'Ames_ISS' ,
       'in vivo mutagenicity (Micronucleus) alerts by ISS': 'MNT_ISS'}

In [620]:
OECD_TB.rename(columns = tag, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [610]:
OECD_TB_numeric = OECD_TB.replace({'No alert found': 0, '(N/A)': np.nan, 'Not classified': 0})

In [614]:
OECD_TB_numeric.iloc[:,1:].replace({'\w+' : 1}, regex = True, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [621]:
OECD_TB_numeric.rename(columns = tag, inplace = True)

In [715]:
OECD_TB.columns

Index(['DTXSID', 'Carc_ISS', 'DNA_Ames_OASIS', 'DNA_binding_OASIS',
       'DNA_binding_OECD', 'Oncologic', 'Protein_binding_CA_OASIS', 'Ames_ISS',
       'MNT_ISS'],
      dtype='object')

In [725]:
OECD_TB.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6952 entries, 0 to 6951
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   DTXSID                    6952 non-null   object
 1   Carc_ISS                  6952 non-null   object
 2   DNA_Ames_OASIS            6952 non-null   object
 3   DNA_binding_OASIS         6952 non-null   object
 4   DNA_binding_OECD          6952 non-null   object
 5   Oncologic                 6952 non-null   object
 6   Protein_binding_CA_OASIS  6952 non-null   object
 7   Ames_ISS                  6952 non-null   object
 8   MNT_ISS                   6952 non-null   object
dtypes: object(9)
memory usage: 543.1+ KB


In [623]:
OECD_TB_numeric.to_csv(processed_dat_dir+'OECD_TB_numeric_100221.csv')

#### Model predictions from VEGA - cite http://ceur-ws.org/Vol-1107/paper8.pdf Version 1.1..5.47 was used to generate new predictions following suggestion from reviewer.

In [631]:
VEGA_out = pd.read_excel(interim_dat_dir+'VEGA_OUT.xlsx')

In [637]:
VEGA_out.iloc[10,:].tolist()

['No.',
 'Id',
 'SMILES',
 'Mutagenicity (Ames test) CONSENSUS model - assessment',
 'Mutagenicity (Ames test) CONSENSUS model - prediction',
 'Mutagenicity (Ames test) model (CAESAR) - assessment',
 'Mutagenicity (Ames test) model (CAESAR) - prediction',
 'Mutagenicity (Ames test) model (SarPy/IRFMN) - assessment',
 'Mutagenicity (Ames test) model (SarPy/IRFMN) - prediction',
 'Mutagenicity (Ames test) model (ISS) - assessment',
 'Mutagenicity (Ames test) model (ISS) - prediction',
 'Mutagenicity (Ames test) model (KNN/Read-Across) - assessment',
 'Mutagenicity (Ames test) model (KNN/Read-Across) - prediction',
 'Chromosomal aberration model (CORAL) - assessment',
 'Chromosomal aberration model (CORAL) - prediction',
 'In vitro Micronucleus activity (IRFMN/VERMEER) - assessment',
 'In vitro Micronucleus activity (IRFMN/VERMEER) - prediction',
 'In vivo Micronucleus activity (IRFMN) - assessment',
 'In vivo Micronucleus activity (IRFMN) - prediction']

In [638]:
VEGA_out.columns = VEGA_out.iloc[10,:].tolist()

In [643]:
VEGA_out = VEGA_out.iloc[12:,:]

In [644]:
VEGA_out.shape

(7511, 19)

In [646]:
VEGA_out.columns

Index(['No.', 'Id', 'SMILES',
       'Mutagenicity (Ames test) CONSENSUS model - assessment',
       'Mutagenicity (Ames test) CONSENSUS model - prediction',
       'Mutagenicity (Ames test) model (CAESAR) - assessment',
       'Mutagenicity (Ames test) model (CAESAR) - prediction',
       'Mutagenicity (Ames test) model (SarPy/IRFMN) - assessment',
       'Mutagenicity (Ames test) model (SarPy/IRFMN) - prediction',
       'Mutagenicity (Ames test) model (ISS) - assessment',
       'Mutagenicity (Ames test) model (ISS) - prediction',
       'Mutagenicity (Ames test) model (KNN/Read-Across) - assessment',
       'Mutagenicity (Ames test) model (KNN/Read-Across) - prediction',
       'Chromosomal aberration model (CORAL) - assessment',
       'Chromosomal aberration model (CORAL) - prediction',
       'In vitro Micronucleus activity (IRFMN/VERMEER) - assessment',
       'In vitro Micronucleus activity (IRFMN/VERMEER) - prediction',
       'In vivo Micronucleus activity (IRFMN) - assessme

In [650]:
VEGA_out.drop(['No.', 'SMILES'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [652]:
VEGA_out.rename(columns = {'Id': 'dtxsid'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [654]:
VEGA_out.to_csv(processed_dat_dir+'VEGA_out_cleaned_100221.csv')

In [656]:
VEGA_out.columns

Index(['dtxsid', 'Mutagenicity (Ames test) CONSENSUS model - assessment',
       'Mutagenicity (Ames test) CONSENSUS model - prediction',
       'Mutagenicity (Ames test) model (CAESAR) - assessment',
       'Mutagenicity (Ames test) model (CAESAR) - prediction',
       'Mutagenicity (Ames test) model (SarPy/IRFMN) - assessment',
       'Mutagenicity (Ames test) model (SarPy/IRFMN) - prediction',
       'Mutagenicity (Ames test) model (ISS) - assessment',
       'Mutagenicity (Ames test) model (ISS) - prediction',
       'Mutagenicity (Ames test) model (KNN/Read-Across) - assessment',
       'Mutagenicity (Ames test) model (KNN/Read-Across) - prediction',
       'Chromosomal aberration model (CORAL) - assessment',
       'Chromosomal aberration model (CORAL) - prediction',
       'In vitro Micronucleus activity (IRFMN/VERMEER) - assessment',
       'In vitro Micronucleus activity (IRFMN/VERMEER) - prediction',
       'In vivo Micronucleus activity (IRFMN) - assessment',
       'In vivo

In [664]:
[e for e in VEGA_out.columns if 'prediction'   in e]

['Mutagenicity (Ames test) CONSENSUS model - prediction',
 'Mutagenicity (Ames test) model (CAESAR) - prediction',
 'Mutagenicity (Ames test) model (SarPy/IRFMN) - prediction',
 'Mutagenicity (Ames test) model (ISS) - prediction',
 'Mutagenicity (Ames test) model (KNN/Read-Across) - prediction',
 'Chromosomal aberration model (CORAL) - prediction',
 'In vitro Micronucleus activity (IRFMN/VERMEER) - prediction',
 'In vivo Micronucleus activity (IRFMN) - prediction']

In [667]:
VEGA_pred = VEGA_out[['dtxsid']+[e for e in VEGA_out.columns if 'prediction'   in e]]

In [689]:
poss_outcomes = []
for e in VEGA_pred.columns.tolist()[1:]:
    poss_outcomes.append(VEGA_pred[e].unique().tolist())

In [694]:
list(set([i for e in poss_outcomes for i in e]))

['Active',
 'Possible NON-Mutagenic',
 'Suspect Mutagenic',
 'NON-Mutagenic',
 'Inactive',
 'Not predicted',
 'Mutagenic',
 'Non Predicted',
 'Genotoxic',
 '-',
 'NON-genotoxic']

In [696]:
vega_calls = {'Active' : 1,
 'Possible NON-Mutagenic': 0,
 'Suspect Mutagenic': 1,
 'NON-Mutagenic':0,
 'Inactive':0,
 'Not predicted': np.nan,
 'Mutagenic':1,
 'Non Predicted':np.nan,
 'Genotoxic':1,
 '-':np.nan,
 'NON-genotoxic':0}

In [700]:
VEGA_pred_numeric = VEGA_pred.replace(vega_calls)

In [716]:
VEGA_pred_numeric.columns

Index(['dtxsid', 'Mutagenicity (Ames test) CONSENSUS model - prediction',
       'Mutagenicity (Ames test) model (CAESAR) - prediction',
       'Mutagenicity (Ames test) model (SarPy/IRFMN) - prediction',
       'Mutagenicity (Ames test) model (ISS) - prediction',
       'Mutagenicity (Ames test) model (KNN/Read-Across) - prediction',
       'Chromosomal aberration model (CORAL) - prediction',
       'In vitro Micronucleus activity (IRFMN/VERMEER) - prediction',
       'In vivo Micronucleus activity (IRFMN) - prediction'],
      dtype='object')

In [702]:
VEGA_pred_numeric.to_csv(processed_dat_dir+'VEGA_pred_numeric_100221.csv')

In [724]:
VEGA_pred_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7511 entries, 12 to 7522
Data columns (total 9 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   dtxsid                                                         7511 non-null   object 
 1   Mutagenicity (Ames test) CONSENSUS model - prediction          7511 non-null   int64  
 2   Mutagenicity (Ames test) model (CAESAR) - prediction           7510 non-null   float64
 3   Mutagenicity (Ames test) model (SarPy/IRFMN) - prediction      7511 non-null   int64  
 4   Mutagenicity (Ames test) model (ISS) - prediction              7511 non-null   int64  
 5   Mutagenicity (Ames test) model (KNN/Read-Across) - prediction  7499 non-null   float64
 6   Chromosomal aberration model (CORAL) - prediction              7511 non-null   int64  
 7   In vitro Micronucleus activity (IRFMN/VERMEER) - prediction 

But there are still 1299 missing ids. Need to go back to the Dashboard to figure out what the missing CAS and INCHI Keys are to determine the DTXSIDs

In [460]:
TEST_out[TEST_out['ids'] == 'missing_id'].to_csv(interim_dat_dir+'mssing_ids_tst.csv')

In [458]:
TEST_out[TEST_out['Query'] == 'MQNUWAMSQUIUHD-UHFFFAOYSA-N']

Unnamed: 0,Index,ID,Query,SmilesRan,Error,Exp_Value,Pred_Value,Exp_Result,Pred_Result,ids
7506,7507,C_MQNUWAMSQUIUHD-UHFFFAOYSA-N,MQNUWAMSQUIUHD-UHFFFAOYSA-N,O=S(=O)(O)C1=CC=CC=2C1=CC(N=NC=3C=C(OC)C(=CC3C...,,,0.23,,Mutagenicity Negative,missing_id


In [466]:
genetox_wsmi_inchi[genetox_wsmi_inchi.DTXSID == 'DTXSID5022568']

Unnamed: 0,DTXSID,PREFERRED_NAME,CASRN,SMILES,QSAR_READY_SMILES,InChI Key,combined


#### Trying to extract the WebTest predictions from the Log file due to Batch processing errors

In [34]:
file1 = open(interim_dat_dir+'webtest.log', 'r')
lines = file1.readlines()

In [58]:
mylst = []
for l in lines:
    if 'DEBUG' in l:
        continue
    else:
        mylst.append(l)

In [91]:
mylst[2]

'[INFO ] 2021-02-09 11:18:50.161 [Thread-4] WebTEST4 - 1086-00-6\t1.000\t0.788\t\t11\t125\tMutagenicity\n'

In [64]:
import re

In [119]:
regex = re.compile('\[INFO\s{1}\]\s{1}\d{4}-\d{2}-\d{2}\s{1}\d{2}:\d{2}:\d{2}\.\d{3}\s{1}\[Thread-4\]\s{1}WebTEST4\s{1}-\s{1}')

In [129]:
def regex_strip(mainstring):
    regex = re.compile('\[INFO\s{1}\]\s{1}\d{4}-\d{2}-\d{2}\s{1}\d{2}:\d{2}:\d{2}\.\d{3}\s{1}\[Thread-4\]\s{1}WebTEST4\s{1}-\s{1}')
    return regex.sub('', mainstring)

In [131]:
regex_strip('[INFO ] 2021-02-09 11:18:50.161 [Thread-4] WebTEST4 - 1086-00-6\t1.000\t0.788\t\t11\t125\tMutagenicity\n')

'1086-00-6\t1.000\t0.788\t\t11\t125\tMutagenicity\n'

In [132]:
new_lst = [regex_strip(e) for e in mylst]

In [134]:
new_lst[1]

"Calculating '[Mutagenicity]' using '[Consensus]' methods...\n"

In [135]:
def regex_strip2(mainstring):
    regex2 = re.compile('Calculating\s{1}\'\[Mutagenicity\]\'\s{1}using\s{1}\'\[Consensus\]\'\s{1}methods')
    return regex2.sub('', mainstring)

In [137]:
new_lst2 = [regex_strip2(e) for e in new_lst]

In [146]:
new_lst3 = [e.split('\n') for e in new_lst2]

In [152]:
[e for e in new_lst3 if not ['...', '']]

[]

In [159]:

lst_5 = [e for e in new_lst if '...' not in e]
        

In [163]:
lst_5 = [e.split('\n') for e in lst_5][1:]

In [189]:
new = []
for l in lst_5:
    new.append([x.split(',') for x in l])


In [192]:
new[0][0]

['1086-00-6\t1.000\t0.788\t\t11\t125\tMutagenicity']

In [199]:
test_df = pd.DataFrame([e[0].split('\t') for e in lst_5])

In [203]:
test_df.columns = ['CAS', 'Expt', 'Pred', 0,1,2,3]

In [207]:
test_df.drop([0,1,2,3], axis = 1, inplace = True)

In [213]:
test_df = test_df.drop_duplicates(keep = 'first')

In [225]:
missing = test_df[test_df['CAS'].str.contains('ERROR')]['CAS'].tolist()

In [226]:
missing[0]

'[ERROR] 2021-02-09 11:19:01.851 [Thread-4] WebTEST4 - Error processing record with CAS 37574-48-4: FindPaths'

In [227]:
def regex_strip3(mainstring):
    regex3 = re.compile('\[ERROR\]\s{1}\d{4}-\d{2}-\d{2}\s{1}\d{2}:\d{2}:\d{2}\.\d{3}\s{1}\[Thread-4\]\s{1}WebTEST4\s{1}-\s{1}Error processing record with CAS\s{1}')
    return regex3.sub('', mainstring)

In [229]:
missing = [regex_strip3(e) for e in missing]

In [230]:
missing[0]

'37574-48-4: FindPaths'

In [231]:
def regex_strip4(mainstring):
    regex4 = re.compile('\:\s{1}FindPaths')
    return regex4.sub('', mainstring)

In [233]:
missing_ids = [regex_strip4(e) for e in missing]

In [218]:
TEST_df = pd.merge(gentox_wsmi, test_df, left_on = 'CASRN', right_on = 'CAS', how = 'left')

In [249]:
missing_ids[3]

'NOCAS_859101'

In [242]:
TEST_df['error'] = TEST_df['CASRN'].apply(lambda x: 1 if x in missing_ids else 0)

In [255]:
TEST_df[TEST_df['error'] ==1 ]

Unnamed: 0,DTXSID,PREFERRED_NAME,CASRN,SMILES,QSAR_READY_SMILES,CAS,Expt,Pred,error
34,DTXSID00191008,4-Hydroxybenzo(a)pyrene,37574-48-4,OC1=CC2=CC3=CC=CC=C3C3=C2C2=C(C=CC=C12)C=C3,OC1=CC2=CC3=CC=CC=C3C3=C2C2=C(C=CC=C12)C=C3,,,,1
66,DTXSID0020365,Cyclosporin A,59865-13-3,[H][C@@]1([C@H](O)[C@H](C)C\C=C\C)N(C)C(=O)[C@...,CCC1NC(=O)C(C(O)C(C)CC=CC)N(C)C(=O)C(C(C)C)N(C...,,,,1
92,DTXSID0020650,Gallic acid,149-91-7,OC(=O)C1=CC(O)=C(O)C(O)=C1,OC(=O)C1=CC(O)=C(O)C(O)=C1,149-91-7,0.000,0.143,1
116,DTXSID00209617,"Benz(a)anthracene, 1,7,12-trimethyl-",60786-51-8,CC1=C2C(C=CC3=C(C)C4=CC=CC=C4C(C)=C23)=CC=C1,CC1=C2C(C=CC3=C(C)C4=CC=CC=C4C(C)=C23)=CC=C1,60786-51-8,,0.397,1
230,DTXSID0023955,Chlorimuron-ethyl,90982-32-4,CCOC(=O)C1=CC=CC=C1S(=O)(=O)NC(=O)NC1=NC(OC)=C...,CCOC(=O)C1=CC=CC=C1S(=O)(=O)NC(=O)NC1=NC(OC)=C...,90982-32-4,,0.301,1
...,...,...,...,...,...,...,...,...,...
7438,DTXSID90863747,"Butanamide, 2,2'-[(3,3'-dichloro[1,1'-biphenyl...",6358-37-8,CC(=O)C(N=NC1=CC=C(C=C1Cl)C1=CC(Cl)=C(C=C1)N=N...,CC(=O)C(N=NC1=CC=C(C=C1Cl)C1=CC(Cl)=C(C=C1)N=N...,6358-37-8,,0.212,1
7445,DTXSID90867922,"Spiro[isobenzofuran-1(3H),9'-[9H]xanthen]-3-on...",70516-41-5,CCN(CCC(C)C)C1=CC=C2C(OC3=C(C=C(NC4=CC=CC=C4)C...,CCN(CCC(C)C)C1=CC=C2C(OC3=C(C=C(NC4=CC=CC=C4)C...,70516-41-5,,0.108,1
7448,DTXSID90872819,"1,6-Dinitropyrene",42397-64-8,[O-][N+](=O)C1=CC=C2C=CC3=C(C=CC4=CC=C1C2=C34)...,[O-][N+](=O)C1=CC=C2C=CC3=C(C=CC4=CC=C1C2=C34)...,42397-64-8,1.000,0.859,1
7450,DTXSID90874954,TENULIN,19202-92-7,CC1CC2OC(=O)C3(C)C2C(OC3(C)O)C2(C)C1C=CC2=O,CC1CC2OC(=O)C3(C)C2C(OC3(C)O)C2(C)C1C=CC2=O,19202-92-7,0.000,0.150,1
