# AMES

Dataset Description: Mutagenicity means the ability of a drug to induce genetic alterations. Drugs that can cause damage to the DNA can result in cell death or other severe adverse effects. Nowadays, the most widely used assay for testing the mutagenicity of compounds is the Ames experiment which was invented by a professor named Ames. The Ames test is a short-term bacterial reverse mutation assay detecting a large number of compounds which can induce genetic damage and frameshift mutations. The dataset is aggregated from four papers.

Task Description: Binary classification. Given a drug SMILES string, predict whether it is mutagenic (1) or not mutagenic (0).

Dataset Statistics: 7,255 drugs.

In [90]:
from tdc.single_pred import Tox
from rdkit import Chem
import networkx as nx
import numpy as np
data = Tox(name = 'AMES')
split = data.get_split()

Found local copy...
Loading...
Done!


In [40]:
split['test']

Unnamed: 0,Drug_ID,Drug,Y
0,Drug 6060,Cc1cc(=O)oc2ccc3c(c12)C1(C)OOC1(C)O3,1
1,Drug 6703,CN(CC(O)CO)N=O,1
2,Drug 6827,O=[N+]([O-])c1ccc(OC2CO2)cc1,1
3,Drug 3689,Cn1cnc2cc([N+](=O)[O-])ccc21,0
4,Drug 4516,CC[C@H](CO)NCCN[C@H](CC)CO,0
...,...,...,...
1451,Drug 2876,O=CCCl,1
1452,Drug 325,Cc1cccc(C(=O)Cl)c1,1
1453,Drug 2206,Cc1ccc2cc3ccccc3cc2c1,1
1454,Drug 290,C/C=C(\C)CCl,1


In [41]:
split['train']

Unnamed: 0,Drug_ID,Drug,Y
0,Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
1,Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
2,Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1
3,Drug 4,[N-]=[N+]=C1C=NC(=O)NC1=O,1
4,Drug 6,CCCCN(CC(O)C1=CC(=[N+]=[N-])C(=O)C=C1)N=O,1
...,...,...,...
5089,Drug 7568,CCC(CCC(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C)C(C)C,0
5090,Drug 7587,CCCCCCCCCCCCOCCO,0
5091,Drug 7593,CCOP(=S)(CC)Sc1ccccc1,0
5092,Drug 7598,C=C(C)C1CC=C(C)C(OC(C)=O)C1,0


In [148]:
def mol_to_nx(mol):
    G = nx.Graph()

    for atom in mol.GetAtoms():
        G.add_node(atom.GetIdx(),
                   atomic_num=atom.GetAtomicNum(),
                   formal_charge=atom.GetFormalCharge(),
                   chiral_tag=atom.GetChiralTag(),
                   hybridization=atom.GetHybridization(),
                   num_explicit_hs=atom.GetNumExplicitHs(),
                   is_aromatic=atom.GetIsAromatic())
    for bond in mol.GetBonds():
        G.add_edge(bond.GetBeginAtomIdx(),
                   bond.GetEndAtomIdx(),
                   bond_type=bond.GetBondType())
    return G

def nx_to_mol(G):
    mol = Chem.RWMol()
    atomic_nums = nx.get_node_attributes(G, 'atomic_num')
    chiral_tags = nx.get_node_attributes(G, 'chiral_tag')
    formal_charges = nx.get_node_attributes(G, 'formal_charge')
    node_is_aromatics = nx.get_node_attributes(G, 'is_aromatic')
    node_hybridizations = nx.get_node_attributes(G, 'hybridization')
    num_explicit_hss = nx.get_node_attributes(G, 'num_explicit_hs')
    node_to_idx = {}
    for node in G.nodes():
        a=Chem.Atom(atomic_nums[node])
        a.SetChiralTag(chiral_tags[node])
        a.SetFormalCharge(formal_charges[node])
        a.SetIsAromatic(node_is_aromatics[node])
        a.SetHybridization(node_hybridizations[node])
        a.SetNumExplicitHs(num_explicit_hss[node])
        idx = mol.AddAtom(a)
        node_to_idx[node] = idx

    bond_types = nx.get_edge_attributes(G, 'bond_type')
    for edge in G.edges():
        first, second = edge
        ifirst = node_to_idx[first]
        isecond = node_to_idx[second]
        bond_type = bond_types[first, second]
        mol.AddBond(ifirst, isecond, bond_type)

    Chem.SanitizeMol(mol)
    return mol

def do_all(smiles, validate=False):
    list_of_smiles=[]
    
    for  smile in smiles:
        mol = Chem.MolFromSmiles(smile.strip())
        can_smi = Chem.MolToSmiles(mol)
        G = mol_to_nx(mol)
        
        if validate:
            mol = nx_to_mol(G)
            new_smi = Chem.MolToSmiles(mol)
            assert new_smi == smile
        list_of_smiles.append(G)
    return list_of_smiles

In [155]:
graphs=do_all(train_smiles[:4], validate=True)
graphs[0].edges(data=True)

EdgeDataView([(0, 1, {'bond_type': rdkit.Chem.rdchem.BondType.DOUBLE}), (1, 2, {'bond_type': rdkit.Chem.rdchem.BondType.SINGLE}), (1, 3, {'bond_type': rdkit.Chem.rdchem.BondType.SINGLE}), (3, 4, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (3, 16, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (4, 5, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (4, 22, {'bond_type': rdkit.Chem.rdchem.BondType.SINGLE}), (5, 6, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (5, 19, {'bond_type': rdkit.Chem.rdchem.BondType.SINGLE}), (6, 7, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (6, 17, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (7, 8, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (8, 9, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (9, 10, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (9, 18, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (10, 11, {'bond_type': rdkit.Chem.rdchem.BondType.AROMATIC}), (11, 12, {'bond_type': rdkit

In [131]:



G=do_all(train_smiles[4], validate=True)
G.edges(data='True')

EdgeDataView([(0, 1, None), (1, 2, None), (2, 3, None), (3, 4, None), (4, 5, None), (4, 17, None), (5, 6, None), (6, 7, None), (6, 8, None), (8, 9, None), (8, 16, None), (9, 10, None), (10, 11, None), (10, 13, None), (11, 12, None), (13, 14, None), (13, 15, None), (15, 16, None), (17, 18, None)])

In [123]:
def nx_to_mol(G):
    mol = Chem.RWMol()
    atomic_nums = nx.get_node_attributes(G, 'atomic_num')
    chiral_tags = nx.get_node_attributes(G, 'chiral_tag')
    formal_charges = nx.get_node_attributes(G, 'formal_charge')
    node_is_aromatics = nx.get_node_attributes(G, 'is_aromatic')
    node_hybridizations = nx.get_node_attributes(G, 'hybridization')
    num_explicit_hss = nx.get_node_attributes(G, 'num_explicit_hs')
    node_to_idx = {}
    for node in G.nodes():
        a=Chem.Atom(atomic_nums[node])
        a.SetChiralTag(chiral_tags[node])
        a.SetFormalCharge(formal_charges[node])
        a.SetIsAromatic(node_is_aromatics[node])
        a.SetHybridization(node_hybridizations[node])
        a.SetNumExplicitHs(num_explicit_hss[node])
        idx = mol.AddAtom(a)
        node_to_idx[node] = idx

    bond_types = nx.get_edge_attributes(G, 'bond_type')
    for edge in G.edges():
        first, second = edge
        ifirst = node_to_idx[first]
        isecond = node_to_idx[second]
        bond_type = bond_types[first, second]
        mol.AddBond(ifirst, isecond, bond_type)

    Chem.SanitizeMol(mol)
    return mol

In [115]:
def mol_to_nx(mol):
    G = nx.Graph()

    for atom in mol.GetAtoms():
        G.add_node(atom.GetIdx(),
                   atomic_num=atom.GetAtomicNum(),
                   formal_charge=atom.GetFormalCharge(),
                   chiral_tag=atom.GetChiralTag(),
                   hybridization=atom.GetHybridization(),
                   num_explicit_hs=atom.GetNumExplicitHs(),
                   is_aromatic=atom.GetIsAromatic())
    for bond in mol.GetBonds():
        G.add_edge(bond.GetBeginAtomIdx(),
                   bond.GetEndAtomIdx(),
                   bond_type=bond.GetBondType())
    return G

In [112]:
from pysmiles import read_smiles
mol = read_smiles(train_smiles[0])
# atom vector (C only)
mol.nodes(data='element')
# adjacency matrix
nx.to_numpy_matrix(mol)
elements = nx.get_node_attributes(mol, name = "element")

In [113]:
mol.nodes(data='True')

NodeDataView({0: None, 1: None, 2: None, 3: None, 4: None, 5: None, 6: None, 7: None, 8: None, 9: None, 10: None, 11: None, 12: None, 13: None, 14: None, 15: None, 16: None, 17: None, 18: None, 19: None, 20: None, 21: None, 22: None}, data='True')

In [103]:
list_of_smiles_to_list_of_graph_molecules(train_smiles[0])

44 44
1
2
4
5
6
7
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
43


[19:58:08] SMILES Parse Error: syntax error while parsing: =
[19:58:08] SMILES Parse Error: Failed parsing SMILES '=' for input: '='
[19:58:08] SMILES Parse Error: syntax error while parsing: [
[19:58:08] SMILES Parse Error: Failed parsing SMILES '[' for input: '['
[19:58:08] SMILES Parse Error: syntax error while parsing: +
[19:58:08] SMILES Parse Error: Failed parsing SMILES '+' for input: '+'
[19:58:08] SMILES Parse Error: syntax error while parsing: ]
[19:58:08] SMILES Parse Error: Failed parsing SMILES ']' for input: ']'
[19:58:08] SMILES Parse Error: syntax error while parsing: (
[19:58:08] SMILES Parse Error: Failed parsing SMILES '(' for input: '('
[19:58:08] SMILES Parse Error: syntax error while parsing: [
[19:58:08] SMILES Parse Error: Failed parsing SMILES '[' for input: '['
[19:58:08] SMILES Parse Error: syntax error while parsing: -
[19:58:08] SMILES Parse Error: Failed parsing SMILES '-' for input: '-'
[19:58:08] SMILES Parse Error: syntax error while parsing: ]
[19:58:0

[<networkx.classes.graph.Graph at 0x1f39f580af0>,
 'None',
 'None',
 <networkx.classes.graph.Graph at 0x1f39f580880>,
 'None',
 'None',
 'None',
 'None',
 <networkx.classes.graph.Graph at 0x1f39f5811e0>,
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 <networkx.classes.graph.Graph at 0x1f39f5805e0>,
 <networkx.classes.graph.Graph at 0x1f39f5810f0>,
 <networkx.classes.graph.Graph at 0x1f39f582d70>,
 <networkx.classes.graph.Graph at 0x1f39f5825c0>,
 'None']

In [91]:
targets=split['train']['Y']
train_smiles=list(split['train']['Drug'])

In [100]:

list_of_graph_molecules=list_of_smiles_to_list_of_graph_molecules(train_smiles)

5094 5094


In [94]:
from operator import itemgetter

neg_graphs_ids=list(targets[targets==1].index)#mutagenic
neg_graphs=list(itemgetter(*ids)(list_of_graph_molecules))
pos_graphs_ids=list(targets[targets==0].index)#(non_mutagenic)
pos_graphs=list(itemgetter(*ids)(list_of_graph_molecules))


In [101]:
pos_graphs[0].nodes(data='True')

NodeDataView({0: None, 1: None, 2: None, 3: None, 4: None, 5: None, 6: None, 7: None, 8: None, 9: None, 10: None, 11: None, 12: None, 13: None, 14: None, 15: None, 16: None, 17: None, 18: None, 19: None, 20: None, 21: None, 22: None, 23: None, 24: None, 25: None, 26: None, 27: None, 28: None, 29: None, 30: None, 31: None, 32: None, 33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 48: None, 49: None}, data='True')

# Herg

hERG blockers
Dataset Description: Human ether-à-go-go related gene (hERG) is crucial for the coordination of the heart's beating. Thus, if a drug blocks the hERG, it could lead to severe adverse effects. Therefore, reliable prediction of hERG liability in the early stages of drug design is quite important to reduce the risk of cardiotoxicity-related attritions in the later development stages.

Task Description: Binary classification. Given a drug SMILES string, predict whether it blocks (1) or not blocks (0).

Dataset Statistics: 648 drugs.

In [23]:
from tdc.single_pred import Tox
data = Tox(name = 'hERG')
split = data.get_split()

Found local copy...
Loading...
Done!


In [24]:
split['test']

Unnamed: 0,Drug_ID,Drug,Y
0,NELFINAVIR,Cc1c(O)cccc1C(=O)N[C@@H](CSc1ccccc1)[C@H](O)C[...,1.0
1,WAY123398,CN(CCN(C)S(=O)(=O)c1ccc(NS(C)(=O)=O)cc1)c1nc2c...,1.0
2,ACEHYTISINE,C=C1C[C@]23C[C@H]4[C@H]5[C@]67C[C@@H](OC(C)=O)...,0.0
3,MOSAPRIDE,CCOc1cc(N)c(Cl)cc1C(=O)NC[C@@H]1C[NH+](Cc2ccc(...,1.0
4,"5-CHLORO-N-(2,4-DIMETHOXYBENZYL)-1-BENZOFURAN-...",COc1ccc(CNC(=O)c2cc3cc(Cl)ccc3o2)c(OC)c1,1.0
...,...,...,...
126,lumefantrine,CCCC[NH+](CCCC)CC(O)c1cc(Cl)cc2c1-c1ccc(Cl)cc1...,1.0
127,C(CO)N1C(=NN=N1)N,Nc1nnnn1CCO,0.0
128,IMIPRAMINE,C[NH+](C)CCCN1c2ccccc2CCc2ccccc21,1.0
129,JMC20066569-49,O=C(NC1CCN(Cc2cccc(F)c2)CC1)c1cc(=O)c2ccc(F)cc2o1,1.0


In [20]:
split['train']

Unnamed: 0,Drug_ID,Drug,Y
0,(2-Bromo-5-propoxyphenyl)-(2-hydroxy-4-methoxy...,CCCOc1ccc(Br)c(C(=O)c2ccc(OC)cc2O)c1,0
1,(2E)-2-Methyl-2-butenal,CC=C(C)C=O,0
2,(2-oxo-1-phenylpyrrolidin-3-yl)-Triphenylphosp...,O=C1C([PH](c2ccccc2)(c2ccccc2)c2ccccc2)CCN1c1c...,0
3,(4Z)-2-Methyl-6-methylene-4-octene,C=C(C=CCC(C)C)CC,0
4,"[4-(Ethoxymethyl)-2,6-dimethoxyphenyl]boronic ...",CCOCc1cc(OC)c(B(O)O)c(OC)c1,0
...,...,...,...
278,Diphenylcyclopropenone,O=c1c(-c2ccccc2)c1-c1ccccc1,1
279,"Disperse Violet 1 (1,4-diamino-9,10-anthracene...",Nc1ccc(N)c2c1C(=O)c1ccccc1C2=O,1
280,HC Red 13,Nc1ccc(N(CO)CCO)cc1[N+](=O)[O-],1
281,Propyl gallate,CCCOC(=O)c1cc(O)c(O)c(O)c1,1


# DILI 

DILI (Drug Induced Liver Injury)
Dataset Description: Drug-induced liver injury (DILI) is fatal liver disease caused by drugs and it has been the single most frequent cause of safety-related drug marketing withdrawals for the past 50 years (e.g. iproniazid, ticrynafen, benoxaprofen). This dataset is aggregated from U.S. FDA’s National Center for Toxicological Research.

Task Description: Binary classification. Given a drug SMILES string, predict whether it can cause liver injury (1) or not (0).

Dataset Statistics: 475 drugs.

In [16]:
from tdc.single_pred import Tox
data = Tox(name = 'DILI')
split = data.get_split()

Downloading...
100%|██████████| 26.7k/26.7k [00:00<00:00, 220kiB/s]
Loading...
Done!


In [19]:
split['train']

Unnamed: 0,Drug_ID,Drug,Y
0,(2-Bromo-5-propoxyphenyl)-(2-hydroxy-4-methoxy...,CCCOc1ccc(Br)c(C(=O)c2ccc(OC)cc2O)c1,0
1,(2E)-2-Methyl-2-butenal,CC=C(C)C=O,0
2,(2-oxo-1-phenylpyrrolidin-3-yl)-Triphenylphosp...,O=C1C([PH](c2ccccc2)(c2ccccc2)c2ccccc2)CCN1c1c...,0
3,(4Z)-2-Methyl-6-methylene-4-octene,C=C(C=CCC(C)C)CC,0
4,"[4-(Ethoxymethyl)-2,6-dimethoxyphenyl]boronic ...",CCOCc1cc(OC)c(B(O)O)c(OC)c1,0
...,...,...,...
278,Diphenylcyclopropenone,O=c1c(-c2ccccc2)c1-c1ccccc1,1
279,"Disperse Violet 1 (1,4-diamino-9,10-anthracene...",Nc1ccc(N)c2c1C(=O)c1ccccc1C2=O,1
280,HC Red 13,Nc1ccc(N(CO)CCO)cc1[N+](=O)[O-],1
281,Propyl gallate,CCCOC(=O)c1cc(O)c(O)c(O)c1,1


In [25]:

split['test']

Unnamed: 0,Drug_ID,Drug,Y
0,NELFINAVIR,Cc1c(O)cccc1C(=O)N[C@@H](CSc1ccccc1)[C@H](O)C[...,1.0
1,WAY123398,CN(CCN(C)S(=O)(=O)c1ccc(NS(C)(=O)=O)cc1)c1nc2c...,1.0
2,ACEHYTISINE,C=C1C[C@]23C[C@H]4[C@H]5[C@]67C[C@@H](OC(C)=O)...,0.0
3,MOSAPRIDE,CCOc1cc(N)c(Cl)cc1C(=O)NC[C@@H]1C[NH+](Cc2ccc(...,1.0
4,"5-CHLORO-N-(2,4-DIMETHOXYBENZYL)-1-BENZOFURAN-...",COc1ccc(CNC(=O)c2cc3cc(Cl)ccc3o2)c(OC)c1,1.0
...,...,...,...
126,lumefantrine,CCCC[NH+](CCCC)CC(O)c1cc(Cl)cc2c1-c1ccc(Cl)cc1...,1.0
127,C(CO)N1C(=NN=N1)N,Nc1nnnn1CCO,0.0
128,IMIPRAMINE,C[NH+](C)CCCN1c2ccccc2CCc2ccccc21,1.0
129,JMC20066569-49,O=C(NC1CCN(Cc2cccc(F)c2)CC1)c1cc(=O)c2ccc(F)cc2o1,1.0


# Skin Reaction


Dataset Description: Repetitive exposure to a chemical agent can induce an immune reaction in inherently susceptible individuals that leads to skin sensitization. The dataset used in this study was retrieved from the ICCVAM (Interagency Coordinating Committee on the Validation of Alternative Methods) report on the rLLNA.

Task Description: Binary classification. Given a drug SMILES string, predict whether it can cause skin reaction (1) or not (0).

Dataset Statistics: 404 drugs.

In [26]:
from tdc.single_pred import Tox
data = Tox(name = 'Skin Reaction')
split = data.get_split()

Found local copy...
Loading...
Done!


In [27]:
split['test']

Unnamed: 0,Drug_ID,Drug,Y
0,Benzocaine,CCOC(=O)c1ccc(N)cc1,0
1,"2-(Hydroxymethyl)pentane-1,3-diol",CCC(O)C(CO)CO,1
2,Undec-10-enal,C=CCCCCCCCCC=O,1
3,"2-[4-({2,6-Dimethoxy-4-methyl-5-[3-(trifluorom...",COc1cc(C)c2c(Oc3cccc(C(F)(F)F)c3)c(OC)cc(NC(C)...,0
4,3-Hydroxy-2-phenyl-4-quinolinecarboxylic acid,O=C(O)c1c(O)c(-c2ccccc2)nc2ccccc12,0
...,...,...,...
76,3-Propoxybenzoic acid,CCCOc1cccc(C(=O)O)c1,0
77,Glycerol,OCC(O)CO,0
78,Resorcinol,Oc1cccc(O)c1,1
79,Ethyl 4-iodobenzoate,CCOC(=O)c1ccc(I)cc1,1


In [28]:
split['train']

Unnamed: 0,Drug_ID,Drug,Y
0,(2-Bromo-5-propoxyphenyl)-(2-hydroxy-4-methoxy...,CCCOc1ccc(Br)c(C(=O)c2ccc(OC)cc2O)c1,0
1,(2E)-2-Methyl-2-butenal,CC=C(C)C=O,0
2,(2-oxo-1-phenylpyrrolidin-3-yl)-Triphenylphosp...,O=C1C([PH](c2ccccc2)(c2ccccc2)c2ccccc2)CCN1c1c...,0
3,(4Z)-2-Methyl-6-methylene-4-octene,C=C(C=CCC(C)C)CC,0
4,"[4-(Ethoxymethyl)-2,6-dimethoxyphenyl]boronic ...",CCOCc1cc(OC)c(B(O)O)c(OC)c1,0
...,...,...,...
278,Diphenylcyclopropenone,O=c1c(-c2ccccc2)c1-c1ccccc1,1
279,"Disperse Violet 1 (1,4-diamino-9,10-anthracene...",Nc1ccc(N)c2c1C(=O)c1ccccc1C2=O,1
280,HC Red 13,Nc1ccc(N(CO)CCO)cc1[N+](=O)[O-],1
281,Propyl gallate,CCCOC(=O)c1cc(O)c(O)c(O)c1,1


# Carcinogens

Dataset Description: A carcinogen is any substance, radionuclide, or radiation that promotes carcinogenesis, the formation of cancer. This may be due to the ability to damage the genome or to the disruption of cellular metabolic processes.

Task Description: Binary classification. Given a drug SMILES string, predict whether it can cause carcinogen.

Dataset Statistics: 278 drugs.

In [30]:
from tdc.single_pred import Tox
data = Tox(name = 'Carcinogens_Lagunin')
split = data.get_split()

Downloading...
100%|██████████| 20.2k/20.2k [00:00<00:00, 251kiB/s]
Loading...
Done!


In [32]:

split['train']

Unnamed: 0,Drug_ID,Drug,Y
0,Drug_1,CC(CCl)OC(C)CCl,1
1,Drug_2,COc1ccc(C(=O)/C(Br)=C\C(=O)O)cc1,1
2,Drug_3,O=S(=O)(O)c1cc(S(=O)(=O)O)c2c(/N=N/c3ccccc3)c(...,1
3,Drug_4,O=S(=O)(O)c1ccc(/N=N/c2cc(S(=O)(=O)O)c3ccccc3c...,1
4,Drug_6,NCCNc1cccc2ccccc12,1
...,...,...,...
191,Drug_275,CN(C)[C@@H]1C(O)=C(C(=O)NCNC(CCCCN)C(=O)O)C(=O...,0
192,Drug_276,COc1ccc(C[C@@](C)(N)C(=O)O)cc1OC,0
193,Drug_277,CN1CCC(=C2c3ccccc3CCc3sccc32)CC1,0
194,Drug_279,CCC(C)N1CCC2(CC1)N=C1C(=C3NC(=O)/C(C)=C/C=C/[C...,0


In [33]:
split['test']

Unnamed: 0,Drug_ID,Drug,Y
0,Drug_34,Cc1ccc(S(=O)(=O)Oc2ccc(/N=N/c3ccc(-c4ccc(/N=N/...,1
1,Drug_109,COc1cc(C(=O)OCCCN2CCCN(CCCOC(=O)c3cc(OC)c(OC)c...,0
2,Drug_241,N=C(N)NCCCCNC(=N)N,0
3,Drug_260,CCN1C[C@]2(COC)CC[C@H](O)[C@]34C1[C@H](C[C@H]2...,0
4,Drug_155,c1ccc(CNc2ncnc3nc[nH]c23)cc1,0
5,Drug_10,CC1(C)SC2C(NC(=O)[C@H](N)c3ccccc3)C(=O)N2[C@@H...,1
6,Drug_147,C[C@H]1C(=O)O[C@@H]2CCN3CC=C(COC(=O)[C@](C)(O)...,0
7,Drug_204,CS(=N)(=O)CC[C@H](N)C(=O)O,0
8,Drug_145,COc1ccc2[nH]cc(CCNC(C)=O)c2c1,0
9,Drug_156,CN1C2CCCC1CC(=O)C2,0


# Tox21

Dataset Description: Tox21 is a data challenge which contains qualitative toxicity measurements for 7,831 compounds on 12 different targets, such as nuclear receptors and stree response pathways.

Task Description: Binary classification. Given a drug SMILES string, predict the toxicity in a specific assay.

Dataset Statistics: Depends on various assays ~6,000 drugs.

Dataset Split: Random Split Scaffold Split

Note: Tox21 contains multiple assays data. To retrieve the specific labels for that assay, specify the label name in the label_name variable to the data loader. You can find all available label names by calling:

In [34]:
from tdc.utils import retrieve_label_name_list
label_list = retrieve_label_name_list('Tox21')

In [35]:
from tdc.single_pred import Tox
data = Tox(name = 'Tox21', label_name = label_list[0])
split = data.get_split()

Downloading...
100%|██████████| 712k/712k [00:00<00:00, 1.52MiB/s]
Loading...
Done!


In [36]:
split['test']

Unnamed: 0,Drug_ID,Drug,Y
0,TOX25807,Cc1ccc(S(=O)(=O)N[C@@H](CCCCN)C(=O)CCl)cc1,0.0
1,TOX27823,[2H]C([2H])([2H])[C@H](N)Cc1ccccc1.[2H]C([2H])...,0.0
2,TOX231,O=C(O)C=Cc1ccc(O)c(O)c1,1.0
3,TOX26517,N=C(N)n1cccn1,0.0
4,TOX24131,Nc1ccc(S(=O)(=O)Nc2ccnn2-c2ccccc2)cc1,0.0
...,...,...,...
1448,TOX24933,O=C=NCC1CCCC(CN=C=O)C1,0.0
1449,TOX4628,NC(=O)NNC(N)=O,0.0
1450,TOX5668,COC(=O)c1ccc(C)cc1,0.0
1451,TOX27784,CC(CN(C)C)C(C)(O)Cc1ccc(Cl)cc1,0.0


In [37]:
split['train']

Unnamed: 0,Drug_ID,Drug,Y
0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,0.0
1,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,0.0
2,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,0.0
3,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C,0.0
4,TOX25232,O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1,0.0
...,...,...,...
5081,TOX25182,Cc1nc2ccccn2c(=O)c1CCN1CCC(C(=O)c2ccc(F)cc2)CC1,0.0
5082,TOX25180,NC(=O)CN1CC(O)CC1=O,0.0
5083,TOX25184,CCCNCC(O)COc1ccccc1C(=O)CCc1ccccc1,0.0
5084,TOX2371,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,1.0
