# Deals with the NaNs

In [84]:
# Import all the modules we've used before

import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
from matplotlib import pyplot as plt
import seaborn as sns

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import rdkit
# Render the figure in a notebook:
%matplotlib inline  

from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score

In [89]:
# Read the Tox21 table from the working directory and display it.
# NOTE(review): an earlier version dropped the rows labelled
# 1322, 2290, 2297, 3558, 4565, 4649, 5538, 6723 and 7830 right after loading;
# that drop is disabled because those rows are reportedly no longer in the
# file -- confirm against the current tox21.csv.
df = pd.read_csv(filepath_or_buffer='tox21.csv')
df


Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,,,,,,,,0.0,,0.0,,,TOX2725,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...
7827,1.0,1.0,0.0,0.0,1.0,0.0,,,0.0,0.0,,0.0,TOX2370,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...
7828,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX2371,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...
7829,1.0,1.0,0.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,TOX2377,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...


We will get rid of the nontoxicnans (compounds with no positive assay but at least one NaN), because they may be hiding toxicity under their NaN. We will keep the toxicnans, because we know they're toxic.

In [90]:
# Keep only the assay-result columns; the identifier columns are not labels.
df_only_data = df.drop(columns=['mol_id', 'smiles'])

# Classify every row at once instead of looping over rows and cells.
# (The previous loop looked rows up with .iloc using index *labels*, which is
# only correct when the index happens to be the default 0..n-1 range.)

# A cell counts as a proper measurement only when it equals 0 or 1. NaN never
# compares equal to anything, so a missing result makes the row "has a NaN".
is_measured = df_only_data.eq(0) | df_only_data.eq(1)
has_nan = (~is_measured.all(axis=1)).to_numpy()

# A row is toxic as soon as any single assay is positive (== 1).
is_toxic = df_only_data.eq(1).any(axis=1).to_numpy()

row_labels = df_only_data.index

# Index labels of compounds that are toxic and contain a NaN.
toxicnans = row_labels[has_nan & is_toxic].tolist()
# Index labels of compounds with no positive assay but at least one NaN.
nontoxicnans = row_labels[has_nan & ~is_toxic].tolist()
# Index labels of compounds with no NaN at all, toxic or not.
nonans = row_labels[~has_nan].tolist()
# Index labels of every toxic compound, with or without NaN.
toxics = row_labels[is_toxic].tolist()

In [91]:
# Build the frame that will carry the final per-compound label: the SMILES
# string, the molecule id, and a 'toxic' flag that starts at 0 for every row.
df_toxicity = df[['smiles', 'mol_id']].assign(toxic=0)
toxicnans

[0,
 7,
 14,
 20,
 22,
 23,
 33,
 35,
 42,
 44,
 46,
 48,
 49,
 53,
 54,
 64,
 65,
 71,
 74,
 77,
 78,
 93,
 94,
 97,
 99,
 102,
 105,
 111,
 116,
 118,
 129,
 130,
 131,
 138,
 140,
 144,
 147,
 149,
 158,
 159,
 163,
 166,
 170,
 171,
 180,
 188,
 190,
 195,
 196,
 197,
 199,
 200,
 205,
 206,
 210,
 212,
 213,
 215,
 218,
 221,
 222,
 223,
 226,
 228,
 232,
 233,
 234,
 235,
 236,
 237,
 239,
 243,
 244,
 245,
 248,
 253,
 254,
 255,
 267,
 269,
 271,
 273,
 280,
 282,
 283,
 288,
 291,
 292,
 293,
 294,
 296,
 298,
 299,
 300,
 302,
 304,
 313,
 318,
 319,
 328,
 331,
 333,
 335,
 337,
 338,
 344,
 345,
 346,
 350,
 353,
 358,
 367,
 378,
 383,
 386,
 387,
 393,
 395,
 398,
 413,
 414,
 417,
 419,
 420,
 422,
 430,
 433,
 434,
 437,
 441,
 443,
 447,
 448,
 450,
 456,
 460,
 463,
 470,
 480,
 487,
 490,
 493,
 494,
 496,
 499,
 500,
 501,
 503,
 504,
 513,
 518,
 519,
 524,
 525,
 531,
 534,
 536,
 537,
 539,
 541,
 548,
 550,
 554,
 558,
 563,
 565,
 566,
 584,
 589,
 595,
 596,
 

In [92]:
# Set toxic = 1 for every row whose index label is listed in toxicnans.
# One label-based assignment replaces the row-by-row loop. It also raises if a
# label is missing, where the loop would silently append a new row.
df_toxicity.loc[toxicnans, 'toxic'] = 1
df_toxicity

Unnamed: 0,smiles,mol_id,toxic
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,1
1,CCN1C(=O)NC(c2ccccc2)C1=O,TOX3020,0
2,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,TOX3024,0
3,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,TOX3027,0
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0
...,...,...,...
7826,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,TOX2725,0
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,1
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1


In [93]:
# Remove the rows listed in nontoxicnans: a NaN in an otherwise negative row
# could be hiding a positive result, so those compounds cannot be trusted as
# non-toxic. drop() returns a new frame; df_toxicity itself is left untouched.
df_no_nontoxic_nans = df_toxicity.drop(index=nontoxicnans)
df_no_nontoxic_nans

Unnamed: 0,smiles,mol_id,toxic
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,1
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0
6,O=S(=O)(Cl)c1ccccc1,TOX6619,0
7,O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1,TOX25232,1
12,CC(C)COC(=O)C(C)C,TOX6612,0
...,...,...,...
7818,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,TOX25188,0
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,1
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,0
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1


In [94]:
# Set toxic = 1 for every row whose index label is listed in toxics.
# One label-based assignment replaces the row-by-row loop. It also raises if a
# label is missing, where the loop would silently append a new row.
df_no_nontoxic_nans.loc[toxics, 'toxic'] = 1
df_no_nontoxic_nans  # this is our final dataframe.

Unnamed: 0,smiles,mol_id,toxic
0,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,TOX3021,1
4,CC(O)(P(=O)(O)O)P(=O)(O)O,TOX20800,0
6,O=S(=O)(Cl)c1ccccc1,TOX6619,0
7,O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1,TOX25232,1
12,CC(C)COC(=O)C(C)C,TOX6612,0
...,...,...,...
7818,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,TOX25188,0
7827,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,TOX2370,1
7828,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,TOX2371,1
7829,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,TOX2377,1


In [None]:
# Persist the final labelled frame next to the notebook. The index is written
# as well, so each row keeps the label it has in df_no_nontoxic_nans.
df_no_nontoxic_nans.to_csv(path_or_buf='toxicity_no_nontoxic_nans.csv')