In [1]:
import pandas as pd, numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from dataElaboration import ElabSmiles

## Useful Functions

In [2]:
def maskError(df):
    """Flag the rows of ``df`` that have an unusable SMILES or CAS entry.

    A row is flagged when at least one of the following holds:

    * its SMILES is missing, or ``Chem.MolFromSmiles`` returns ``None`` for it;
    * its CAS entry is missing, or is at most one character long
      (for example a ``"_"`` placeholder).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"CAS"`` column and at least one column whose name
        contains ``"smiles"``; the first such column is the one checked.

    Returns
    -------
    pandas.Series
        Boolean mask aligned on ``df.index``; ``True`` marks an invalid row.
    """
    smilesCol = df.columns[df.columns.str.contains("smiles")][0]
    smiles = df[smilesCol]
    cas = df["CAS"]

    # Missing SMILES are flagged explicitly instead of relying on the string
    # "nan" (produced by astype(str)) happening to be rejected by the parser.
    badSmiles = smiles.isna() | smiles.astype(str).apply(Chem.MolFromSmiles).isnull()

    # `NaN <= 1` evaluates to False, so a missing CAS has to be tested on its
    # own; otherwise such rows would be reported as valid.
    badCas = cas.isna() | (cas.str.len() <= 1)

    return badSmiles | badCas

In [3]:
def convLab(x):
    """Translate a raw dataset label into the notebook's common encoding.

    Returns ``"1"`` for the positive spellings, ``"0"`` for the negative
    spellings, ``"Undefined"`` for ``"No Data"``; any other value is handed
    back unchanged.
    """
    positives = ("DTer", "D", "Dter", "D(MT)", "DTer(MT)")
    negatives = ("No Evidence", "N")

    if x in positives:
        return "1"
    if x in negatives:
        return "0"
    return "Undefined" if x == "No Data" else x

## Importing CAESAR dataset

In [4]:
# Read only the CAESAR columns that are needed and rename them so that the
# key column ("CAS") and the per-source prefixes are consistent across datasets.
caesarColumns = {
    "SMILES": "CAESARsmiles",
    "CAESAR class": "CAESARlabel",
    "CAS_RN": "CAS",
    "Chemical Name": "CAESARname",
    "FDA Classification": "FDA",
}
caesar = pd.read_csv(
    "data/CAESAR_withStereo.csv", delimiter=";", usecols=caesarColumns.keys()
).rename(columns=caesarColumns)

In [5]:
# Display the CAESAR frame after the column selection and rename
caesar

Unnamed: 0,CAESARname,CAS,CAESARsmiles,FDA,CAESARlabel
0,Acetaldehyde,75-07-0,CC=O,A/B,N
1,Acetaminophen,103-90-2,CC(=O)NC1=CC=C(O)C=C1,B,N
2,Acetazolamide,59-66-5,CC(=O)NC1=NN=C(S1)S(N)(=O)=O,C,D
3,Acetohexamide,968-81-0,CC(=O)C1=CC=C(C=C1)S(=O)(=O)NC(=O)NC2CCCCC2,C,D
4,Acid(isotretinoin),4759-48-2,CC(\C=C\C1=C(C)CCCC1(C)C)=C/C=C/C(C)=C\C(O)=O,X,D
...,...,...,...,...,...
287,Tyropanoate,27293-82-9,CCCC(=O)NC1=C(I)C=C(I)C(CC(CC)C(O)=O)=C1I,D,D
288,Valproic Acid,99-66-1,CCCC(CCC)C(O)=O,D,D
289,Vitamin a,11103-57-4,CC(=C/CO)\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C,A (X if used above US RDA),N
290,Warfarin,81-81-2,CC(=O)CC(C1=CC=CC=C1)C2=C(O)C3=C(OC2=O)C=CC=C3,D (X according to manufactor),D


In [6]:
# Bring the CAESAR classes to the common "1" / "0" / "Undefined" encoding
caesar["CAESARlabel"] = caesar["CAESARlabel"].map(convLab)

In [7]:
# Class balance of the CAESAR labels after conversion
caesar.value_counts("CAESARlabel")

CAESARlabel
1    201
0     91
dtype: int64

## Importing P&G dataset

In [9]:
# Read only the P&G columns that are needed and rename them with the "PG" prefix;
# "CAS" keeps its name because it is the merge key.
pgColumns = {
    "SMILES": "PGsmiles",
    "DEVTOX-Data": "PGlabel",
    "CAS": "CAS",
    "Name": "PGname",
}
pg = pd.read_csv("data/PeG_Original.csv", usecols=pgColumns.keys()).rename(
    columns=pgColumns
)

In [10]:
# Bring the P&G labels to the common "1" / "0" / "Undefined" encoding
pg["PGlabel"] = pg["PGlabel"].map(convLab)

In [11]:
# Display the P&G frame after renaming and label conversion
pg

Unnamed: 0,CAS,PGsmiles,PGname,PGlabel
0,_,,Metals,1
1,6055-19-2,C1CNP(=O)(OC1)N(CCCl)CCCl.O,Cyclophosphamide,1
2,3778-73-2,C1CN(P(=O)(OC1)NCCCl)CCCl,Ifosfamide,1
3,470-90-6,CCOP(=O)(OCC)OC(=CCl)C1=C(C=C(C=C1)Cl)Cl,Chlorfenvinphos,1
4,298-00-0,COP(=S)(OC)OC1=CC=C(C=C1)[N+](=O)[O-],Methyl parathion,1
...,...,...,...,...
711,93106-60-6,CCN1CCN(CC1)C2=C(C=C3C(=C2)N(C=C(C3=O)C(=O)O)C...,Enrofloxacin,1
712,85721-33-1,C1CC1N2C=C(C(=O)C3=CC(=C(C=C32)N4CCNCC4)F)C(=O)O,Ciprofloxacin,1
713,115550-35-1,CN1CCN(CC1)C2=C(C=C3C4=C2OCN(N4C=C(C3=O)C(=O)O...,Marbofloxacin,1
714,100986-85-4,C[C@H]1COC2=C3N1C=C(C(=O)C3=CC(=C2N4CCN(CC4)C)...,Levofloxacin,1


In [12]:
# Inspect the P&G rows flagged by maskError before they are removed
pg[maskError(pg)]

Unnamed: 0,CAS,PGsmiles,PGname,PGlabel
0,_,,Metals,1
611,308064-18-8,,"Di-Me, Me Ph cyclosiloxanes (PMxMMy)",1


In [13]:
# Keep only the rows that pass the SMILES/CAS check. Filtering with the negated
# mask avoids building a throw-away sub-frame just to read its index and avoids
# mutating `pg` in place; .copy() makes the result an independent frame.
pg = pg[~maskError(pg)].copy()

## Merging by CAS & Elaboration

In [15]:
# Outer join on CAS: chemicals present in only one source are kept, with the
# columns of the other source left empty.
merged = pg.merge(caesar, how="outer", on="CAS")

In [19]:
# Display the result of the outer merge on CAS
merged

Unnamed: 0,CAS,PGsmiles,PGname,PGlabel,CAESARname,CAESARsmiles,FDA,CAESARlabel
0,6055-19-2,C1CNP(=O)(OC1)N(CCCl)CCCl.O,Cyclophosphamide,1,,,,
1,3778-73-2,C1CN(P(=O)(OC1)NCCCl)CCCl,Ifosfamide,1,,,,
2,470-90-6,CCOP(=O)(OCC)OC(=CCl)C1=C(C=C(C=C1)Cl)Cl,Chlorfenvinphos,1,,,,
3,298-00-0,COP(=S)(OC)OC1=CC=C(C=C1)[N+](=O)[O-],Methyl parathion,1,,,,
4,732-11-6,COP(=S)(OC)SCN1C(=O)C2=CC=CC=C2C1=O,Phosmet,1,,,,
...,...,...,...,...,...,...,...,...
891,91-81-6,,,,Tripelennamine,CN(C)CCN(CC1=CC=CC=C1)C2=CC=CC=N2,B,0
892,486-12-4,,,,Triprolidine,CC1=CC=C(C=C1)\C(=C/CN2CCCC2)C3=CC=CC=N3,C,1
893,27293-82-9,,,,Tyropanoate,CCCC(=O)NC1=C(I)C=C(I)C(CC(CC)C(O)=O)=C1I,D,1
894,11103-57-4,,,,Vitamin a,CC(=C/CO)\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C,A (X if used above US RDA),0


In [10]:
# Elaborate the SMILES of both sources with ElabSmiles (imported from
# dataElaboration): mixture / inorganic check, removal of stereochemical
# information and salt neutralization.
# Missing SMILES reach ElabSmiles as the string "nan" because of astype(str).
caesarSmilesText = merged["CAESARsmiles"].astype(str)
pgSmilesText = merged["PGsmiles"].astype(str)
merged["CAESARsmilesElaborated"] = caesarSmilesText.apply(ElabSmiles)
merged["PGsmilesElaborated"] = pgSmilesText.apply(ElabSmiles)

In [11]:
# Marker values in the elaborated SMILES columns that make a row unusable
# (presumably what ElabSmiles returns for inorganic compounds and mixtures).
invalidsSmiles = ["inorganic", "mixture"]
maskInvalids = (
    merged["CAESARsmilesElaborated"].isin(invalidsSmiles)
    | merged["PGsmilesElaborated"].isin(invalidsSmiles)
)

# FDA entries that end with "C" or contain "C (".
# The pattern is a raw string so that "\(" is not treated as an (invalid) string
# escape, and na=False yields a plain boolean mask directly: rows without an FDA
# class count as "not class C" without a later fillna on an object Series.
maskClassC = merged["FDA"].str.contains(r"C$|C \(", regex=True, na=False)

maskDiscard = maskInvalids | maskClassC

In [12]:
# Remove the rows selected by maskDiscard. Filtering with the negated mask avoids
# building a throw-away sub-frame just to read its index; .copy() makes the
# result an independent frame that later cells can modify without
# SettingWithCopyWarning.
merged = merged[~maskDiscard].copy()

In [13]:
# True for the chemicals that carry a label from both sources
shared = merged["PGlabel"].notna() & merged["CAESARlabel"].notna()

In [14]:
# Sanity check: list the shared chemicals whose elaborated SMILES differ between
# the two sources. The displayed frame is expected to be empty.
smilesDiffer = merged["CAESARsmilesElaborated"].ne(merged["PGsmilesElaborated"])
merged[shared & smilesDiffer]

Unnamed: 0,CAS,PGsmiles,PGname,PGlabel,CAESARname,CAESARsmiles,FDA,CAESARlabel,CAESARsmilesElaborated,PGsmilesElaborated


In [16]:
# Remove the CAS rows that are labelled by both sources with two different labels.
# Filtering with the negated mask replaces the in-place drop, which built a
# throw-away sub-frame just to read its index.
labelsDisagree = merged["PGlabel"].ne(merged["CAESARlabel"])
merged = merged[~(shared & labelsDisagree)].copy()

In [17]:
# Display the merged frame after the discard and discordant-label filters
merged

Unnamed: 0,CAS,PGsmiles,PGname,PGlabel,CAESARname,CAESARsmiles,FDA,CAESARlabel,CAESARsmilesElaborated,PGsmilesElaborated
0,6055-19-2,C1CNP(=O)(OC1)N(CCCl)CCCl.O,Cyclophosphamide,1,,,,,,O=P1(N(CCCl)CCCl)NCCCO1
1,3778-73-2,C1CN(P(=O)(OC1)NCCCl)CCCl,Ifosfamide,1,,,,,,O=P1(NCCCl)OCCCN1CCCl
2,470-90-6,CCOP(=O)(OCC)OC(=CCl)C1=C(C=C(C=C1)Cl)Cl,Chlorfenvinphos,1,,,,,,CCOP(=O)(OCC)OC(=CCl)c1ccc(Cl)cc1Cl
3,298-00-0,COP(=S)(OC)OC1=CC=C(C=C1)[N+](=O)[O-],Methyl parathion,1,,,,,,COP(=S)(OC)Oc1ccc([N+](=O)[O-])cc1
4,732-11-6,COP(=S)(OC)SCN1C(=O)C2=CC=CC=C2C1=O,Phosmet,1,,,,,,COP(=S)(OC)SCN1C(=O)c2ccccc2C1=O
...,...,...,...,...,...,...,...,...,...,...
887,34787-01-4,,,,Ticarcillin,[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@@]2([H])NC(=...,B,0,CC1(C)SC2C(NC(=O)C(C(=O)O)c3ccsc3)C(=O)N2C1C(=O)O,
891,91-81-6,,,,Tripelennamine,CN(C)CCN(CC1=CC=CC=C1)C2=CC=CC=N2,B,0,CN(C)CCN(Cc1ccccc1)c1ccccn1,
893,27293-82-9,,,,Tyropanoate,CCCC(=O)NC1=C(I)C=C(I)C(CC(CC)C(O)=O)=C1I,D,1,CCCC(=O)Nc1c(I)cc(I)c(CC(CC)C(=O)O)c1I,
894,11103-57-4,,,,Vitamin a,CC(=C/CO)\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C,A (X if used above US RDA),0,CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CCO,


## Finally, working with SMILES and Labels

In [18]:
# Stack the (elaborated SMILES, label) pairs of P&G and CAESAR into one
# two-column table; pairs with a missing SMILES or label are left out.
unifiedNames = ["SMILES", "label"]

pgElab = merged[["PGsmilesElaborated", "PGlabel"]].dropna()
pgElab.columns = unifiedNames

caesarElab = merged[["CAESARsmilesElaborated", "CAESARlabel"]].dropna()
caesarElab.columns = unifiedNames

concatElab = pd.concat([pgElab, caesarElab])

In [19]:
# Collapse duplicated SMILES: one row per unique SMILES, whose "label" cell holds
# the set of all labels recorded for that SMILES.
# NOTE(review): this rebinds concatElab, so re-running this cell on its own would
# group the already-aggregated frame instead of the concatenated one.
concatElab = concatElab.groupby("SMILES",as_index=False).agg(set)

In [20]:
# Keep only the SMILES with exactly one label, and only when that label is not
# "Undefined".
hasSingleLabel = concatElab["label"].apply(len) == 1
isDefined = concatElab["label"].apply(lambda labels: "Undefined" not in labels)

# .loc + .copy() gives an independent frame, so the assignment below no longer
# writes into a slice of concatElab (the cause of SettingWithCopyWarning).
DEF = concatElab.loc[hasSingleLabel & isDefined].copy()

# Every remaining set has one element: unwrap it into a plain label value
DEF["label"] = DEF["label"].apply(lambda labels: next(iter(labels)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DEF ["label"] = DEF ["label"].apply(lambda x: list(x)[0])


In [21]:
# Class balance of the final dataset
DEF.value_counts("label")

label
1    609
0    124
dtype: int64

In [22]:
# Number of unique SMILES kept in the final dataset
len(DEF)

733

## Internal diversity

In [1]:
# The moses package is required here: it provides a fast internal-diversity computation
from moses.metrics import internal_diversity

  _mcf.append(_pains, sort=True)['smarts'].values]


In [5]:
# Internal diversity of the final SMILES set, computed with moses
internal_diversity(DEF["SMILES"])

0.9117522862905257