In [1]:
import pandas as pd, numpy as np
import ast

In [2]:
from dataElaboration import ElabSmiles

In [3]:
# Raw ChEMBL export; the filters used to build it are described in the main text of the work.
rawData = pd.read_csv("data/chemblRawData.csv")

In [4]:
# Derive a processed SMILES for every record with the project helper ElabSmiles.
# NOTE(review): judging by the filter applied a few cells below, ElabSmiles can also
# return the labels "inorganic" / "mixture" instead of a SMILES — confirm in dataElaboration.
rawData["elab_smiles"]=rawData["canonical_smiles"].apply(ElabSmiles)

In [5]:
# For each processed SMILES, collect the set of distinct withdrawn_flag values
# carried by the records that share it.
groupWD = (
    rawData
    .groupby(by="elab_smiles", group_keys=False)["withdrawn_flag"]
    .apply(set)
)

In [6]:
# Sanity check: list the SMILES whose records disagree on withdrawn_flag.
# An empty result means the flag is consistent inside every group.
groupWD[groupWD.apply(len) > 1]

Series([], Name: withdrawn_flag, dtype: object)

In [7]:
# Labels found in "elab_smiles" that mark records to be excluded from the analysis.
invalidMols=["inorganic","mixture"]
# Boolean mask over rawData rows: True where "elab_smiles" is one of those labels.
maskInvalids = rawData["elab_smiles"].isin(invalidMols)

In [8]:
# Number of records flagged as invalid.
int(maskInvalids.sum())

33

In [9]:
# Remove the flagged records from rawData in place; the remaining row labels are unchanged.
rawData.drop(index=rawData[maskInvalids].index, inplace=True)

## Predicting the unique SMILES not shared with the training set

In [10]:
# Training-set table; its SMILES column is used in the next cell to exclude molecules already seen in training.
dataTrain = pd.read_csv("data/curedData.csv")

In [11]:
# Distinct processed SMILES found in the ChEMBL table.
uniqueSmiles = pd.Series(rawData["elab_smiles"].unique())

# Discard the SMILES that are already present in our training set
# (the surviving entries keep their original positional labels).
sharedSmiles = uniqueSmiles.isin(dataTrain.SMILES)
uniqueSmiles = uniqueSmiles[~sharedSmiles]

# One-column frame of the SMILES to be predicted by the TISBE model.
uniqueSmiles = uniqueSmiles.to_frame("elab_smiles")

In [12]:
# Preview the SMILES that will be passed to the model.
uniqueSmiles

Unnamed: 0,elab_smiles
0,COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC
3,CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21
4,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1
5,CC1(C)C(C(=O)O)N2C(=O)CC2S1(=O)=O
6,CC1(Cn2ccnn2)C(C(=O)O)N2C(=O)CC2S1(=O)=O
...,...
1785,Cc1cn2nc(-c3cc(=O)n4cc(N5CCNC6(CC6)C5)ccc4n3)c...
1786,CCCCCCC(=O)OCC(COC(=O)CCCCCC)OC(=O)CCCCCC
1787,NC(=O)OCC(N)Cc1ccccc1
1788,Cc1ccc(C2OC(CO)C(O)C(O)C2O)cc1Cc1ccc(-c2ccc(F)...


In [13]:
from consensusModel import TISBE as model

In [14]:
# Run the TISBE consensus model on the remaining SMILES and store its output in "pred".
# NOTE(review): the meaning of the labels is defined by the model; the analysis section
# of this notebook names pred == 1 "Tox" and pred == 0 "NoTox" — confirm against consensusModel.
uniqueSmiles["pred"] = model.predictFromSmiles(uniqueSmiles["elab_smiles"].values)

In [15]:
# Attach the predictions to the ChEMBL records. The right join keeps only the
# records whose processed SMILES appears in uniqueSmiles, i.e. the predicted ones.
predictedData = pd.merge(rawData, uniqueSmiles, on="elab_smiles", how="right")

In [16]:
# Distribution of the predicted labels over the unique SMILES.
uniqueSmiles.value_counts("pred")

pred
1    1364
0     124
dtype: int64

## Grouping Chemicals by SMILES

In [17]:
import itertools

In [18]:
def groupingStrings (group):
    """Collapse the values of one group into a single cell value.

    A group holding exactly one value yields that value unchanged (whatever
    its type); any other group yields its values joined with " | ".
    """
    items = list(group)
    return items[0] if len(items) == 1 else " | ".join(items)

def groupingTuple(group):
    """Merge the literal sequences of one group into a tuple of distinct items.

    Each element of ``group`` is the text of a Python literal sequence
    (for example ``"('N05CX03', 'N05CM15')"``). Every element is parsed with
    ``ast.literal_eval``, the parsed sequences are flattened, and duplicates
    are removed.

    The result is sorted so that it is reproducible: iterating a set of
    strings directly gives an order that can change from one interpreter run
    to the next because of string hash randomisation.

    Returns:
        tuple: the distinct items in ascending order; ``()`` for an empty group.
    """
    parsed = (ast.literal_eval(x) for x in group)
    uniq = set(itertools.chain.from_iterable(parsed))
    return tuple(sorted(uniq))

In [19]:
# Collapse the records to one row per processed SMILES:
#   - pred and withdrawn_flag take pandas' "first" aggregate of the group;
#   - canonical_smiles, pref_name and chembl_id are merged by groupingStrings;
#   - level5 codes are merged by groupingTuple.
curedData = (
    predictedData
    .groupby("elab_smiles", as_index=False)
    .agg({
        "pred": "first",
        "canonical_smiles": groupingStrings,
        "withdrawn_flag": "first",
        "pref_name": groupingStrings,
        "chembl_id": groupingStrings,
        "level5": groupingTuple,
    })
)

In [20]:
# Preview the table grouped by processed SMILES.
curedData

Unnamed: 0,elab_smiles,pred,canonical_smiles,withdrawn_flag,pref_name,chembl_id,level5
0,Brc1c(NC2=NCCN2)ccc2nccnc12,1,Brc1c(NC2=NCCN2)ccc2nccnc12,0,BRIMONIDINE,CHEMBL844,"(S01EA05, D11AX21, S01GA07)"
1,C#CC(C)(O)CC,1,C#CC(C)(O)CC,0,MEPARFYNOL,CHEMBL501613,"(N05CX03, N05CM15)"
2,C#CC(O)(C=CCl)CC,1,C#CC(O)(/C=C/Cl)CC,0,ETHCHLORVYNOL,CHEMBL591,"(N05CM08,)"
3,C#CC1(O)CCC2C3=C(C=CC21CC)C1=C(CC3)CC(=O)CC1,1,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3C=...,0,GESTRINONE,CHEMBL1868702,"(G03XA02,)"
4,C#CC1(O)CCC2C3C(C)C=C4CC(=O)CCC4C3CCC21C,1,C#C[C@]1(O)CC[C@H]2[C@H]3[C@H](CC[C@@]21C)C1=C...,0,TIBOLONE,CHEMBL2103774,"(G03CX01,)"
...,...,...,...,...,...,...,...
1483,c1ccc2c(CC3=NCCN3)cccc2c1,1,c1ccc2c(CC3=NCCN3)cccc2c1,0,NAPHAZOLINE,CHEMBL761,"(R01AB02, S01GA01, S01GA51, R01AA08)"
1484,c1ccc2c(c1)CCCC2C1=NCCN1,1,c1ccc2c(c1)CCCC2C1=NCCN1,0,TETRAHYDROZOLINE,CHEMBL1266,"(S01GA52, R01AB03, S01GA02, R01AA06)"
1485,c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,1,c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,0,MEQUITAZINE,CHEMBL73451,"(R06AD07,)"
1486,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,1,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,0,PIRIBEDIL,CHEMBL1371770,"(N04BC08,)"


## Final manual discarding step

In [21]:
# Processed SMILES discarded by hand because the original chemicals grouped
# under each of them are discordant.
toDiscard = [
    "O=C(O)O",          # carbonic acid
    "CC(=O)O",          # acetic acid
    "O=C(O)c1ccccc1O",  # salicylic acid
    "CC(O)C(=O)O",      # lactic acid
]

In [22]:
# Remove, in place, the rows whose processed SMILES is in the manual discard list.
curedData.drop(
    index=curedData[curedData["elab_smiles"].isin(toDiscard)].index,
    inplace=True,
)

## Analysis

In [37]:
# Reload the curated table from disk; the "level5" column is parsed from its text form
# back into a Python literal with ast.literal_eval.
# NOTE(review): no cell visible in this notebook writes results/curedChembl.csv (the
# execution counts jump from 22 to 37) — confirm the file matches the curedData built above.
curedData=pd.read_csv("results/curedChembl.csv", converters={"level5":ast.literal_eval})

In [39]:
# Number of molecules for each withdrawn_flag value.
curedData.value_counts("withdrawn_flag")

withdrawn_flag
0    1386
1      98
dtype: int64

In [40]:
# First character of every level5 code, de-duplicated per molecule
# (presumably the ATC level-1 group letter — the codes look like ATC codes).
curedData["level1"] = curedData["level5"].apply(
    lambda codes: tuple({code[0] for code in codes})
)

In [41]:
# Select the molecules that are not flagged as withdrawn and that carry at least
# one of the level-1 letters A, J, C, D, N or R.
maskATC = (
    curedData["level1"].apply(lambda classes: not set("AJCDNR").isdisjoint(classes))
    & (curedData["withdrawn_flag"] == 0)
)

In [42]:
# Rows selected by maskATC.
curedData[maskATC]

Unnamed: 0,elab_smiles,pred,canonical_smiles,withdrawn_flag,pref_name,chembl_id,level5,level1
0,Brc1c(NC2=NCCN2)ccc2nccnc12,1,Brc1c(NC2=NCCN2)ccc2nccnc12,0,BRIMONIDINE,CHEMBL844,"(S01EA05, D11AX21, S01GA07)","(D, S)"
1,C#CC(C)(O)CC,1,C#CC(C)(O)CC,0,MEPARFYNOL,CHEMBL501613,"(N05CX03, N05CM15)","(N,)"
2,C#CC(O)(C=CCl)CC,1,C#CC(O)(/C=C/Cl)CC,0,ETHCHLORVYNOL,CHEMBL591,"(N05CM08,)","(N,)"
8,C#CCN(C)C(C)Cc1ccccc1,1,C#CCN(C)[C@H](C)Cc1ccccc1,0,SELEGILINE,CHEMBL972,"(N04BD01,)","(N,)"
9,C#CCN(C)Cc1ccccc1,1,C#CCN(C)Cc1ccccc1,0,PARGYLINE,CHEMBL673,"(C02KC01,)","(C,)"
...,...,...,...,...,...,...,...,...
1479,c1ccc2c(CC3=NCCN3)cccc2c1,1,c1ccc2c(CC3=NCCN3)cccc2c1,0,NAPHAZOLINE,CHEMBL761,"(R01AB02, S01GA01, S01GA51, R01AA08)","(S, R)"
1480,c1ccc2c(c1)CCCC2C1=NCCN1,1,c1ccc2c(c1)CCCC2C1=NCCN1,0,TETRAHYDROZOLINE,CHEMBL1266,"(S01GA52, R01AB03, S01GA02, R01AA06)","(S, R)"
1481,c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,1,c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,0,MEQUITAZINE,CHEMBL73451,"(R06AD07,)","(R,)"
1482,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,1,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,0,PIRIBEDIL,CHEMBL1371770,"(N04BC08,)","(N,)"


In [43]:
from collections import Counter

In [44]:
# Occurrences of each level-1 letter among the rows selected by maskATC.
# NOTE(review): this value is replaced by the following cell before it is ever displayed.
counts = Counter(itertools.chain.from_iterable(curedData.loc[maskATC, "level1"]))

In [45]:
# Occurrences of each level-1 letter among the selected rows predicted as 1.
# NOTE(review): same expression as the one assigned to countsTox further down.
counts = Counter(
    itertools.chain.from_iterable(
        curedData.loc[maskATC & (curedData["pred"] == 1), "level1"]
    )
)

In [46]:
# Show the per-letter counts for the rows predicted as 1.
counts

Counter({'N': 244,
         'C': 177,
         'A': 167,
         'D': 139,
         'J': 126,
         'R': 98,
         'S': 77,
         'G': 26,
         'H': 19,
         'V': 8,
         'B': 8,
         'M': 5,
         'P': 4,
         'L': 1})

In [47]:
# Level-1 letter counts for the selected molecules with pred == 1.
countsTox = Counter(
    itertools.chain.from_iterable(
        curedData.loc[maskATC & (curedData["pred"] == 1), "level1"]
    )
)

In [48]:
# Show the per-letter counts for pred == 1.
countsTox

Counter({'N': 244,
         'C': 177,
         'A': 167,
         'D': 139,
         'J': 126,
         'R': 98,
         'S': 77,
         'G': 26,
         'H': 19,
         'V': 8,
         'B': 8,
         'M': 5,
         'P': 4,
         'L': 1})

In [49]:
# Level-1 letter counts for the selected molecules with pred == 0.
countsNoTox = Counter(
    itertools.chain.from_iterable(
        curedData.loc[maskATC & (curedData["pred"] == 0), "level1"]
    )
)

In [50]:
# Show the per-letter counts for pred == 0.
countsNoTox

Counter({'J': 46,
         'R': 21,
         'N': 13,
         'C': 13,
         'A': 13,
         'D': 5,
         'S': 3,
         'P': 1,
         'G': 1,
         'M': 1,
         'H': 1})

In [56]:
# Stack the two counters into one table: row 0 holds the pred == 0 counts,
# row 1 the pred == 1 counts; a letter missing from one counter becomes NaN.
countsPreds = pd.concat(
    [pd.DataFrame([letterCounts]) for letterCounts in (countsNoTox, countsTox)],
    axis=0,
    ignore_index=True,
)

In [60]:
# One row per level-1 letter; column 0 = pred 0 counts, column 1 = pred 1 counts.
countsPreds.transpose()

Unnamed: 0,0,1
J,46.0,126.0
N,13.0,244.0
R,21.0,98.0
C,13.0,177.0
A,13.0,167.0
D,5.0,139.0
P,1.0,4.0
S,3.0,77.0
G,1.0,26.0
M,1.0,5.0


## TEDDY Screening

In [67]:
# Load the TEDDY table; its "elabSMILES" column is the input of the prediction cell below.
teddy = pd.read_csv("data/teddy.csv")

In [73]:
# Predict every TEDDY entry with the same model used for the ChEMBL molecules.
# NOTE(review): relies on `model` imported in an earlier cell and assumes "elabSMILES"
# was produced the same way as "elab_smiles" above — confirm.
teddy["prediction"]=model.predictFromSmiles(teddy["elabSMILES"].values)

In [74]:
# TEDDY entries predicted as 0.
teddy.loc[teddy["prediction"] == 0]

Unnamed: 0,MedProduct,ActiveSub,firstPediatricDate,Orphan,TherapeuticArea,ATC,MainCondition,MinimumApprovedAge,PediatricTherapeuticIndications,elabSMILES,prediction
8,Cayston,aztreonam lysine,2012,yes,Infectious diseases,J- Antiinfectives,Cystic Fibrosis Respiratory Tract Infections,> 6 years,Cayston is indicated for the suppressive thera...,CC1C(=NC(=O)C(NOC(C)(C)C(=O)O)c2csc(N)n2)C(=O)...,0
37,Orkambi,"Lumacaftor, ivacaftor",2015,no,Respiratory,R- Respiratory,Cystic Fibrosis,> 2 years,Orkambi tablesis indicated for the treatment o...,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c...,0
52,Sirturo,bedaquiline fumarate,2019,yes,Infectious diseases,J- Antiinfectives,"Tuberculosis, Multidrug-Resistant",> 12 years weighing at least 30 kg,SIRTURO is indicated for use as part of an app...,COc1nc2ccc(Br)cc2cc1C(c1ccccc1)C(O)(CCN(C)C)c1...,0
56,Sprycel,dasatinib,2018,yes,Oncology,L- Antineoplastic and immunomodulating,"Leukemia, Myelogenous, Chronic, BCR-ABL Positi...",> 1 year,SPRYCEL is indicated for the treatment of paed...,Cc1nc(Nc2ncc(C(=O)Nc3c(C)cccc3Cl)s2)cc(N2CCN(C...,0
80,Zavicefta,"avibactam sodium, ceftazidime pentahydrate",2020,no,Infectious diseases,J- Antiinfectives,"Pneumonia, Bacterial Soft Tissue Infections Pn...",> 3 months,Zavicefta is indicated in adults and paediatri...,CC(C)(ON=C(C(=O)NC1C(=O)N2C(C(=O)[O-])=C(C[n+]...,0
82,Zinforo,Ceftaroline fosamil,2016,no,Infectious diseases,J- Antiinfectives,Community-Acquired Infections Pneumonia Skin D...,all ages,Zinforo is indicated for the treatment of the ...,CCON=C(C(=O)NC1C(=O)N2C(C(=O)O)=C(Sc3nc(-c4cc[...,0
