## Cleaning Alt Inactives
The alternative inactive dataset is cleaned, and the canonical SMILES of each compound are retrieved from PubChem.

In [1]:
# importing libraries

import pandas as pd
import numpy as np
import pubchempy as pcp
import time

In [2]:
# loading the alternative inactive dataset into a dataframe

df = pd.read_csv(filepath_or_buffer='alt_inactive.csv')
df

Unnamed: 0,sid,cid,sidsrcname,sidextid,subssynonym,sidmdate,depdate,depcatg,annotation,activity
0,845496,647846,MLSMR,MLS000032637,MLS000032637|SMR000006133|1-[2-(3-Cyano-8-meth...,20190115,20050604,Governmental Organizations|NIH Initiatives,,Inactive
1,846659,649054,MLSMR,MLS000033467,"MLS000033467|SMR000010003|[5-(3,4,5-Trimethoxy...",20120301,20050604,Governmental Organizations|NIH Initiatives,,Inactive
2,856443,657803,MLSMR,MLS000038396,MLS000038396|SMR000034155,20120301,20050629,Governmental Organizations|NIH Initiatives,,Inactive
3,857902,6603435,MLSMR,MLS000080127,MLS000080127|SMR000036255,20190206,20050629,Governmental Organizations|NIH Initiatives,,Inactive
4,860525,661810,MLSMR,MLS000080359,MLS000080359|SMR000037404,20120301,20050629,Governmental Organizations|NIH Initiatives,,Inactive
...,...,...,...,...,...,...,...,...,...,...
124,24827964,2405263,MLSMR,MLS001005282,MLS001005282|SMR000348611,20120301,20070705,Governmental Organizations|NIH Initiatives,,Inactive
125,24829061,2124454,MLSMR,MLS001005386,MLS001005386|SMR000348751,20120301,20070705,Governmental Organizations|NIH Initiatives,,Inactive
126,24830089,1357612,MLSMR,MLS000770824,MLS000770824|SMR000344032,20210728,20070705,Governmental Organizations|NIH Initiatives,,Inactive
127,24833932,6362905,MLSMR,MLS000773474,MLS000773474|SMR000364172,20190115,20070705,Governmental Organizations|NIH Initiatives,,Inactive


In [3]:
# counting the missing values in every column

df.isna().sum()

sid              0
cid              0
sidsrcname       0
sidextid         0
subssynonym      0
sidmdate         0
depdate          0
depcatg          0
annotation     129
activity         0
dtype: int64

In [4]:
# listing all column names of the raw dataframe before any columns are removed

df.columns

Index(['sid', 'cid', 'sidsrcname', 'sidextid', 'subssynonym', 'sidmdate',
       'depdate', 'depcatg', 'annotation', 'activity'],
      dtype='object')

In [5]:
# removing unnecessary columns: only the PubChem compound id ('cid') and the
# activity label are needed downstream.
# The wanted columns are selected (instead of dropping the unwanted ones) so
# that this cell can be re-run safely; a repeated df.drop(...) would raise a
# KeyError because the dropped columns no longer exist.

df = df[['cid', 'activity']]
df

Unnamed: 0,cid,activity
0,647846,Inactive
1,649054,Inactive
2,657803,Inactive
3,6603435,Inactive
4,661810,Inactive
...,...,...
124,2405263,Inactive
125,2124454,Inactive
126,1357612,Inactive
127,6362905,Inactive


In [6]:
# extracting canonical smiles (and IUPAC names) from pubchem database

cid_list = np.array(df.cid)

# plain Python ints are passed to pubchempy; wrapping the array in np.nditer
# is unnecessary (it yields 0-d arrays that only work via str() coercion)
requested_cids = cid_list.tolist()

# perf_counter is a monotonic clock meant for measuring elapsed time
start = time.perf_counter()

smiles_df = pcp.get_properties(['CanonicalSMILES', 'IUPACName'], requested_cids, 'cid', as_dataframe = True)

end = time.perf_counter()

# The result is indexed by CID. Later cells discard that index and pair the
# rows with df purely by position, so fail loudly here if PubChem did not
# return exactly one row per requested CID, in the requested order.
if smiles_df.index.tolist() != requested_cids:
    raise ValueError("PubChem result does not match the requested CIDs one-to-one and in order")

print("The time take in seconds for extraction of canonical smiles is :  ", end-start)
smiles_df

The time take in seconds for extraction of canonical smiles is :   3.5881495475769043


Unnamed: 0_level_0,CanonicalSMILES,IUPACName
CID,Unnamed: 1_level_1,Unnamed: 2_level_1
647846,CC1=C2C(=CC=C1)C=C(C(=N2)NCCNC(=O)NC3CCCCC3)C#N,1-[2-[(3-cyano-8-methylquinolin-2-yl)amino]eth...
649054,CCCCOC(=O)CSC1=NN=C(O1)C2=CC(=C(C(=C2)OC)OC)OC,"butyl 2-[[5-(3,4,5-trimethoxyphenyl)-1,3,4-oxa..."
657803,CN1C(=CN=C1C2CC(=O)C3=C(O2)C=CC(=C3)Cl)Cl,"6-chloro-2-(5-chloro-1-methylimidazol-2-yl)-2,..."
6603435,CCCCCCN1C2=C(CCC2)C(=N)C3=C1CCC3.Br,"2-hexyl-2-azatricyclo[7.3.0.03,7]dodeca-1(9),3..."
661810,CC(=O)C1=C(C2=C(S1)N=C(C=C2C(F)(F)F)C3=CC=CS3)N,1-[3-amino-6-thiophen-2-yl-4-(trifluoromethyl)...
...,...,...
2405263,CNC(=O)COC(=O)C1=CC=CC=C1C(=O)C2=CC(=C(C=C2)Cl...,[2-(methylamino)-2-oxoethyl] 2-(4-chloro-3-nit...
2124454,CC1=C(C(=NO1)C)COC(=O)C2=CC(=C(N=C2)Cl)Cl,"(3,5-dimethyl-1,2-oxazol-4-yl)methyl 5,6-dichl..."
1357612,CC1=CC(=CC(=C1)OCC(=O)N2CCN(CC2)C3=CC=CC=C3OC)C,"2-(3,5-dimethylphenoxy)-1-[4-(2-methoxyphenyl)..."
6362905,C1=CC=C(C=C1)S(=O)(=O)N=C2N(C(=C(S2)C=O)Cl)CCCCl,(NZ)-N-[4-chloro-3-(3-chloropropyl)-5-formyl-1...


In [8]:
# removing indexes: drop = True discards the CID index shown above instead of
# keeping it as a column, leaving a default 0..n-1 index. From here on the rows
# can only be matched back to df by position.

smiles_df = smiles_df.reset_index(drop = True)
smiles_df

Unnamed: 0,CanonicalSMILES,IUPACName
0,CC1=C2C(=CC=C1)C=C(C(=N2)NCCNC(=O)NC3CCCCC3)C#N,1-[2-[(3-cyano-8-methylquinolin-2-yl)amino]eth...
1,CCCCOC(=O)CSC1=NN=C(O1)C2=CC(=C(C(=C2)OC)OC)OC,"butyl 2-[[5-(3,4,5-trimethoxyphenyl)-1,3,4-oxa..."
2,CN1C(=CN=C1C2CC(=O)C3=C(O2)C=CC(=C3)Cl)Cl,"6-chloro-2-(5-chloro-1-methylimidazol-2-yl)-2,..."
3,CCCCCCN1C2=C(CCC2)C(=N)C3=C1CCC3.Br,"2-hexyl-2-azatricyclo[7.3.0.03,7]dodeca-1(9),3..."
4,CC(=O)C1=C(C2=C(S1)N=C(C=C2C(F)(F)F)C3=CC=CS3)N,1-[3-amino-6-thiophen-2-yl-4-(trifluoromethyl)...
...,...,...
124,CNC(=O)COC(=O)C1=CC=CC=C1C(=O)C2=CC(=C(C=C2)Cl...,[2-(methylamino)-2-oxoethyl] 2-(4-chloro-3-nit...
125,CC1=C(C(=NO1)C)COC(=O)C2=CC(=C(N=C2)Cl)Cl,"(3,5-dimethyl-1,2-oxazol-4-yl)methyl 5,6-dichl..."
126,CC1=CC(=CC(=C1)OCC(=O)N2CCN(CC2)C3=CC=CC=C3OC)C,"2-(3,5-dimethylphenoxy)-1-[4-(2-methoxyphenyl)..."
127,C1=CC=C(C=C1)S(=O)(=O)N=C2N(C(=C(S2)C=O)Cl)CCCCl,(NZ)-N-[4-chloro-3-(3-chloropropyl)-5-formyl-1...


In [9]:
# writing only the canonical smiles column to its own csv file (no row index)

smiles_df.loc[:, 'CanonicalSMILES'].to_csv(path_or_buf="alt_inactive_smiles.csv", index=False)

In [10]:
# assembling the final dataframe: the cid/activity columns of df are placed
# side by side with the smiles and IUPAC name columns of smiles_df
# (rows are paired through their shared 0..n-1 index)

final_df = pd.concat(objs=[df, smiles_df], axis='columns')
final_df

Unnamed: 0,cid,activity,CanonicalSMILES,IUPACName
0,647846,Inactive,CC1=C2C(=CC=C1)C=C(C(=N2)NCCNC(=O)NC3CCCCC3)C#N,1-[2-[(3-cyano-8-methylquinolin-2-yl)amino]eth...
1,649054,Inactive,CCCCOC(=O)CSC1=NN=C(O1)C2=CC(=C(C(=C2)OC)OC)OC,"butyl 2-[[5-(3,4,5-trimethoxyphenyl)-1,3,4-oxa..."
2,657803,Inactive,CN1C(=CN=C1C2CC(=O)C3=C(O2)C=CC(=C3)Cl)Cl,"6-chloro-2-(5-chloro-1-methylimidazol-2-yl)-2,..."
3,6603435,Inactive,CCCCCCN1C2=C(CCC2)C(=N)C3=C1CCC3.Br,"2-hexyl-2-azatricyclo[7.3.0.03,7]dodeca-1(9),3..."
4,661810,Inactive,CC(=O)C1=C(C2=C(S1)N=C(C=C2C(F)(F)F)C3=CC=CS3)N,1-[3-amino-6-thiophen-2-yl-4-(trifluoromethyl)...
...,...,...,...,...
124,2405263,Inactive,CNC(=O)COC(=O)C1=CC=CC=C1C(=O)C2=CC(=C(C=C2)Cl...,[2-(methylamino)-2-oxoethyl] 2-(4-chloro-3-nit...
125,2124454,Inactive,CC1=C(C(=NO1)C)COC(=O)C2=CC(=C(N=C2)Cl)Cl,"(3,5-dimethyl-1,2-oxazol-4-yl)methyl 5,6-dichl..."
126,1357612,Inactive,CC1=CC(=CC(=C1)OCC(=O)N2CCN(CC2)C3=CC=CC=C3OC)C,"2-(3,5-dimethylphenoxy)-1-[4-(2-methoxyphenyl)..."
127,6362905,Inactive,C1=CC=C(C=C1)S(=O)(=O)N=C2N(C(=C(S2)C=O)Cl)CCCCl,(NZ)-N-[4-chloro-3-(3-chloropropyl)-5-formyl-1...


Rearranging the columns so that 'activity' is the last one; 'IUPACName' is also renamed to 'cmpdname'.

In [11]:
# Rename 'IUPACName' to 'cmpdname' and put the columns in their final order,
# with 'activity' last.
# rename() ignores labels that are absent and the selection below is an
# explicit column list, so this cell can be re-run safely; the previous
# pop()/insert() sequence mutated final_df in place and raised a KeyError
# on a second run because 'IUPACName' was already gone.

final_df = final_df.rename(columns={'IUPACName': 'cmpdname'})
final_df = final_df[['cid', 'cmpdname', 'CanonicalSMILES', 'activity']]
final_df

Unnamed: 0,cid,cmpdname,CanonicalSMILES,activity
0,647846,1-[2-[(3-cyano-8-methylquinolin-2-yl)amino]eth...,CC1=C2C(=CC=C1)C=C(C(=N2)NCCNC(=O)NC3CCCCC3)C#N,Inactive
1,649054,"butyl 2-[[5-(3,4,5-trimethoxyphenyl)-1,3,4-oxa...",CCCCOC(=O)CSC1=NN=C(O1)C2=CC(=C(C(=C2)OC)OC)OC,Inactive
2,657803,"6-chloro-2-(5-chloro-1-methylimidazol-2-yl)-2,...",CN1C(=CN=C1C2CC(=O)C3=C(O2)C=CC(=C3)Cl)Cl,Inactive
3,6603435,"2-hexyl-2-azatricyclo[7.3.0.03,7]dodeca-1(9),3...",CCCCCCN1C2=C(CCC2)C(=N)C3=C1CCC3.Br,Inactive
4,661810,1-[3-amino-6-thiophen-2-yl-4-(trifluoromethyl)...,CC(=O)C1=C(C2=C(S1)N=C(C=C2C(F)(F)F)C3=CC=CS3)N,Inactive
...,...,...,...,...
124,2405263,[2-(methylamino)-2-oxoethyl] 2-(4-chloro-3-nit...,CNC(=O)COC(=O)C1=CC=CC=C1C(=O)C2=CC(=C(C=C2)Cl...,Inactive
125,2124454,"(3,5-dimethyl-1,2-oxazol-4-yl)methyl 5,6-dichl...",CC1=C(C(=NO1)C)COC(=O)C2=CC(=C(N=C2)Cl)Cl,Inactive
126,1357612,"2-(3,5-dimethylphenoxy)-1-[4-(2-methoxyphenyl)...",CC1=CC(=CC(=C1)OCC(=O)N2CCN(CC2)C3=CC=CC=C3OC)C,Inactive
127,6362905,(NZ)-N-[4-chloro-3-(3-chloropropyl)-5-formyl-1...,C1=CC=C(C=C1)S(=O)(=O)N=C2N(C(=C(S2)C=O)Cl)CCCCl,Inactive


Now the 'activity' column is the last one.

In [12]:
# exporting final_df to a .csv file, leaving out the row index

final_df.to_csv(path_or_buf="alt_inactive_final.csv", index=False)