## Cleaning Alt Actives
The alt-active dataset is cleaned, and the canonical SMILES and IUPAC name of each compound are retrieved from PubChem.

In [1]:
# importing libraries

import pandas as pd
import numpy as np
import pubchempy as pcp
import time

In [2]:
# load the alt-active export from disk and preview the resulting frame

alt_active_csv = "alt_active.csv"
df = pd.read_csv(alt_active_csv)
df

Unnamed: 0,sid,cid,sidsrcname,sidextid,subssynonym,sidmdate,depdate,depcatg,annotation,activity
0,851380,653949,MLSMR,MLS000028188,MLS000028188|4-Chloro-7-ethylamino-chromen-2-o...,20120301,20050604,Governmental Organizations|NIH Initiatives,,Active
1,855880,969516,MLSMR,MLS000069631,curcumin|MLS000069631|SMR000058237,20210728,20050629,Governmental Organizations|NIH Initiatives,,Active
2,856745,658104,MLSMR,MLS000069179,MLS000069179|SMR000039036,20190115,20050629,Governmental Organizations|NIH Initiatives,,Active
3,857055,100641,MLSMR,MLS000080824,MLS000080824|SMR000034073,20120301,20050629,Governmental Organizations|NIH Initiatives,,Active
4,857094,629074,MLSMR,MLS000037792,MLS000037792|SMR000039459,20210728,20050629,Governmental Organizations|NIH Initiatives,,Active
...,...,...,...,...,...,...,...,...,...,...
215,24833322,2463883,MLSMR,MLS000761068,MLS000761068|SMR000372367,20210728,20070705,Governmental Organizations|NIH Initiatives,,Active
216,24834596,3934060,MLSMR,MLS000772776,MLS000772776|SMR000377356,20190115,20070705,Governmental Organizations|NIH Initiatives,,Active
217,24836078,135440580,MLSMR,MLS000760005,MLS000760005|SMR000370107,20210728,20070705,Governmental Organizations|NIH Initiatives,,Active
218,24841413,6451074,MLSMR,MLS000876965,MLS000876965|SMR000440628,20120301,20070705,Governmental Organizations|NIH Initiatives,,Active


In [3]:
# per-column count of missing entries (isna is the same check as isnull)

df.isna().sum()

sid              0
cid              0
sidsrcname       0
sidextid         0
subssynonym      0
sidmdate         0
depdate          0
depcatg          0
annotation     220
activity         0
dtype: int64

In [4]:
# checking all column names
# (bare last expression, so the notebook displays the column Index of df)

df.columns

Index(['sid', 'cid', 'sidsrcname', 'sidextid', 'subssynonym', 'sidmdate',
       'depdate', 'depcatg', 'annotation', 'activity'],
      dtype='object')

In [5]:
# removing 'unnecessary' columns
#
# Only 'cid' and 'activity' are used by the later cells, so select those two
# instead of dropping the others by name. Selecting is idempotent: re-running
# this cell works, whereas a second drop() would raise KeyError because the
# dropped columns are already gone from df.
# .copy() gives df its own data rather than a view of the loaded frame.

df = df[['cid', 'activity']].copy()
df

Unnamed: 0,cid,activity
0,653949,Active
1,969516,Active
2,658104,Active
3,100641,Active
4,629074,Active
...,...,...
215,2463883,Active
216,3934060,Active
217,135440580,Active
218,6451074,Active


In [6]:
# extracting canonical smiles and compound names from pubchem database

cid_list = np.array(df.cid)

start = time.time()

# Hand pubchempy a plain list of Python ints instead of an np.nditer over
# 0-d arrays; the request is the same, the argument is just a simpler type.
smiles_df = pcp.get_properties(['CanonicalSMILES', 'IUPACname'], cid_list.tolist(), 'cid', as_dataframe = True)

end = time.time()

# The result is indexed by CID. The following cells throw that index away and
# paste the rows next to df purely by position, so make the alignment explicit
# here: exactly one row per entry of cid_list, in cid_list order.
#   - duplicated CIDs in the response (if any) are collapsed to the first row
#   - reindex() restores df's order, repeats rows for repeated CIDs in df, and
#     inserts an all-NaN row for any CID PubChem returned nothing for
# When PubChem already answers completely and in request order this is a no-op.
smiles_df = smiles_df[~smiles_df.index.duplicated(keep = 'first')].reindex(cid_list)

# Surface lookups that came back empty instead of letting NaN rows pass silently.
unmatched = int(smiles_df.isna().all(axis = 'columns').sum())
if unmatched:
    print("Warning: no PubChem record returned for", unmatched, "CID(s); their rows are NaN")

print("The time take in seconds for extraction of canonical smiles is :  ", end-start)
smiles_df

The time take in seconds for extraction of canonical smiles is :   1.9265408515930176


Unnamed: 0_level_0,CanonicalSMILES,IUPACName
CID,Unnamed: 1_level_1,Unnamed: 2_level_1
653949,CCNC1=CC2=C(C=C1)C(=CC(=O)O2)Cl,4-chloro-7-(ethylamino)chromen-2-one
969516,COC1=C(C=CC(=C1)C=CC(=O)CC(=O)C=CC2=CC(=C(C=C2...,"(1E,6E)-1,7-bis(4-hydroxy-3-methoxyphenyl)hept..."
658104,COC(=O)C1=CC=C(O1)C2=C(C3=C(S2)N=C(C4=C3CCCC4)...,"methyl 5-(1-amino-5-morpholin-4-yl-6,7,8,9-tet..."
100641,C1=CC2=C(C=C1N)OC(=O)C=C2C(F)(F)F,7-amino-4-(trifluoromethyl)chromen-2-one
629074,C1=CC=C(C=C1)C2=NC3=C(C(=C2)C4=CC=CO4)C(=C(S3)...,"3-amino-4-(furan-2-yl)-6-phenylthieno[2,3-b]py..."
...,...,...
2463883,CC1=CC(=CC=C1)N2C(=O)C3=CC=CC=C3C(=CC4=CC=C(C=...,"4-[(Z)-[2-(3-methylphenyl)-1,3-dioxoisoquinoli..."
3934060,CC(C)CN1C2=C(C(=O)NC1=O)N(C(=N2)C3=CC=C(O3)C4=...,8-[5-(2-bromo-4-nitrophenyl)furan-2-yl]-7-(2-m...
135440580,CC(=NNC1=CC=C(C=C1)[N+](=O)[O-])C2=CC=C(C=C2)O,4-[(E)-C-methyl-N-(4-nitroanilino)carbonimidoy...
6451074,CCC(C)C=C(C)C=CC1=CC2=CC(=O)C3(C(C2=CO1)C(C(=O...,"(6aR,9R,9aR)-9-acetyl-3-[(1E,3E,5S)-3,5-dimeth..."


In [7]:
# removing indexes
# reset_index(drop = True) replaces the CID index with the default 0..n-1
# integer index and discards the CID labels (they are not kept as a column).
# NOTE(review): the later pd.concat([df, smiles_df], axis = 1) pairs rows by
# index label, so after this reset the two frames only line up correctly if
# smiles_df has one row per row of df, in the same order — confirm.

smiles_df = smiles_df.reset_index(drop = True)
smiles_df

Unnamed: 0,CanonicalSMILES,IUPACName
0,CCNC1=CC2=C(C=C1)C(=CC(=O)O2)Cl,4-chloro-7-(ethylamino)chromen-2-one
1,COC1=C(C=CC(=C1)C=CC(=O)CC(=O)C=CC2=CC(=C(C=C2...,"(1E,6E)-1,7-bis(4-hydroxy-3-methoxyphenyl)hept..."
2,COC(=O)C1=CC=C(O1)C2=C(C3=C(S2)N=C(C4=C3CCCC4)...,"methyl 5-(1-amino-5-morpholin-4-yl-6,7,8,9-tet..."
3,C1=CC2=C(C=C1N)OC(=O)C=C2C(F)(F)F,7-amino-4-(trifluoromethyl)chromen-2-one
4,C1=CC=C(C=C1)C2=NC3=C(C(=C2)C4=CC=CO4)C(=C(S3)...,"3-amino-4-(furan-2-yl)-6-phenylthieno[2,3-b]py..."
...,...,...
215,CC1=CC(=CC=C1)N2C(=O)C3=CC=CC=C3C(=CC4=CC=C(C=...,"4-[(Z)-[2-(3-methylphenyl)-1,3-dioxoisoquinoli..."
216,CC(C)CN1C2=C(C(=O)NC1=O)N(C(=N2)C3=CC=C(O3)C4=...,8-[5-(2-bromo-4-nitrophenyl)furan-2-yl]-7-(2-m...
217,CC(=NNC1=CC=C(C=C1)[N+](=O)[O-])C2=CC=C(C=C2)O,4-[(E)-C-methyl-N-(4-nitroanilino)carbonimidoy...
218,CCC(C)C=C(C)C=CC1=CC2=CC(=O)C3(C(C2=CO1)C(C(=O...,"(6aR,9R,9aR)-9-acetyl-3-[(1E,3E,5S)-3,5-dimeth..."


In [8]:
# write only the SMILES column to its own csv file, without the row index

canonical_smiles = smiles_df['CanonicalSMILES']
canonical_smiles.to_csv("alt_active_smiles.csv", index = False)

In [9]:
# final dataframe with all necessary features

# pd.concat(axis = 1) matches rows by index label. If the two frames do not
# share exactly the same index (e.g. PubChem returned fewer records than there
# are rows in df), the result is silently padded with NaN instead of failing.
# Check up front so a mismatch is caught here rather than in the saved csv.
assert df.index.equals(smiles_df.index), (
    "df and smiles_df must have identical row indexes before the column-wise concat"
)

final_df = pd.concat([df, smiles_df], axis = 1)
final_df

Unnamed: 0,cid,activity,CanonicalSMILES,IUPACName
0,653949,Active,CCNC1=CC2=C(C=C1)C(=CC(=O)O2)Cl,4-chloro-7-(ethylamino)chromen-2-one
1,969516,Active,COC1=C(C=CC(=C1)C=CC(=O)CC(=O)C=CC2=CC(=C(C=C2...,"(1E,6E)-1,7-bis(4-hydroxy-3-methoxyphenyl)hept..."
2,658104,Active,COC(=O)C1=CC=C(O1)C2=C(C3=C(S2)N=C(C4=C3CCCC4)...,"methyl 5-(1-amino-5-morpholin-4-yl-6,7,8,9-tet..."
3,100641,Active,C1=CC2=C(C=C1N)OC(=O)C=C2C(F)(F)F,7-amino-4-(trifluoromethyl)chromen-2-one
4,629074,Active,C1=CC=C(C=C1)C2=NC3=C(C(=C2)C4=CC=CO4)C(=C(S3)...,"3-amino-4-(furan-2-yl)-6-phenylthieno[2,3-b]py..."
...,...,...,...,...
215,2463883,Active,CC1=CC(=CC=C1)N2C(=O)C3=CC=CC=C3C(=CC4=CC=C(C=...,"4-[(Z)-[2-(3-methylphenyl)-1,3-dioxoisoquinoli..."
216,3934060,Active,CC(C)CN1C2=C(C(=O)NC1=O)N(C(=N2)C3=CC=C(O3)C4=...,8-[5-(2-bromo-4-nitrophenyl)furan-2-yl]-7-(2-m...
217,135440580,Active,CC(=NNC1=CC=C(C=C1)[N+](=O)[O-])C2=CC=C(C=C2)O,4-[(E)-C-methyl-N-(4-nitroanilino)carbonimidoy...
218,6451074,Active,CCC(C)C=C(C)C=CC1=CC2=CC(=O)C3(C(C2=CO1)C(C(=O...,"(6aR,9R,9aR)-9-acetyl-3-[(1E,3E,5S)-3,5-dimeth..."


Rearranging the features so that 'activity' is the last column

In [11]:
# Rename the IUPAC name column to 'cmpdname' and state the final column order
# explicitly, with 'activity' last.
#
# This replaces in-place pop()/insert() calls at hard-coded positions: those
# mutate final_df, so running the cell a second time raised KeyError because
# 'IUPACName' had already been popped. rename() ignores a key that is no
# longer present and the column selection is repeatable, so this cell can be
# re-run safely and yields the same frame each time.
final_df = final_df.rename(columns = {'IUPACName': 'cmpdname'})
final_df = final_df[['cid', 'cmpdname', 'CanonicalSMILES', 'activity']]
final_df

Unnamed: 0,cid,cmpdname,CanonicalSMILES,activity
0,653949,4-chloro-7-(ethylamino)chromen-2-one,CCNC1=CC2=C(C=C1)C(=CC(=O)O2)Cl,Active
1,969516,"(1E,6E)-1,7-bis(4-hydroxy-3-methoxyphenyl)hept...",COC1=C(C=CC(=C1)C=CC(=O)CC(=O)C=CC2=CC(=C(C=C2...,Active
2,658104,"methyl 5-(1-amino-5-morpholin-4-yl-6,7,8,9-tet...",COC(=O)C1=CC=C(O1)C2=C(C3=C(S2)N=C(C4=C3CCCC4)...,Active
3,100641,7-amino-4-(trifluoromethyl)chromen-2-one,C1=CC2=C(C=C1N)OC(=O)C=C2C(F)(F)F,Active
4,629074,"3-amino-4-(furan-2-yl)-6-phenylthieno[2,3-b]py...",C1=CC=C(C=C1)C2=NC3=C(C(=C2)C4=CC=CO4)C(=C(S3)...,Active
...,...,...,...,...
215,2463883,"4-[(Z)-[2-(3-methylphenyl)-1,3-dioxoisoquinoli...",CC1=CC(=CC=C1)N2C(=O)C3=CC=CC=C3C(=CC4=CC=C(C=...,Active
216,3934060,8-[5-(2-bromo-4-nitrophenyl)furan-2-yl]-7-(2-m...,CC(C)CN1C2=C(C(=O)NC1=O)N(C(=N2)C3=CC=C(O3)C4=...,Active
217,135440580,4-[(E)-C-methyl-N-(4-nitroanilino)carbonimidoy...,CC(=NNC1=CC=C(C=C1)[N+](=O)[O-])C2=CC=C(C=C2)O,Active
218,6451074,"(6aR,9R,9aR)-9-acetyl-3-[(1E,3E,5S)-3,5-dimeth...",CCC(C)C=C(C)C=CC1=CC2=CC(=O)C3(C(C2=CO1)C(C(=O...,Active


Now the activity column is the last one

In [12]:
# persist the assembled table as csv; the row index is not written

final_csv = "alt_active_final.csv"
final_df.to_csv(final_csv, index = False)