In [141]:
import chemfp
from chemfp import search
import bclus
import pandas as pd
import time
import sys

In [84]:
# Import SMILES into a dataframe

smi = pd.read_csv('../data/z310k.smi', delim_whitespace = True, names = ['smiles','id'], header = None)

In [149]:
# Generate RDKit FPs

import subprocess as sp

start = time.time()

sp.call(['rdkit2fps', '../data/z310k.smi', '-o', 'z310k.fps'])

end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

00:09:43


In [150]:
# Load the FPs into an arena

try:
    arena = chemfp.load_fingerprints('./z310k.fps')
except IOError as err:
    sys.stderr.write("Cannot open fingerprint file: %s" % (err,))
    raise SystemExit(2)

In [152]:
# Generate the similarity table
start = time.time()

similarity_table = search.threshold_tanimoto_search_symmetric(arena, threshold = 0.8)

end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

00:01:26


In [153]:
# Cluster the data

clus_res = bclus.taylor_butina_cluster(similarity_table)

In [230]:
# Generate dataframe with results

cldf = pd.DataFrame(columns = ['clid','id','centryn','smiles'])

In [250]:
# Method by filling previously created NaNs dataframe

start = time.time()

i = 0

cldf2 = pd.DataFrame(index = range(len(smi)), columns = ['clid','id','centryn','smiles'])

for clid in range(len(clus_res.clusters)):
    cent_id = clus_res.clusters[clid][0] 
    cldf2.loc[i] = [clid+1,cent_id,1,smi[smi['id']==cent_id]['smiles']]
    i = i+1
    mems = list(clus_res.clusters[clid][1])
    for mem in mems:
        cldf2.loc[i] = [clid+1,mem,0,smi[smi['id']==mem]['smiles']]
        i = i+1
        
end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

01:28:45


In [231]:
# Method by concats

start = time.time()

i = 0

for clid in range(len(clus_res.clusters)):
    cent_id = clus_res.clusters[clid][0] 
    pd.DataFrame([[clid+1, mem, 0, 'cc1']], columns = ['clid','id','centryn','smiles'])
    i = i+1
    mems = list(clus_res.clusters[clid][1])
    for mem in mems:
        line = pd.DataFrame([[clid+1, mem, 0, 'cc1']], columns = ['clid','id','centryn','smiles'])
        cldf = pd.concat([cldf, line])
        i = i+1
        
end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

01:14:12


In [224]:
pd.DataFrame([[clid+1, mem, 0, 'cc1']], columns = ['clid','id','centryn','smiles'])

Unnamed: 0,clid,id,centryn,smiles
0,3,147454,0,cc1


In [240]:
clid = 0
cent_id = clus_res.clusters[clid][0] 
cent_id
cldf2.loc[0] = [clid+1,cent_id,1,'cc']

In [241]:
cldf2.head()

Unnamed: 0,clid,id,centryn,smiles
0,1.0,176706.0,1.0,cc
1,,,,
2,,,,
3,,,,
4,,,,


In [248]:
smi[smi['id']==cent_id]['smiles']

176705    Cn1c(=O)n(C)c2cc(C=NO)ccc21
Name: smiles, dtype: object

In [252]:
cldf2.head()

Unnamed: 0,clid,id,centryn,smiles
0,1,176706,1,cc
1,1,167078,0,c1c1
2,1,187051,0,c1c1
3,1,239958,0,c1c1
4,1,213002,0,c1c1


In [259]:
smi_lst = [smi[smi['id']==x]['smiles'] for x in [cldf2['id']]]

In [264]:
cldf2['smiles']=[smi[smi['id']==x]['smiles'] for x in cldf2['id']]

In [265]:
cldf2.head()

Unnamed: 0,clid,id,centryn,smiles
0,1,176706,1,176705 Cn1c(=O)n(C)c2cc(C=NO)ccc21 Name: sm...
1,1,167078,0,167077 COCCN(CC(=O)O)C(=O)CC1CCCC1NC(=O)OCC...
2,1,187051,0,187050 CCOc1cc(C=C(Sc2nnc(-c3cc(OC)cc(OC)c3...
3,1,239958,0,239957 Cc1cc(S(=O)(=O)N2CCCCC2)ccc1OCC(=O)N...
4,1,213002,0,213001 C=CCNC(=O)CSc1nnc(-c2ccccc2F)n1C1CCC...


In [267]:
smi['smiles'].iloc[176705]

'Cn1c(=O)n(C)c2cc(C=NO)ccc21'

In [270]:
smi[smi['id']==176705][['smiles']]

Unnamed: 0,smiles
176704,O=C(Cn1cc(OCc2ccccc2F)c(=O)cc1CO)N1CCCC1


In [271]:
start = time.time()

[smi[smi['id']==x][['smiles']] for x in cldf2['id']]

end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

00:08:31


In [274]:
start = time.time()

[smi['smiles'].iloc[x] for x in cldf2['id']]

end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

TypeError: cannot do positional indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [nan] of <type 'float'>

In [275]:
cldf2['id']

0         176706
1         167078
2         187051
3         239958
4         213002
5         193197
6         172049
7         215058
8         184340
9         213016
10        181822
11        188442
12        192175
13        167965
14        241694
15        187948
16        198041
17        192548
18        198697
19        192554
20        181255
21        213036
22        200751
23        194608
24        184372
25        172085
26        192567
27        172096
28        213344
29        235589
           ...  
309970       NaN
309971       NaN
309972       NaN
309973       NaN
309974       NaN
309975       NaN
309976       NaN
309977       NaN
309978       NaN
309979       NaN
309980       NaN
309981       NaN
309982       NaN
309983       NaN
309984       NaN
309985       NaN
309986       NaN
309987       NaN
309988       NaN
309989       NaN
309990       NaN
309991       NaN
309992       NaN
309993       NaN
309994       NaN
309995       NaN
309996       NaN
309997       N