In [141]:
import chemfp
from chemfp import search
import bclus
import pandas as pd
import time
import sys

In [84]:
# Import SMILES into a dataframe

smi = pd.read_csv('../data/z310k.smi', delim_whitespace = True, names = ['smiles','id'], header = None)

In [149]:
# Generate RDKit FPs

import subprocess as sp

start = time.time()

sp.call(['rdkit2fps', '../data/z310k.smi', '-o', 'z310k.fps'])

end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

00:09:43


In [150]:
# Load the FPs into an arena

try:
    arena = chemfp.load_fingerprints('./z310k.fps')
except IOError as err:
    sys.stderr.write("Cannot open fingerprint file: %s" % (err,))
    raise SystemExit(2)

In [152]:
# Generate the similarity table
start = time.time()

similarity_table = search.threshold_tanimoto_search_symmetric(arena, threshold = 0.8)

end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

00:01:26


In [153]:
# Cluster the data

clus_res = bclus.taylor_butina_cluster(similarity_table)

In [230]:
# Generate dataframe with results

cldf = pd.DataFrame(columns = ['clid','id','centryn','smiles'])

In [None]:
# Method by filling previously created NaNs dataframe

start = time.time()

i = 0

cldf2 = pd.DataFrame(index = range(len(smi)), columns = ['clid','id','centryn','smiles'])

for clid in range(len(clus_res.clusters)):
    cent_id = clus_res.clusters[clid][0] 
    cldf2.loc[i] = [clid+1,cent_id,1,'cc']
    i = i+1
    mems = list(clus_res.clusters[clid][1])
    for mem in mems:
        cldf2.loc[i] = [clid+1,mem,0,'c1c1']
        i = i+1
        
end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

In [231]:
# Method by concats

start = time.time()

i = 0

for clid in range(len(clus_res.clusters)):
    cent_id = clus_res.clusters[clid][0] 
    pd.DataFrame([[clid+1, mem, 0, 'cc1']], columns = ['clid','id','centryn','smiles'])
    i = i+1
    mems = list(clus_res.clusters[clid][1])
    for mem in mems:
        line = pd.DataFrame([[clid+1, mem, 0, 'cc1']], columns = ['clid','id','centryn','smiles'])
        cldf = pd.concat([cldf, line])
        i = i+1
        
end = time.time()

elapsed_time = end - start
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

01:14:12


In [224]:
pd.DataFrame([[clid+1, mem, 0, 'cc1']], columns = ['clid','id','centryn','smiles'])

Unnamed: 0,clid,id,centryn,smiles
0,3,147454,0,cc1


In [240]:
clid = 0
cent_id = clus_res.clusters[clid][0] 
cent_id
cldf2.loc[0] = [clid+1,cent_id,1,'cc']

In [241]:
cldf2.head()

Unnamed: 0,clid,id,centryn,smiles
0,1.0,176706.0,1.0,cc
1,,,,
2,,,,
3,,,,
4,,,,


In [248]:
smi[smi['id']==cent_id]['smiles']

176705    Cn1c(=O)n(C)c2cc(C=NO)ccc21
Name: smiles, dtype: object