In [1]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import csv

DATA = os.path.abspath("../data")
INPUT_PATH = os.path.join(DATA, "generated")

def read_smiles(file_name):
    with open(file_name, "r") as f:
        reader = csv.reader(f)
        smiles = []
        for r in reader:
            smiles += [r[0]]
    return smiles

smiles = read_smiles(os.path.join(INPUT_PATH, "known_hits.csv"))

In [15]:
import collections
import urllib
import json

from smallworld_api import SmallWorld

import warnings
warnings.filterwarnings("ignore")


def get_available_maps():
    url = "https://sw.docking.org/search/maps"
    response = urllib.request.urlopen(url)
    data = json.loads(response.read())
    return data


def get_maps():
    data = get_available_maps()
    labels = ["REAL", "WuXi", "MCule", "Zinc"]
    found_maps = collections.defaultdict(list)
    for k,v in data.items():
        for l in labels:
            l_ = l.lower()
            k_ = k.lower()
            if l_ in k_:
                found_maps[l] += [k]
    found_maps_ = {}
    for k,v in found_maps.items():
        if len(v) == 1:
            v_ = v[0]
            if data[v_]["enabled"] and data[v_]["status"] == "Available":
                found_maps_[k] = v_
        else:
            v_sel = None
            w_sel = None
            for v_ in v:
                if not data[v_]["enabled"] or data[v_]["status"] != "Available":
                    continue
                w_ = data[v_]["numEntries"]
                if v_sel is None:
                    v_sel = v_
                    w_sel = w_
                else:
                    if w_ > w_sel:
                        v_sel = v_
                        w_sel = w_
            if v_sel is not None:
                found_maps_[k] = v_sel
    result = []
    for l in labels:
        if l in found_maps_:
            result += [(l, found_maps_[l])]
    return result

In [18]:
from tqdm import tqdm
import time


class SmallWorldSampler(object):
    
    def __init__(self, dist=10):
        self.maps = get_maps()
        self.sw = SmallWorld()
        self.dist = dist
        
    def _sample(self, smiles, time_budget_sec):
        t0 = time.time()
        sampled_smiles = []
        for m in self.maps:
            try:
                db_name = m[1]
                results : pd.DataFrame = sw.search(smiles, dist=self.dist, db=db_name)
            except:
                results = None
            if results is not None:
                sampled_smiles += list(results["smiles"])
            t1 = time.time()
            if (t1-t0) > time_budget_sec:
                break
            t0 = time.time() 
        return sampled_smiles
            
    def sample(self, smiles_list):
        sampled_smiles = []
        for smi in tqdm(smiles_list):
            sampled_smiles += self._sample(smi, 1)
        return sampled_smiles

    
smp = SmallWorldSampler()
smp.sample(smiles)

100%|██████████████████████████████████████████████████████████████████████████████████████| 16/16 [02:03<00:00,  7.72s/it]


['C=C1CCC(C(=O)NC2=CC=C(Br)C=N2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(Br)N=C2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(Br)N=N2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(C)C=C2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(C)C=N2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(C)N=C2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(C)N=N2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(Cl)C=N2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(Cl)N=C2)CC1',
 'C=C1CCC(C(=O)NC2=CC=C(Cl)N=N2)CC1',
 'C#CC1=CC=C(NC(=O)C2=CC=C(Br)C=C2)N=C1C',
 'C#CC1=CC=C(NC(=O)C2=CC=C(Br)C=C2)N=C1F',
 'C#CC1=CC=C(NC(=O)C2=CC=C(Br)C=N2)N=C1C',
 'C#CC1=CC=C(NC(=O)C2=CC=C(Br)C=N2)N=C1F',
 'C#CC1=CC=C(NC(=O)C2=CC=C(C)C=C2)C=C1Cl',
 'C#CC1=CC=C(NC(=O)C2=CC=C(C)C=C2)N=C1C',
 'C#CC1=CC=C(NC(=O)C2=CC=C(C)C=C2)N=C1F',
 'C#CC1=CC=C(NC(=O)C2=CC=C(C)C=N2)C=C1Cl',
 'C#CC1=CC=C(NC(=O)C2=CC=C(C)C=N2)N=C1C',
 'C#CC1=CC=C(NC(=O)C2=CC=C(C)C=N2)N=C1F',
 'C=C1CCN(C(=O)NC2=CC=CC=C2N3CCC4=CC=CC=C4C3)CC1',
 'CC1=CC=C(C(=O)NC2=CC=CC=C2N3CCC4=CC=CC=C4C3)C=C1',
 'CC1=CC=C(C(=O)NC2=CC=CC=C2N3CCC4=CC=CC=C4C3)C=N1',
 'CC1=CC=C(C(=O)NC2=CC=CC=C2N3CCC4=CC=CC=

In [16]:
sw = SmallWorld()
sw.search("O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1", dist=10, db="WuXi-20Q4.smi.anon")

Unnamed: 0,id,hitSmiles,qrySmiles,qryMappedSmiles,hitMappedSmiles,atomMap,atomScore,anonIdx,mf,mw,...,rup,ldn,lup,mut,maj,min,hyb,sub,name,smiles
0,WXVL_BT2125LQ1899,CN(C)C(=O)CC1=CC=C(C2=C3C=CC=CN3N=C2)C=C1 WXVL...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CN(C)C(=O)CC1=CC=C(C2=C3C=CC=CN3N=C2)C=C1,[],[],B23R3.48752,C<sub>17</sub>H<sub>17</sub>N<sub>3</sub>O,279.337,...,0,0,0,,,,,,WXVL_BT2125LQ1899,CN(C)C(=O)CC1=CC=C(C2=C3C=CC=CN3N=C2)C=C1
1,WXVL_BT1640LQ1899,CN(C)C(=O)CC1=CC=C(C2=C3N=CC=CN3N=C2)C=C1 WXVL...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CN(C)C(=O)CC1=CC=C(C2=C3N=CC=CN3N=C2)C=C1,[],[],B23R3.48752,C<sub>16</sub>H<sub>16</sub>N<sub>4</sub>O,280.325,...,0,0,0,,,,,,WXVL_BT1640LQ1899,CN(C)C(=O)CC1=CC=C(C2=C3N=CC=CN3N=C2)C=C1
2,WXVL_BT2046LQ1899,CN(C)C(=O)CC1=CC=C(C2=CNC3=C2C=CC=N3)C=C1 WXVL...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CN(C)C(=O)CC1=CC=C(C2=CNC3=C2C=CC=N3)C=C1,[],[],B23R3.48752,C<sub>17</sub>H<sub>17</sub>N<sub>3</sub>O,279.337,...,0,0,0,,,,,,WXVL_BT2046LQ1899,CN(C)C(=O)CC1=CC=C(C2=CNC3=C2C=CC=N3)C=C1
3,WXVL_BT1935LQ1899,CN(C)C(=O)CC1=CC=C(C2=COC3=C2C=CC=C3)C=C1 WXVL...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CN(C)C(=O)CC1=CC=C(C2=COC3=C2C=CC=C3)C=C1,[],[],B23R3.48752,C<sub>18</sub>H<sub>17</sub>NO<sub>2</sub>,279.334,...,0,0,0,,,,,,WXVL_BT1935LQ1899,CN(C)C(=O)CC1=CC=C(C2=COC3=C2C=CC=C3)C=C1
4,WXVL_BT1730LQ1899,CN(C)C(=O)CC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1 WXVL...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CN(C)C(=O)CC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,[],[],B23R3.48752,C<sub>18</sub>H<sub>17</sub>NOS,295.4,...,0,0,0,,,,,,WXVL_BT1730LQ1899,CN(C)C(=O)CC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1
5,WXVL_BT2125LQ3145,CNC(=O)CC1=CC=C(C2=C3C=CC=CN3N=C2)C=C1 WXVL_BT...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CNC(=O)CC1=CC=C(C2=C3C=CC=CN3N=C2)C=C1,[],[],B22R3.7339800,C<sub>16</sub>H<sub>15</sub>N<sub>3</sub>O,265.31,...,0,0,0,,,,,,WXVL_BT2125LQ3145,CNC(=O)CC1=CC=C(C2=C3C=CC=CN3N=C2)C=C1
6,WXVL_BT1640LQ3145,CNC(=O)CC1=CC=C(C2=C3N=CC=CN3N=C2)C=C1 WXVL_BT...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CNC(=O)CC1=CC=C(C2=C3N=CC=CN3N=C2)C=C1,[],[],B22R3.7339800,C<sub>15</sub>H<sub>14</sub>N<sub>4</sub>O,266.298,...,0,0,0,,,,,,WXVL_BT1640LQ3145,CNC(=O)CC1=CC=C(C2=C3N=CC=CN3N=C2)C=C1
7,WXVL_BT2046LQ3145,CNC(=O)CC1=CC=C(C2=CNC3=C2C=CC=N3)C=C1 WXVL_BT...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CNC(=O)CC1=CC=C(C2=CNC3=C2C=CC=N3)C=C1,[],[],B22R3.7339800,C<sub>16</sub>H<sub>15</sub>N<sub>3</sub>O,265.31,...,0,0,0,,,,,,WXVL_BT2046LQ3145,CNC(=O)CC1=CC=C(C2=CNC3=C2C=CC=N3)C=C1
8,WXVL_BT1935LQ3145,CNC(=O)CC1=CC=C(C2=COC3=C2C=CC=C3)C=C1 WXVL_BT...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CNC(=O)CC1=CC=C(C2=COC3=C2C=CC=C3)C=C1,[],[],B22R3.7339800,C<sub>17</sub>H<sub>15</sub>NO<sub>2</sub>,265.307,...,0,0,0,,,,,,WXVL_BT1935LQ3145,CNC(=O)CC1=CC=C(C2=COC3=C2C=CC=C3)C=C1
9,WXVL_BT1730LQ3145,CNC(=O)CC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1 WXVL_BT...,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,O=C(C(C)C)NC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,CNC(=O)CC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1,[],[],B22R3.7339800,C<sub>17</sub>H<sub>15</sub>NOS,281.374,...,0,0,0,,,,,,WXVL_BT1730LQ3145,CNC(=O)CC1=CC=C(C2=CSC3=C2C=CC=C3)C=C1


In [None]:
from rdkit import Chem
from rdkit.Chem import PandasTools
import pandas as pd  # for typehinting below

from smallworld_api import SmallWorld

sw = SmallWorld()
results : pd.DataFrame = sw.search(smiles, dist=5, db=map_name)

In [None]:
results