This is the third part of the iSynthesis tutorial.

# Collect building blocks from the [ZINC](https://zinc12.docking.org/browse/catalogs/building-blocks) database


Your project should have this structure:

    project
    __tutorial // you are here
    __data
    ____rules
    ____zinc
    ____standardized_zinc


In [1]:
from bs4 import BeautifulSoup
from urllib.error import HTTPError
from pickle import dump, load
import requests
import wget

## Parse webpage and get archives of data

In [2]:
# Catalog index page that lists every ZINC12 building-block vendor.
URL = 'https://zinc12.docking.org/browse/catalogs/building-blocks'

# A timeout keeps the cell from hanging forever if the server stops responding.
page = requests.get(URL, timeout=60)

print(page.status_code)
# Stop here on an HTTP error instead of parsing an error page into an empty list.
page.raise_for_status()

soup = BeautifulSoup(page.text, "html.parser")

# Rows of interest are <tr> elements with the CSS class "catalog".
# find_all is the current name of the deprecated findAll alias.
raw_data = soup.find_all('tr', 'catalog')

# One tuple per row: (title of the link in column index 1,
#                     href of the first link in column index 9,
#                     href of the second link in column index 9).
data = []
for line in raw_data:
    lines = line.find_all('td')
    data.append((lines[1].a.get('title'),
                 lines[9].a.get('href'),
                 lines[9].find_all('a')[1].get('href')))

200


In [3]:
# Download the archive behind the first link of every catalog entry.
# An entry whose download raises HTTPError is reported and skipped.
for entry in data:
    title, link = entry[0], entry[1]
    print(title)
    try:
        wget.download(f'https:{link}', out=f'../data/zinc/{title}.gz')
    except HTTPError as err:
        print(f'https:{link}', err)

ZINC Catalog page for ACB Blocks
ZINC Catalog page for ACT Chemical BB
ZINC Catalog page for Alinda Building Blocks
ZINC Catalog page for AnalytiCon Discovery NP BB
ZINC Catalog page for Angene Building Blocks
ZINC Catalog page for Anward
ZINC Catalog page for Apexmol Building Blocks
ZINC Catalog page for Ark Pharm Building Blocks
ZINC Catalog page for Aromalake BB
ZINC Catalog page for Aromsyn
https://zinc12.docking.org/db/byvendor/aromsynbb/aromsynbb_p0.smi.gz HTTP Error 403: Forbidden
ZINC Catalog page for Aronis BuildingBlocks
ZINC Catalog page for Asinex Building Blocks
https://zinc12.docking.org/db/byvendor/asinbb/asinbb_p0.smi.gz HTTP Error 403: Forbidden
ZINC Catalog page for AsisChem Building Blocks
ZINC Catalog page for AZEPINE
ZINC Catalog page for Beijing Acemol Technology
ZINC Catalog page for BePharm Building Blocks
ZINC Catalog page for Bide Pharmatech BB
ZINC Catalog page for BioBlocks BB
ZINC Catalog page for BOC Sciences BB
ZINC Catalog page for Boerchem Pharmatech BB

## Extract archives and prepare data

##### Rename files, remove redundant information from filenames:

In [4]:
# Remove spaces and the "ZINCCatalogpagefor" text from the archive names.
# The subshell keeps both loops behind `cd`, so nothing is renamed in the current directory if `cd` fails.
!cd ../data/zinc && (for d in *\ *; do mv "$d" "${d// /}"; done; for d in *; do mv "$d" "${d//ZINCCatalogpagefor/}"; done)

##### Extract SMILES from downloaded archives:

In [5]:
# Decompress only the .gz archives, so files that are already extracted are left alone on a re-run.
# The name is quoted so the shell passes it to gzip as a single argument.
!cd ../data/zinc && for i in *.gz ; do gzip -d "$i" ; done

## Read from SMILES and standardize molecules

In [6]:
import os
from CGRtools import SMILESRead
from collections import defaultdict
from pickle import load, dump

In [7]:
# Number of worker processes started by the parallel standardization step.
N_CPU = 16

# Directory with the extracted vendor SMILES files.
RAW_ZINC = '../data/zinc'
# Directory for the per-vendor standardized .smi files (also the stem of the pickle file name).
STANDARDIZED_ZINC_BY_NAMES = '../data/standardized_zinc'
# Single file holding the unique structures of all vendors; it lives in the directory above.
STANDARDIZED_ALL = f'{STANDARDIZED_ZINC_BY_NAMES}/zinc.smi'

In [8]:
# os.listdir already returns a list, so it is stored directly; show the first five names.
files = os.listdir(RAW_ZINC)
files[:5]

['MicroCombiChemBB',
 'Vitas-MBB',
 'SigmaAldrich(BuildingBlocks)',
 'SynQuestBuildingBlocks',
 'ChemDivBuildingBlocks']

#### Standardize structures in parallel

In [9]:
from multiprocessing.managers import BaseManager, DictProxy 
from multiprocessing import Queue, Process, Manager

class MyManager(BaseManager):
    """BaseManager subclass that carries this notebook's own type registrations."""


# Make collections.defaultdict creatable through the manager; clients receive a DictProxy.
MyManager.register(typeid='defaultdict', callable=defaultdict, proxytype=DictProxy)

In [10]:
def worker(i, d):
    """Standardize the files whose names arrive on a queue.

    Names are taken from ``i`` until the ``'STOP'`` sentinel is received. For each
    name the file ``{RAW_ZINC}/{name}`` is read with ``SMILESRead``; every molecule
    is standardized and canonicalized, and its string form is written, one per line
    with duplicates kept, to ``{STANDARDIZED_ZINC_BY_NAMES}/{name}.smi``. The set of
    unique strings is stored in ``d[name]``.

    Molecules that raise during standardization, canonicalization or string
    conversion are skipped; their count is printed when it is non-zero.

    :param i: queue of file names, terminated by ``'STOP'``.
    :param d: mapping that receives ``name -> set of strings``.
    """
    n = 0
    for name in iter(i.get, 'STOP'):
        n += 1
        # The context manager closes the input file once it has been read.
        with SMILESRead(f'{RAW_ZINC}/{name}', ignore_stereo=True) as reader:
            mols = reader.read()

        unique = set()
        failed = 0
        with open(f'{STANDARDIZED_ZINC_BY_NAMES}/{name}.smi', 'w') as f:
            for molecule in mols:
                try:
                    molecule.standardize()
                    molecule.canonicalize()
                    smi = str(molecule)  # converted once, reused for the set and the file
                except Exception:
                    # Best effort: skip the molecule, but keep count so losses are visible.
                    failed += 1
                    continue
                unique.add(smi)
                f.write(f'{smi}\n')
        d[name] = unique

        # Use the local set for the count instead of reading it back through ``d``.
        print(f'{name} - before: ', len(mols), '- after: ', len(unique))
        if failed:
            print(f'{name} - skipped (processing failed): ', failed)

    print(n, 'PROCESS DONE')

if __name__ == "__main__":
    # Workers open their output files inside this directory, so make sure it exists.
    os.makedirs(STANDARDIZED_ZINC_BY_NAMES, exist_ok=True)

    with MyManager() as manager:
        # Shared mapping filled by the workers: file name -> set of standardized strings.
        data = manager.defaultdict(set)
        q = Queue()
        ps = []
        for name in os.listdir(RAW_ZINC):
            q.put(name)

        # One 'STOP' sentinel per worker so that every process leaves its loop.
        for _ in range(N_CPU):
            q.put('STOP')
        for _ in range(N_CPU):
            p = Process(target=worker, args=(q, data))
            p.start()
            ps.append(p)
        for p in ps:
            p.join()

        # A non-zero exit code means a worker stopped early; its results may be missing below.
        crashed = [p.pid for p in ps if p.exitcode != 0]
        if crashed:
            print('WARNING: worker processes exited abnormally:', crashed)

        # dump standardized building blocks with vendor name
        with open(f'{STANDARDIZED_ZINC_BY_NAMES}.pickle', 'wb') as f:
            dump(dict(data), f)


ProVenceBuildingBlocks - before:  484 - after:  473
MicroCombiChemBB - before:  987 - after:  772
BeijingAcemolTechnology - before:  1394 - after:  1260
AronisBuildingBlocks - before:  2083 - after:  1640
PKChemBuildingBlocks - before:  2918 - after:  2518
FocusSynthesisBB - before:  860 - after:  792
ACBBlocks - before:  4397 - after:  3286
StruChemBB - before:  6402 - after:  5258
ChemikBuildingBlocks - before:  3630 - after:  3511
AnalytiConDiscoveryNPBB - before:  213 - after:  206
SynergyScientificBB - before:  7616 - after:  7044
ChemicalBlockBB - before:  6884 - after:  5755
BioBlocksBB - before:  6751 - after:  5686
KeyOrganicsBuildingBlocks - before:  17281 - after:  15437
AngeneBuildingBlocks - before:  22812 - after:  20816
ChemDivBuildingBlocks - before:  30316 - after:  25419
ChemBridgeBuildingBlocks - before:  17986 - after:  14879
ApexmolBuildingBlocks - before:  35612 - after:  27764
AlindaBuildingBlocks - before:  32397 - after:  26967
ACTChemicalBB - before:  2807 - a

In [11]:
# Reload the per-vendor sets from the pickle file. The `with` block closes the
# file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(f'{STANDARDIZED_ZINC_BY_NAMES}.pickle', 'rb') as f:
    data = load(f)
data

{'ProVenceBuildingBlocks': {'Brc1cnc2c(c(cn2CCCC)C(=O)N)c1',
  'Brc1cnc2c(c1)c(C#N)cn2CCCC',
  'Brc1cnc2cc(n(c2c1)C)C(=O)OC',
  'C(#N)c1cn(c2c1cc(C#N)cn2)CCCC',
  'C(=O)(OCC)c1cnc2c([nH]c(C(=O)[O-])c2)c1',
  'C(=O)(OCC)c1cnc2c([nH]c(C(OC)=O)c2)c1',
  'C(=O)(OCC)c1cnc2c([nH]c(c2)C(=O)N(OC)C)c1',
  'C(=O)(OCC)c1cnc2c([nH]c(c2)C(=O)N)c1',
  'C(=O)(OCC)c1cnc2c([nH]c(c2)C(C)=O)c1',
  'C(=O)(OCC)c1cnc2c([nH]c(c2)C=NO)c1',
  'C(C)CCn1c2c(c(C=O)c1)c(cc(n2)C)C',
  'C(C)CCn1c2c(c(cc(n2)C)C)c(C#N)c1',
  'C(C)[NH+](CC)CCn1cc(C#N)c2c1ncc(C=O)c2',
  'C(C)n1c2c(c(cc(n2)C)C)c(C#N)c1',
  'C(CC)c1c2C(=O)CCCc2nc3c1c(C#N)cn3CC([O-])=O',
  'C(Cn1c2c(c(C(=O)N)c1)cc(C(=O)N)cn2)=C',
  'C(Cn1c2c(cc(C#N)cn2)c(C#N)c1)=C',
  'C(c1c2C(=O)CCCc2nc3c1[nH]c(c3)C(=O)NC)CC',
  'C(c1c2C(=O)CCCc2nc3c1[nH]c(c3)C(=O)OC)CC',
  'C(c1c2C(=O)CCc2nc3c1[nH]c(c3)C(=O)OC)CC',
  'C(c1c2n(C)c(cc2nc3CCCC(=O)c13)C(=O)OC)CC',
  'C(c1c2n(c(cc2nc3CCCC(=O)c13)C(=O)N)C)CC',
  'C1CC(CCC1)n2c3c(c(C)c4C(=O)CCCc4n3)c(C#N)c2',
  'C1CC(CCC1)n2c3c

#### Create a .smi file with all unique building blocks

In [12]:
# Merge the sets of all entries in `data` into one set of unique strings.
all_zinc_unique = set().union(*data.values())
# Sets iterate in an arbitrary, run-dependent order; sorting makes the written file reproducible.
with open(STANDARDIZED_ALL, 'w') as f:
    f.writelines(f'{smi}\n' for smi in sorted(all_zinc_unique))

In [13]:
# Number of unique strings collected from all entries of `data`.
len(all_zinc_unique)

604785