In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
import sys
sys.path.append('../DLEPS')
# import molecule_vae
from rdkit.Chem import MolFromSmiles, MolToSmiles
from rdkit.Chem import Draw

import numpy as np  
import pandas as pd

In [4]:
# Load ZINC world dataset
df_zinc_world =  pd.read_csv("../../data/WORLD_test.csv")
df_zinc_world.head()

Unnamed: 0,zinc_id,smiles,smiles_rdkit
0,ZINC000095618893,CN[C@H]1CC[C@H]2Nc3c(O)cc(C(N)=O)cc3[C@@H]2C1,CN[C@H]1CC[C@H]2Nc3c(O)cc(C(N)=O)cc3[C@@H]2C1
1,ZINC000095618901,C[C@@H]1CN(S(=O)(=O)O)CCc2ccc(Cl)cc21,C[C@@H]1CN(S(=O)(=O)O)CCc2ccc(Cl)cc21
2,ZINC000095618909,CC(=O)N[C@H]1CC[C@H]2Nc3ccc(C(N)=O)cc3[C@@H]2C1,CC(=O)N[C@H]1CC[C@H]2Nc3ccc(C(N)=O)cc3[C@@H]2C1
3,ZINC000095618925,O=C(O)[C@@H]1O[C@H](O[n+]2ccccc2)[C@@H](O)[C@H...,O=C(O)[C@@H]1O[C@H](O[n+]2ccccc2)[C@@H](O)[C@H...
4,ZINC000000000513,COc1ccccc1OC[C@H](O)CNC(C)C,COc1ccccc1OC[C@H](O)CNC(C)C


In [5]:
smiles = df_zinc_world.smiles
smiles.head()

0        CN[C@H]1CC[C@H]2Nc3c(O)cc(C(N)=O)cc3[C@@H]2C1
1                C[C@@H]1CN(S(=O)(=O)O)CCc2ccc(Cl)cc21
2      CC(=O)N[C@H]1CC[C@H]2Nc3ccc(C(N)=O)cc3[C@@H]2C1
3    O=C(O)[C@@H]1O[C@H](O[n+]2ccccc2)[C@@H](O)[C@H...
4                          COc1ccccc1OC[C@H](O)CNC(C)C
Name: smiles, dtype: object

In [6]:
print(smiles[2])
MolToSmiles(MolFromSmiles(smiles[2]))

CC(=O)N[C@H]1CC[C@H]2Nc3ccc(C(N)=O)cc3[C@@H]2C1


'CC(=O)N[C@H]1CC[C@H]2Nc3ccc(C(N)=O)cc3[C@@H]2C1'

In [7]:
#smiles = smiles[:100]
smiles_rdkit = []
iid = []
bad_iid = []
for i in range(len(smiles)):
    try:
        smiles_rdkit.append(MolToSmiles(MolFromSmiles(smiles[ i ])))
        iid.append(i)
    except:
        bad_iid.append(i)
        print("Error at %d" % (i))

In [8]:
print(len(smiles))
print(len(iid))

4157
4157


In [9]:
from functools import reduce

def xlength(y):
    return reduce(lambda sum, element: sum + 1, y, 0)

In [10]:
def get_zinc_tokenizer(cfg):
    long_tokens = [a for a in list(cfg._lexical_index.keys()) if xlength(a) > 1] ####
    replacements = ['$','%','^'] # ,'&']
    assert xlength(long_tokens) == len(replacements) ####xzw
    for token in replacements: 
        assert token not in cfg._lexical_index ####
    
    def tokenize(smiles):
        for i, token in enumerate(long_tokens):
            smiles = smiles.replace(token, replacements[i])
        tokens = []
        for token in smiles:
            try:
                ix = replacements.index(token)
                tokens.append(long_tokens[ix])
            except:
                tokens.append(token)
        return tokens
    
    return tokenize

In [12]:
import zinc_grammar
import nltk

_tokenize = get_zinc_tokenizer(zinc_grammar.GCFG)
_parser = nltk.ChartParser(zinc_grammar.GCFG)
_productions = zinc_grammar.GCFG.productions()
_prod_map = {}
for ix, prod in enumerate(_productions):
    _prod_map[prod] = ix
MAX_LEN = 277
_n_chars = len(_productions)

In [13]:
_parser

<nltk.parse.chart.ChartParser at 0x20c9edb43c8>

In [14]:
""" Encode a list of smiles strings into the latent space """
assert type(smiles_rdkit) == list
tokens = map(_tokenize, smiles_rdkit)
parse_trees = []
i = 0
badi = []
for t in tokens:
    #while True:
    try:
        tp = next(_parser.parse(t))
        parse_trees.append(tp)
    except:
        print("Parse tree error at %d" % i)
        badi.append(i)
    i += 1
    #print(i)
productions_seq = [tree.productions() for tree in parse_trees]
indices = [np.array([_prod_map[prod] for prod in entry], dtype=int) for entry in productions_seq]
one_hot = np.zeros((len(indices), MAX_LEN, _n_chars), dtype=np.float32)
for i in range(len(indices)):
    num_productions = len(indices[i])
    if num_productions > MAX_LEN:
        print("Too large molecules, out of range")
    #print("i=  {%d} len(indices)=  {%d} num_productions = %d " % (i,len(indices),num_productions))
        one_hot[i][np.arange(MAX_LEN),indices[i][:MAX_LEN]] = 1.
    else:    
        one_hot[i][np.arange(num_productions),indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN),-1] = 1.

Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large mole

In [15]:
print(len(df_zinc_world), len(one_hot))
print(len(bad_iid), len(badi))
df_zinc_world = df_zinc_world.drop(df_zinc_world.iloc[bad_iid].index)
df_zinc_world = df_zinc_world.drop(df_zinc_world.iloc[badi].index)
print(len(df_zinc_world))
df_zinc_world
# ydata =  df_merged_final["MDA7+D1+Mean_norm"].values

4157 4157
0 0
4157


Unnamed: 0,zinc_id,smiles,smiles_rdkit
0,ZINC000095618893,CN[C@H]1CC[C@H]2Nc3c(O)cc(C(N)=O)cc3[C@@H]2C1,CN[C@H]1CC[C@H]2Nc3c(O)cc(C(N)=O)cc3[C@@H]2C1
1,ZINC000095618901,C[C@@H]1CN(S(=O)(=O)O)CCc2ccc(Cl)cc21,C[C@@H]1CN(S(=O)(=O)O)CCc2ccc(Cl)cc21
2,ZINC000095618909,CC(=O)N[C@H]1CC[C@H]2Nc3ccc(C(N)=O)cc3[C@@H]2C1,CC(=O)N[C@H]1CC[C@H]2Nc3ccc(C(N)=O)cc3[C@@H]2C1
3,ZINC000095618925,O=C(O)[C@@H]1O[C@H](O[n+]2ccccc2)[C@@H](O)[C@H...,O=C(O)[C@@H]1O[C@H](O[n+]2ccccc2)[C@@H](O)[C@H...
4,ZINC000000000513,COc1ccccc1OC[C@H](O)CNC(C)C,COc1ccccc1OC[C@H](O)CNC(C)C
...,...,...,...
4152,ZINC000101144764,C[N+]1(C)[C@@H]2C[C@H](OC(=O)C(O)(c3cccs3)c3cc...,C[N+]1(C)[C@@H]2C[C@H](OC(=O)C(O)(c3cccs3)c3cc...
4153,ZINC000118913164,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=C(CCC(=O)C4)[C@...,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=C(CCC(=O)C4)[C@...
4154,ZINC000085537078,CC[C@]12C=CCN3CC[C@@]4(c5cc([C@@]6(C(=O)OC)C[C...,CC[C@]12C=CCN3CC[C@@]4(c5cc([C@@]6(C(=O)OC)C[C...
4155,ZINC000085537089,CCCN(CCC)C(=O)[C@@H](CCC(=O)OCCCN1CCN(CCOC(=O)...,CCCN(CCC)C(=O)[C@@H](CCC(=O)OCCCN1CCN(CCOC(=O)...


In [15]:
# df_zinc_world.to_csv('../../data/world_zinc_final.csv', index=False)

In [16]:
one_hot.shape

(4157, 277, 76)

In [17]:
perm = np.arange(len(df_zinc_world))
np.random.shuffle(perm)
# appostasis = ydata[perm]
smiles_zinc_world = one_hot[perm]
# TEST_SIZE = 100
# appostasis_train = appostasis[TEST_SIZE:]
# smile_train = smile_rd[TEST_SIZE:]
# appostasis_test = appostasis[:TEST_SIZE]
# smile_test = smile_rd[:TEST_SIZE]

In [18]:
smiles_zinc_world.shape

(4157, 277, 76)

In [19]:
import h5py

h5f = h5py.File('../../data/SMILES_zinc_world.h5', 'w')
h5f.create_dataset('data', data=smiles_zinc_world)
h5f.close()

# h5f = h5py.File('../../data/appostasis_train_demo2.h5', 'w')
# h5f.create_dataset('data', data=appostasis_train)
# h5f.close()

# h5f = h5py.File('../../data/SMILE_test_demo2.h5', 'w')
# h5f.create_dataset('data', data=smile_test)
# h5f.close()

# h5f = h5py.File('../../data/appostasis_test_demo2.h5', 'w')
# h5f.create_dataset('data', data=appostasis_test)
# h5f.close()