In [22]:
import os
# Set both CUDA environment variables in one call: device enumeration order
# is "PCI_BUS_ID" and the only visible device id is "4".
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "4",
})

In [23]:
import sys
sys.path.append('../DLEPS')
# import molecule_vae
from rdkit.Chem import MolFromSmiles, MolToSmiles
from rdkit.Chem import Draw

import numpy as np  
import pandas as pd

In [24]:
# Load the ZINC FDA dataset from CSV and preview the first rows.
# The preview below shows the columns "Unnamed: 0", "zinc_id" and "smiles".
df_zinc_fda =  pd.read_csv("../../data/fda_zinc.csv")
df_zinc_fda.head()

Unnamed: 0,zinc_id,smiles
0,ZINC000001530427,C[C@@H]1O[C@@H]1P(=O)(O)O
1,ZINC000003807804,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
2,ZINC000000120286,Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1
3,ZINC000242548690,C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...
4,ZINC000000008492,Oc1cccc2cccnc12


In [25]:
# Pull the SMILES strings out as a Series and preview them.
smiles = df_zinc_fda["smiles"]
smiles.head()

0                            C[C@@H]1O[C@@H]1P(=O)(O)O
1               Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
2                     Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1
3    C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...
4                                      Oc1cccc2cccnc12
Name: smiles, dtype: object

In [26]:
# Sanity check: print one raw SMILES string, then display the same string
# after a MolFromSmiles -> MolToSmiles round trip so the two can be compared.
print(smiles[2])
MolToSmiles(MolFromSmiles(smiles[2]))

Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1


'Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1'

In [27]:
# Canonicalise every SMILES string with RDKit.
#   smiles_rdkit : canonical SMILES of the molecules that converted cleanly
#   iid          : row positions of those molecules
#   bad_iid      : row positions of molecules that could not be converted
# Positions (not index labels) are recorded because bad_iid is later consumed
# through DataFrame.iloc.
smiles_rdkit = []
iid = []
bad_iid = []
for i, smi in enumerate(smiles):
    try:
        mol = MolFromSmiles(smi)
        # RDKit signals an unparsable SMILES by returning None rather than raising.
        canonical = MolToSmiles(mol) if mol is not None else None
    except Exception:
        # Best effort: any conversion failure marks the row as bad, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        canonical = None

    if canonical is None:
        bad_iid.append(i)
        print("Error at %d" % (i))
    else:
        smiles_rdkit.append(canonical)
        iid.append(i)

In [28]:
print(len(smiles))
print(len(iid))

1615
1615


In [29]:
from functools import reduce

def xlength(y):
    """Return the number of items produced by iterating over ``y``.

    Works for any iterable, including ones without ``__len__``; a one-shot
    iterator is consumed by the call.
    """
    return sum(1 for _ in y)

In [30]:
def get_zinc_tokenizer(cfg):
    """Build a tokenizer that splits a SMILES string into grammar terminals.

    Every key of ``cfg._lexical_index`` longer than one character is treated
    as a multi-character token. Each one is temporarily swapped for a
    single-character placeholder so the string can be split character by
    character, and is then restored in the returned token list.

    Args:
        cfg: grammar object exposing a ``_lexical_index`` mapping whose keys
            are the terminal strings.

    Returns:
        A function ``tokenize(smiles) -> list[str]``.

    Raises:
        AssertionError: if the number of multi-character terminals differs
            from the number of available placeholders, or if a placeholder
            is itself a terminal of the grammar.
    """
    long_tokens = [a for a in cfg._lexical_index.keys() if len(a) > 1]
    replacements = ['$', '%', '^']  # ,'&']
    assert len(long_tokens) == len(replacements)
    for token in replacements:
        assert token not in cfg._lexical_index

    # placeholder character -> multi-character token it stands for
    restore = dict(zip(replacements, long_tokens))

    def tokenize(smiles):
        """Split ``smiles`` into a list of terminal tokens."""
        for long_token, placeholder in zip(long_tokens, replacements):
            smiles = smiles.replace(long_token, placeholder)
        # A plain lookup replaces the former try/bare-except around list.index,
        # which would also have hidden unrelated errors.
        return [restore.get(char, char) for char in smiles]

    return tokenize

In [31]:
import zinc_grammar
import nltk

# Tokenizer and chart parser are both built from zinc_grammar.GCFG.
_tokenize = get_zinc_tokenizer(zinc_grammar.GCFG)
_parser = nltk.ChartParser(zinc_grammar.GCFG)

# Map every grammar production to its position in the production list;
# that position is the column used in the one-hot encoding.
_productions = zinc_grammar.GCFG.productions()
_prod_map = {production: position for position, production in enumerate(_productions)}
_n_chars = len(_productions)

# Number of production steps encoded per molecule.
MAX_LEN = 277

In [32]:
# Display the parser's repr to confirm the ChartParser was constructed.
_parser

<nltk.parse.chart.ChartParser at 0x1f672c432c8>

In [33]:
# Encode the canonical SMILES strings as one-hot tensors of grammar productions.
# Result: one_hot with shape (parsed molecules, MAX_LEN, _n_chars).
assert isinstance(smiles_rdkit, list)

tokens = map(_tokenize, smiles_rdkit)
parse_trees = []
badi = []  # positions in smiles_rdkit for which no parse tree was obtained
for i, t in enumerate(tokens):
    try:
        # Keep the first parse only; next() raises StopIteration when the
        # parser yields no tree at all.
        tp = next(_parser.parse(t))
    except Exception:
        # Best effort: record the failure and carry on with the next molecule.
        # (Exception instead of a bare except so Ctrl-C still interrupts the loop.)
        print("Parse tree error at %d" % i)
        badi.append(i)
    else:
        parse_trees.append(tp)

productions_seq = [tree.productions() for tree in parse_trees]
indices = [np.array([_prod_map[prod] for prod in entry], dtype=int) for entry in productions_seq]

one_hot = np.zeros((len(indices), MAX_LEN, _n_chars), dtype=np.float32)
for i, production_ids in enumerate(indices):
    num_productions = len(production_ids)
    if num_productions > MAX_LEN:
        # The molecule needs more steps than the tensor holds: keep only the
        # first MAX_LEN productions (the sequence is truncated, not dropped).
        print("Too large molecules, out of range")
        one_hot[i][np.arange(MAX_LEN), production_ids[:MAX_LEN]] = 1.
    else:
        one_hot[i][np.arange(num_productions), production_ids] = 1.
        # Pad the unused steps by activating the last production column.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.

Parse tree error at 117
Parse tree error at 128
Parse tree error at 1110
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large molecules, out of range
Too large

In [34]:
# Remove the molecules that failed RDKit conversion (bad_iid) or grammar
# parsing (badi) so the table stays row-aligned with one_hot.
print(len(df_zinc_fda), len(one_hot))
print(len(bad_iid), len(badi))

# bad_iid holds positions in the original table; badi holds positions in the
# table *after* the bad_iid rows are gone, hence the two sequential drops.
# The guard makes the cell safe to re-run: once the lengths already match,
# dropping by position again would delete unrelated rows.
if len(df_zinc_fda) != len(one_hot):
    df_zinc_fda = df_zinc_fda.drop(df_zinc_fda.iloc[bad_iid].index)
    df_zinc_fda = df_zinc_fda.drop(df_zinc_fda.iloc[badi].index)

assert len(df_zinc_fda) == len(one_hot), (
    "table and one_hot are out of sync: %d rows vs %d encodings"
    % (len(df_zinc_fda), len(one_hot))
)
print(len(df_zinc_fda))
df_zinc_fda
# ydata =  df_merged_final["MDA7+D1+Mean_norm"].values

1615 1612
0 3
1612


Unnamed: 0,zinc_id,smiles
0,ZINC000001530427,C[C@@H]1O[C@@H]1P(=O)(O)O
1,ZINC000003807804,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
2,ZINC000000120286,Nc1nc(N)c2nc(-c3ccccc3)c(N)nc2n1
3,ZINC000242548690,C[C@H]1O[C@@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[...
4,ZINC000000008492,Oc1cccc2cccnc12
...,...,...
1610,ZINC000022010387,C[C@H]1[C@H](c2ccccc2)OCCN1C
1611,ZINC000022448097,CCOc1ccc(C[C@@H](CN(CCN(CC(=O)O)CC(=O)O)CC(=O)...
1612,ZINC000100370145,CC1(C)[C@H]2CC[C@@]1(CS(=O)(=O)O)C(=O)/C2=C/c1...
1613,ZINC000059111167,[C-]#[N+]CC(C)(C)OC


In [35]:
# Write the filtered table to CSV; index=False leaves out the DataFrame index.
df_zinc_fda.to_csv('../../data/fda_zinc_final.csv', index=False)

In [14]:
# Shape check: (encoded molecules, MAX_LEN production steps, number of grammar productions).
one_hot.shape

(1612, 277, 76)

In [15]:
# Shuffle the encoded molecules. A fixed seed keeps the permutation identical
# across "Restart Kernel -> Run All" executions.
SHUFFLE_SEED = 0

# perm indexes one_hot, so it must cover exactly one_hot's rows; the table is
# expected to have the same number of rows.
assert len(df_zinc_fda) == len(one_hot), "df_zinc_fda and one_hot are not row-aligned"
perm = np.random.RandomState(SHUFFLE_SEED).permutation(len(one_hot))

# NOTE: after shuffling, row k of smiles_zinc_fda corresponds to
# df_zinc_fda.iloc[perm[k]], not to row k of the table or of the CSV saved
# from it. Keep perm if the two must be matched up again.
smiles_zinc_fda = one_hot[perm]
# TEST_SIZE = 100
# appostasis_train = appostasis[TEST_SIZE:]
# smile_train = smile_rd[TEST_SIZE:]
# appostasis_test = appostasis[:TEST_SIZE]
# smile_test = smile_rd[:TEST_SIZE]

In [17]:
# Shape check on the shuffled tensor; it should equal one_hot.shape.
smiles_zinc_fda.shape

(1612, 277, 76)

In [16]:
import h5py

# Store the shuffled one-hot tensor in an HDF5 file under the dataset name 'data'.
# The context manager closes the file even if create_dataset raises.
with h5py.File('../../data/SMILES_zinc_fda.h5', 'w') as h5f:
    h5f.create_dataset('data', data=smiles_zinc_fda)

# h5f = h5py.File('../../data/appostasis_train_demo2.h5', 'w')
# h5f.create_dataset('data', data=appostasis_train)
# h5f.close()

# h5f = h5py.File('../../data/SMILE_test_demo2.h5', 'w')
# h5f.create_dataset('data', data=smile_test)
# h5f.close()

# h5f = h5py.File('../../data/appostasis_test_demo2.h5', 'w')
# h5f.create_dataset('data', data=appostasis_test)
# h5f.close()