In [1]:
import pandas as pd
import numpy as np
from src.processer import fragmentSmiles, preprocessSmiles, applyFncPool

# Zinc data

In [2]:
# Load one of the elaborated ZINC datasets as an array of strings.
# The header row is skipped, and comment handling is disabled because '#'
# is a legitimate SMILES character (triple bond, e.g. 'N#C...').
rawData = np.loadtxt(
    "rawdata/zinc.csv",
    dtype=str,
    comments=None,
    skiprows=1,
)

In [3]:
# Display the loaded array of SMILES strings as a quick sanity check.
rawData

array(['CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1',
       'C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1',
       'N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)cc2)cc1', ...,
       'Cc1ccc(NC(=O)C(=O)N(C)Cc2ccccc2)c(C)c1',
       'Cc1cc(C(=O)Nc2ccc(OCC(N)=O)cc2)c(C)n1C1CC1',
       'O=C(CC(c1ccccc1)c1ccccc1)N1CCN(S(=O)(=O)c2ccccc2[N+](=O)[O-])CC1'],
      dtype='<U120')

In [4]:
# Apply preprocessSmiles to every raw entry via applyFncPool and collect
# the results in a Series (default integer index, one row per input entry).
prepData = pd.Series(
    applyFncPool(rawData, fnc=preprocessSmiles)
)

In [5]:
# Remove missing values, then keep only the first occurrence of each
# duplicated entry. The original index labels are preserved (no reset).
prepData = prepData.dropna().drop_duplicates(keep='first')

In [6]:
# Number of entries left after dropping missing values and duplicates.
len(prepData)

249414

In [7]:
# Apply fragmentSmiles to every preprocessed entry; reusing prepData's index
# keeps each result aligned with the entry it was computed from.
zincT = pd.Series(
    data=applyFncPool(prepData, fnc=fragmentSmiles),
    index=prepData.index,
)

In [8]:
# Name both series so they become the 'smiles' and 'tokens' columns
# once they are concatenated into a DataFrame.
prepData = prepData.rename('smiles')
zincT = zincT.rename('tokens')

In [9]:
# Place the two series side by side, aligned on their shared index.
zincT_full = pd.concat(
    [prepData, zincT],
    axis='columns',
)

In [10]:
# Count the space-separated tokens in each tokenized string.
zincT_full['length'] = zincT_full['tokens'].str.split(' ').str.len()

In [11]:
## Keep only molecules composed of more than one fragment
## (a token length of 1 means a single fragment).
zincT_full = zincT_full.loc[zincT_full['length'] > 1]

## The GoF tool can also decompose molecules using a custom SMARTS cleavage pattern. Here, the rotatable-bond pattern is adopted

In [12]:
from functools import partial

In [13]:
from chemicalgof import Smiles2GoF
# SMARTS for a rotatable bond: a single, non-ring bond (-!@) whose two end
# atoms are neither part of a triple bond (!$(*#*)) nor terminal (!D1).
smartsRotatable = "[!$(*#*)&!D1]-!@[!$(*#*)&!D1]"

# Smiles2GoF with its bond pattern fixed to the rotatable-bond SMARTS.
decomposeSmilesByRotatable = partial(
    Smiles2GoF,
    pattBonds=smartsRotatable,
)

# fragmentSmiles variant that uses the decomposition function above.
fragmentSmilesByRotatable = partial(
    fragmentSmiles,
    fncDecompose=decomposeSmilesByRotatable,
)

In [14]:
# Fragment every preprocessed entry with the rotatable-bond variant; reusing
# prepData's index keeps each result aligned with its source entry.
zincRotatable = pd.Series(
    data=applyFncPool(prepData, fnc=fragmentSmilesByRotatable),
    index=prepData.index,
)

In [15]:
# Name the series so it becomes the 'tokens' column after concatenation.
zincRotatable = zincRotatable.rename('tokens')

In [16]:
# Place the preprocessed entries and their rotatable-bond tokens side by side,
# aligned on their shared index.
zincRotatable_full = pd.concat(
    [prepData, zincRotatable],
    axis='columns',
)

In [17]:
# Count the space-separated tokens in each tokenized string.
zincRotatable_full['length'] = zincRotatable_full['tokens'].str.split(' ').str.len()

In [18]:
## Keep only molecules composed of more than one fragment
## (a token length of 1 means a single fragment).
zincRotatable_full = zincRotatable_full.loc[zincRotatable_full['length'] > 1]

In [19]:
# Display the filtered rotatable-bond table (columns: smiles, tokens, length).
zincRotatable_full

Unnamed: 0,smiles,tokens,length
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1,CC(C)C <1> <0> c1ccc2occc2c1 <6> C C=O N <2> F...,10
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1,Cn1cnnc1 <2> <5> c1ccncc1 <1> N <3> CC1CCCC(C)...,8
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,C1CCNC1 <3> C=O C|R ( <0> c1ccccc1 ) O <3> c1c...,14
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,CC <0> O C=O <1R> C1CCNCC1 <3> C=O <0> c1ncn2c...,13
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C(O)[C@H](C#N)C...,Clc1ccccc1 <3> N C=O C S <3> N#CC1=CN=C(O)C(C#...,8
...,...,...,...
249451,CC1(C)CC[C@H](CNC(=O)Cn2ncc3ccccc3c2=O)c2ccccc21,CC1(C)CCCc2ccccc21 <5S> C N C=O C <2> O=c1[nH]...,8
249452,Cn1ccnc1C(=O)c1ccc(NC(=O)C2CCN(C(=O)C(C)(C)C)C...,CC(C)C <1> C=O <3> C1CCNCC1 <0> C=O N <3> c1cc...,14
249453,Cc1ccc(NC(=O)C(=O)N(C)Cc2ccccc2)c(C)c1,c1ccccc1 <0> C <1> CN <1> C=O C=O N <4> Cc1ccc...,11
249454,Cc1cc(C(=O)Nc2ccc(OCC(N)=O)cc2)c(C)n1C1CC1,NC=O <1> C O <3> c1ccccc1 <0> N C=O <3> Cc1ccc...,14
