In [1]:
import pandas as pd
import numpy as np
from src.processer import fragmentSmiles, getSelfiesToks, applyFncPool, getTsmiles
import os

## Tokenization of the MOSES dataset

In [2]:
# Directory prefix for data files.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
pathData = "data/"

In [3]:
# Load only the SMILES column. Squeezing along the column axis always yields a
# Series; a bare .squeeze() would collapse a one-row file to a scalar string.
mosesData = pd.read_csv("rawdata/moses.txt", usecols=['SMILES']).squeeze("columns")

In [4]:
# Number of SMILES entries loaded.
mosesData.size

1936962

In [5]:
# Start the token table from the raw SMILES, stored in a column named 'smiles'.
mosesDataToks = mosesData.to_frame('smiles')

In [6]:
# Add the 'tsmiles' column by passing getTsmiles to applyFncPool
# (presumably a pooled/parallel map over the SMILES — confirm in src.processer).
mosesDataToks['tsmiles'] = applyFncPool(mosesData, fnc=getTsmiles)

In [7]:
# Add the 'fragsmiles' column by passing fragmentSmiles to applyFncPool
# (presumably a pooled/parallel map over the SMILES — confirm in src.processer).
mosesDataToks['fragsmiles'] = applyFncPool(mosesData, fnc=fragmentSmiles)

In [8]:
# Add the 'selfies' column by passing getSelfiesToks to applyFncPool
# (presumably a pooled/parallel map over the SMILES — confirm in src.processer).
mosesDataToks['selfies'] = applyFncPool(mosesData, fnc=getSelfiesToks)

In [9]:
# Preview the table: the original SMILES plus the three columns added above.
mosesDataToks

Unnamed: 0,smiles,tsmiles,fragsmiles,selfies
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1,*C&*C*&*C*&*S(*)=O&*C1=CC=C2NC(=NC(=O)O*)NC2=C...,C C C O=[SH2] <6> N=c1[nH]c2ccccc2[nH]1 <0> C=...,[C] [C] [C] [S] [=Branch1] [C] [=O] [C] [=C] [...
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,*C&*C(*)(C)C&*C(*)=O&*C(*)*&*O*&*C1=CC=C(*)C=C...,C C ( C ) ( C ) C=O C ( <2> c1c[nH]cn1 ) O <0>...,[C] [C] [Branch1] [C] [C] [Branch1] [C] [C] [C...
2,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1,*C&*C1C2CCC(C2)C1*&*C*&*N(*)C(*)=O&*C*&*C*^*C1...,O C C N ( C=O <0> c1ccccc1 <3> Cl ) C <3> C1CC...,[C] [C] [C] [C] [C] [C] [Branch1] [Ring2] [C] ...
3,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,*C&*C1=C(*)C=CC=C1*&*Cl&&*NC1=NC=CC=C1*^*C(=O)...,C <5> c1ccccc1 <4> <0> ( Cl ) N <2> c1ccncc1 <...,[C] [C] [=C] [Branch1] [C] [Cl] [C] [=C] [C] [...
4,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,*C&*N1C=NC2=C1C(=O)N(*)C(=O)N2*&*C*&*C(*)*^*C^...,C <6> O=c1[nH]c(=O)c2[nH]cnc2[nH]1 <2> <10> ( ...,[C] [N] [C] [=N] [C] [=C] [Ring1] [Branch1] [C...
...,...,...,...,...
1936957,N#Cc1c(Br)cnc(N)c1Br,*C#N&*C1=C(*)C=NC(*)=C1*&*Br&&*N&*Br&&,Br <1> c1ccncc1 <4> <5> ( Br ) <0> ( C#N ) N,[N] [#C] [C] [=C] [Branch1] [C] [Br] [C] [=N] ...
1936958,COC(=O)c1cc(CNC(=O)OC(C)(C)C)ccc1C,*C&*OC(*)=O&*C1=CC(*)=CC=C1*&*C*&*NC(=O)O*^*C^...,C C ( C ) ( C ) O C=O N C <0> c1ccccc1 <4> <3>...,[C] [O] [C] [=Branch1] [C] [=O] [C] [=C] [C] [...
1936959,NC(=O)c1ccc2ccccc2c1Br,*C(N)=O&*C1=CC=C2C=CC=CC2=C1*&*Br&&&,N C=O <0> c1ccc2ccccc2c1 <9> Br,[N] [C] [=Branch1] [C] [=O] [C] [=C] [C] [=C] ...
1936960,CC(=O)Nc1cccc(-c2nc3cc(C)ccc3[nH]c2=O)c1,*C&*C(=O)N*&*C1=CC=CC(*)=C1&*C1=NC2=CC(*)=CC=C...,C C=O N <0> c1ccccc1 <4> <2> O=c1cnc2ccccc2[nH...,[C] [C] [=Branch1] [C] [=O] [N] [C] [=C] [C] [...


## Discarding molecules composed of only one fragment according to the GoF cleavage pattern

In [12]:
# Token count of each fragSMILES string. Tokens are separated by single spaces,
# so the count is the number of spaces plus one.
length_fragsmiles = mosesDataToks['fragsmiles'].map(lambda tokens: tokens.count(' ') + 1)

In [13]:
# True for rows whose fragSMILES consists of fewer than two tokens.
mask_1frag = length_fragsmiles.lt(2)

In [16]:
## 269 molecules are composed of only one fragment, so they are not considered fragSMILES
mask_1frag.sum()

269

In [15]:
# Keep only the rows that are not flagged by mask_1frag.
# NOTE(review): this rebinds mosesDataToks, so the cell is not idempotent. Its
# execution count (In [15]) is lower than that of the counting cell shown before
# it (In [16]), i.e. the cells were not run in the displayed order.
mosesDataToks = mosesDataToks[~mask_1frag]

## Splitting into 5 folds (random seed = 0)

In [25]:
from sklearn.model_selection import KFold

# 5-fold splitter with shuffling; the fixed random_state makes the folds reproducible.
kf=KFold(n_splits=5, shuffle=True, random_state=0)

# Row labels of the filtered table; these are the items handed to the splitter.
indexes = mosesDataToks.index.unique()

In [33]:
def tabular_fold_idxs(indexes, splitting_object):
    """Build a table that labels every item as 'train' or 'valid' for each fold.

    Parameters
    ----------
    indexes : index-like
        Row labels of the resulting table; also the sequence passed to
        ``splitting_object.split``.
    splitting_object : object
        Anything exposing ``split(indexes)`` that yields
        ``(train_positions, valid_positions)`` pairs of positional indices.

    Returns
    -------
    pandas.DataFrame
        One column per fold, named ``fold0``, ``fold1``, ...; each cell holds
        'train' or 'valid', or None for positions a split does not mention.
    """
    fold_table = pd.DataFrame(index=indexes)

    for fold_no, (train_pos, valid_pos) in enumerate(splitting_object.split(indexes)):
        column = f'fold{fold_no}'
        # Create the column first so it can be addressed by position below.
        fold_table[column] = None
        column_pos = fold_table.columns.get_loc(column)
        # 'train' is written before 'valid', so 'valid' wins on any overlap.
        for label, positions in (('train', train_pos), ('valid', valid_pos)):
            fold_table.iloc[positions, column_pos] = label

    return fold_table

In [34]:
# Per-fold 'train'/'valid' labels, indexed by the row labels of mosesDataToks.
split_df = tabular_fold_idxs(indexes=indexes, splitting_object=kf)

In [37]:
# Place the fold-label columns next to the token columns; rows are aligned on the index.
datafile = pd.concat([mosesDataToks, split_df], axis="columns")

## Test (prova): reloading the dataset from disk

In [1]:
import pandas as pd

In [2]:
# Read only the 'smiles' and 'fold0' columns from data/moses.tar.xz.
# NOTE(review): the name ends in .tar.xz, yet the file is read as a plain
# xz-compressed CSV (compression is forced to "xz") — confirm how it was written.
datafile = pd.read_csv("data/moses.tar.xz", compression="xz", usecols=['smiles', 'fold0'])

In [5]:
# Rows labelled 'train' in fold 0. A boolean mask selects them directly instead
# of grouping the whole table to extract one label, and returns an empty frame
# (rather than raising KeyError) if the label is absent.
datafile.loc[datafile['fold0'] == 'train']

Unnamed: 0,smiles,fold0
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,train
3,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,train
4,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,train
5,CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O,train
6,O=C(C1CCCCC1)N1CC(=O)N2CCCc3ccccc3C2C1,train
...,...,...
1936687,CCc1nc(N)c(Br)cc1Br,train
1936688,N#Cc1c(Br)cnc(N)c1Br,train
1936690,NC(=O)c1ccc2ccccc2c1Br,train
1936691,CC(=O)Nc1cccc(-c2nc3cc(C)ccc3[nH]c2=O)c1,train
