In [1]:
import pandas as pd, numpy as np
from rdkit import Chem

In [2]:
from scipy import stats

In [3]:
import pandas as pd, numpy as np, re, os
from src.evaluation import Evaluator, compilerPatterns, ResultsFromMeanStdDF
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 12})

In [4]:
## This cell would need for testing phase - Ignore this
%load_ext autoreload
%autoreload 2

In [5]:
paths = [ os.path.join(dataset.name, experiment.name) 
         for dataset in os.scandir('experiments') if dataset.is_dir() and 'chembl' in dataset.name 
         for experiment in os.scandir(dataset.path)  ]

In [6]:
paths=[]
for dataset in os.scandir('experiments'):
    if not dataset.is_dir() or not 'chembl' in dataset.name:
        continue

    if 'chemblAug5' in dataset.name:
        for setup in os.scandir(dataset):
            if '2hl_512hu' in setup.name and '256bs' in setup.name:
                paths.append(setup.path)

    elif 'chembl' in dataset.name:
        for setup in os.scandir(dataset):
            if '2hl_512hu' in setup.name and '512bs' in setup.name:
                paths.append(setup.path)

In [7]:
paths

['experiments/chemblAug5_SELFIES-RNN/2hl_512hu_256bs_0.001lr',
 'experiments/chemblAug5_SMILES-RNN/2hl_512hu_256bs_0.001lr',
 'experiments/chemblAug5_fragSMILES-RNN/2hl_512hu_256bs_300es_0.001lr',
 'experiments/chembl_SELFIES-RNN/2hl_512hu_512bs_0.001lr',
 'experiments/chembl_SMILES-RNN/2hl_512hu_512bs_0.001lr',
 'experiments/chembl_fragSMILES-RNN/2hl_512hu_512bs_300es_0.001lr',
 'experiments/chemblAug5_t-SMILES-RNN/2hl_512hu_256bs_0.001lr',
 'experiments/chembl_t-SMILES-RNN/2hl_512hu_512bs_0.001lr']

In [8]:
evaluators = [Evaluator(path) for path in paths]

In [9]:
evaluators_table = pd.concat([evaluator.get_as_DFcell() for evaluator in evaluators ], ignore_index=True, axis=0)

In [10]:
evaluators_table = evaluators_table.drop(columns=['dataset','notation']).groupby(['aug','hl','hu','bs','lr']).agg(lambda x: x.dropna())

In [11]:
evaluators_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,SELFIES,SMILES,fragSMILES,t-SMILES
aug,hl,hu,bs,lr,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2,512,512,0.001,<src.evaluation.Evaluator object at 0x7faf13db...,<src.evaluation.Evaluator object at 0x7faf13db...,<src.evaluation.Evaluator object at 0x7faf11b8...,<src.evaluation.Evaluator object at 0x7faf11b8...
5,2,512,256,0.001,<src.evaluation.Evaluator object at 0x7faf152b...,<src.evaluation.Evaluator object at 0x7faf1528...,<src.evaluation.Evaluator object at 0x7fafc6f3...,<src.evaluation.Evaluator object at 0x7faf11b8...


In [12]:
evaluators_table.map(lambda x: x.load_novels())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,SELFIES,SMILES,fragSMILES,t-SMILES
aug,hl,hu,bs,lr,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2,512,512,0.001,True,True,True,True
5,2,512,256,0.001,True,True,True,True


In [13]:
evaluators_table.map(lambda x: x.load_scaffolds())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,SELFIES,SMILES,fragSMILES,t-SMILES
aug,hl,hu,bs,lr,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2,512,512,0.001,True,True,True,True
5,2,512,256,0.001,True,True,True,True


## scaffolds analysis

In [14]:
scaffDFs=pd.concat( [evaluator.getScaffoldResults() for evaluator in evaluators if not evaluator.novels is None], ignore_index=True)

[21:56:42] non-ring atom 1 marked aromatic


In [15]:
scaffDFs['unique%']=scaffDFs['unique']/scaffDFs['total']
scaffDFs['novel%']=scaffDFs['novel']/scaffDFs['unique']
scaffDFs['chiral%']=scaffDFs['chiral']/scaffDFs['novel']

In [16]:
scaffGrp=scaffDFs.drop(columns=['fold','epoch','T','amount','dataset']).groupby(['aug','notation','hl','hu','bs','lr'],as_index=True, dropna=False)

In [17]:
scaffStats=scaffGrp.describe().loc(axis=1)[ ( slice(None),['mean','std'] ) ]

In [18]:
scaffStats=ResultsFromMeanStdDF(scaffStats)

In [19]:
scaffStats

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,total,unique,novel,chiral,unique%,novel%,chiral%
aug,notation,hl,hu,bs,lr,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,SELFIES,2,512,512,0.001,5964.0 ± 6.0,5570.0 ± 40.0,5160.0 ± 70.0,760.0 ± 40.0,0.934 ± 0.007,0.925 ± 0.006,0.148 ± 0.008
1,SMILES,2,512,512,0.001,5976.0 ± 8.0,5490.0 ± 80.0,4600.0 ± 100.0,820.0 ± 40.0,0.92 ± 0.01,0.838 ± 0.009,0.178 ± 0.007
1,fragSMILES,2,512,512,0.001,5985.0 ± 8.0,5500.0 ± 50.0,4440.0 ± 70.0,750.0 ± 50.0,0.918 ± 0.007,0.809 ± 0.007,0.169 ± 0.009
1,t-SMILES,2,512,512,0.001,5908.0 ± 9.0,5330.0 ± 50.0,4730.0 ± 40.0,760.0 ± 60.0,0.902 ± 0.007,0.888 ± 0.004,0.16 ± 0.01
5,SELFIES,2,512,256,0.001,5978.0 ± 6.0,5620.0 ± 40.0,5240.0 ± 60.0,870.0 ± 60.0,0.941 ± 0.006,0.931 ± 0.004,0.17 ± 0.01
5,SMILES,2,512,256,0.001,5976.0 ± 5.0,5610.0 ± 70.0,5000.0 ± 100.0,1000.0 ± 100.0,0.94 ± 0.01,0.9 ± 0.01,0.19 ± 0.02
5,fragSMILES,2,512,256,0.001,5978.0 ± 7.0,5510.0 ± 80.0,4400.0 ± 100.0,830.0 ± 80.0,0.92 ± 0.01,0.81 ± 0.01,0.19 ± 0.02
5,t-SMILES,2,512,256,0.001,5925.0 ± 10.0,5380.0 ± 40.0,4730.0 ± 40.0,810.0 ± 50.0,0.908 ± 0.008,0.879 ± 0.005,0.17 ± 0.01


In [20]:
scaffStats.to_csv('results/08_grisoniScaffMetrics.csv')

In [21]:
params=[2,512,512,0.001]
params5=[2,512,256,0.001]

In [22]:
genPvals = pd.DataFrame.from_dict(
                {notation+'x1': 
                {metric:stats.ttest_ind(
                scaffGrp.get_group((1,notation,*params))[metric],
                scaffGrp.get_group((1,'fragSMILES',*params))[metric],
                ).pvalue for metric in scaffStats.columns} for notation in ['SMILES','SELFIES','t-SMILES']
                } |
                {notation+'x5': 
                {metric:stats.ttest_ind(
                scaffGrp.get_group((5,notation,*params5))[metric],
                scaffGrp.get_group((5,'fragSMILES',*params5))[metric],
                ).pvalue for metric in scaffStats.columns} for notation in ['SMILES','SELFIES','t-SMILES']
                }
                , orient='index'
            )

In [23]:
genPvals

Unnamed: 0,total,unique,novel,chiral,unique%,novel%,chiral%
SMILESx1,0.122287,0.883691,0.02905847,0.042347,0.966772,0.0003886867,0.125945
SELFIESx1,0.001574,0.03405,2.212652e-07,0.64211,0.007221,3.270683e-09,0.005719
SMILESx5,0.544191,0.070134,0.0001120501,0.116053,0.064472,4.87812e-06,0.846064
SELFIESx5,1.0,0.017568,2.491563e-06,0.417392,0.017827,9.042331e-08,0.042876


In [24]:
genPvals.map(lambda x: x<0.05)

Unnamed: 0,total,unique,novel,chiral,unique%,novel%,chiral%
SMILESx1,False,False,True,True,False,True,False
SELFIESx1,True,True,True,False,True,True,True
SMILESx5,False,False,True,False,False,True,False
SELFIESx5,False,True,True,False,True,True,True


## Count how many scaffolds are novel cause of new cyclic fragments

In [25]:
from src.processer import fragmentSmiles, applyFncPool

In [26]:
vocabs=[]
train_data = pd.read_csv(f'data/chembl.tar.xz', usecols = ['fragsmiles'] + [f'fold{fold}' for fold in range(5)], compression="xz")
train_data['fragsmiles'] = train_data['fragsmiles'].str.split(' ')
for fold in range(5):
    query = f'fold{fold} == "train"'
    fold_data = train_data.query(query)['fragsmiles']
    vocabs.append(np.unique(np.concatenate(fold_data.values)))

In [27]:
def checkScaffoldFragments(scaffolds, vocab):

    def checkNewFrag(frags):
        mask = np.in1d(frags, vocab)

        ## If u want track which new fragments are
        # new = np.array(frags)[~mask]
        # if len(new)>0:
        #     print(new)
        return ~mask.any()
    
    # try:
    scaffFrags = pd.Series( applyFncPool(scaffolds, fnc=fragmentSmiles) ).str.split(' ')
    maskNewFrags=scaffFrags.apply(checkNewFrag)

    # return maskNewFrags
    return maskNewFrags.sum()


In [28]:
def countNewScaffCauseFrags(evaluator):
    results = evaluator.novels.copy()

    results['inducedNewFrags']=results.apply(lambda x: checkScaffoldFragments(x.sampled.novelScaff, vocabs[x.fold]), axis=1)
    # return results.apply(lambda x: checkScaffoldFragments(x.sampled.novelScaff, vocabs[x.fold]), axis=1)

    setup = {k:v for k,v in evaluator.setupArgs.items() if k!='es'}
    results = results.assign(**evaluator.datasetArgs, **setup)

    results.drop(columns='sampled', inplace=True)
    return results.sort_values(['amount','fold','epoch'])

In [29]:
fragInduced = pd.concat( [ countNewScaffCauseFrags(evaluator) for evaluator in evaluators if evaluator.datasetArgs['notation']!='fragSMILES'], ignore_index=True)

In [30]:
inducedNewFrags=pd.concat( [ countNewScaffCauseFrags(evaluator) for evaluator in evaluators], ignore_index=True)

In [31]:
inducedNewFrags.to_csv('checkHere.csv', index=False)

In [32]:
inducedNewFrags

Unnamed: 0,fold,amount,epoch,T,inducedNewFrags,dataset,aug,notation,hl,hu,bs,lr
0,0,6000,3,1.0,178,chembl,5,SELFIES,2,512,256,0.001
1,1,6000,3,1.0,169,chembl,5,SELFIES,2,512,256,0.001
2,2,6000,3,1.0,178,chembl,5,SELFIES,2,512,256,0.001
3,3,6000,3,1.0,205,chembl,5,SELFIES,2,512,256,0.001
4,4,6000,3,1.0,175,chembl,5,SELFIES,2,512,256,0.001
5,0,6000,3,1.0,139,chembl,5,SMILES,2,512,256,0.001
6,1,6000,3,1.0,121,chembl,5,SMILES,2,512,256,0.001
7,2,6000,3,1.0,94,chembl,5,SMILES,2,512,256,0.001
8,3,6000,3,1.0,118,chembl,5,SMILES,2,512,256,0.001
9,4,6000,3,1.0,61,chembl,5,SMILES,2,512,256,0.001


In [33]:
scaffDFs

Unnamed: 0,fold,amount,epoch,T,total,unique,novel,chiral,dataset,aug,notation,hl,hu,bs,lr,unique%,novel%,chiral%
0,0,6000,3,1.0,5972,5675,5316,917,chembl,5,SELFIES,2,512,256,0.001,0.950268,0.93674,0.172498
1,1,6000,3,1.0,5974,5598,5214,874,chembl,5,SELFIES,2,512,256,0.001,0.937061,0.931404,0.167626
2,2,6000,3,1.0,5987,5614,5206,924,chembl,5,SELFIES,2,512,256,0.001,0.937698,0.927325,0.177488
3,3,6000,3,1.0,5981,5651,5279,861,chembl,5,SELFIES,2,512,256,0.001,0.944825,0.934171,0.163099
4,4,6000,3,1.0,5976,5585,5178,775,chembl,5,SELFIES,2,512,256,0.001,0.934572,0.927126,0.149672
5,0,6000,3,1.0,5975,5513,4840,845,chembl,5,SMILES,2,512,256,0.001,0.922678,0.877925,0.174587
6,1,6000,3,1.0,5968,5600,5020,874,chembl,5,SMILES,2,512,256,0.001,0.938338,0.896429,0.174104
7,2,6000,3,1.0,5979,5634,5090,866,chembl,5,SMILES,2,512,256,0.001,0.942298,0.903443,0.170138
8,3,6000,3,1.0,5975,5698,5174,1145,chembl,5,SMILES,2,512,256,0.001,0.95364,0.908038,0.221299
9,4,6000,3,1.0,5981,5588,5011,1041,chembl,5,SMILES,2,512,256,0.001,0.934292,0.896743,0.207743


In [34]:
induced = scaffDFs.merge(inducedNewFrags)

In [35]:
induced

Unnamed: 0,fold,amount,epoch,T,total,unique,novel,chiral,dataset,aug,notation,hl,hu,bs,lr,unique%,novel%,chiral%,inducedNewFrags
0,0,6000,3,1.0,5972,5675,5316,917,chembl,5,SELFIES,2,512,256,0.001,0.950268,0.93674,0.172498,178
1,1,6000,3,1.0,5974,5598,5214,874,chembl,5,SELFIES,2,512,256,0.001,0.937061,0.931404,0.167626,169
2,2,6000,3,1.0,5987,5614,5206,924,chembl,5,SELFIES,2,512,256,0.001,0.937698,0.927325,0.177488,178
3,3,6000,3,1.0,5981,5651,5279,861,chembl,5,SELFIES,2,512,256,0.001,0.944825,0.934171,0.163099,205
4,4,6000,3,1.0,5976,5585,5178,775,chembl,5,SELFIES,2,512,256,0.001,0.934572,0.927126,0.149672,175
5,0,6000,3,1.0,5975,5513,4840,845,chembl,5,SMILES,2,512,256,0.001,0.922678,0.877925,0.174587,139
6,1,6000,3,1.0,5968,5600,5020,874,chembl,5,SMILES,2,512,256,0.001,0.938338,0.896429,0.174104,121
7,2,6000,3,1.0,5979,5634,5090,866,chembl,5,SMILES,2,512,256,0.001,0.942298,0.903443,0.170138,94
8,3,6000,3,1.0,5975,5698,5174,1145,chembl,5,SMILES,2,512,256,0.001,0.95364,0.908038,0.221299,118
9,4,6000,3,1.0,5981,5588,5011,1041,chembl,5,SMILES,2,512,256,0.001,0.934292,0.896743,0.207743,61


In [36]:
induced['inducedNewFrags%']=induced['inducedNewFrags']/induced['novel']

In [37]:
inducedGrp=induced.drop(columns=['dataset','T','fold','amount','epoch']).groupby(['aug','notation','hl','hu','bs','lr'],as_index=True, dropna=False)

In [38]:
inducedStats=inducedGrp.describe().loc(axis=1)[ ( slice(None),['mean','std'] ) ]

In [39]:
inducedStats=ResultsFromMeanStdDF(inducedStats)

In [40]:
inducedStats.to_csv('checkHere.csv', index=False)