In [1]:
# Import the project-local myfuncs module from the ../cix folder (added to sys.path below)
%matplotlib inline
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, '../cix')
import myfuncs as mf
import pandas as pd
from rdkit import rdBase
rdBase.DisableLog('rdApp.*') # To make rdkit silent
from rdkit.Chem import PandasTools as pt
from rdkit.Chem import Descriptors
import numpy as np
import time



This is a script to analyze the output of experiment 1. In this experiment the CMD was trained with increasingly large subsets of the original set of 300K molecules used in the CMD paper. The goal is to determine how the size of the training set influences the correctness, diversity and novelty of the output generated with the CMD. 
The correctness of the training and output files is assessed by the percentage of correct SMILES.
The diversity of the training and output files is assessed by counting the number of clusters, frames and generic frames in both sets. 
The novelty of the output is assessed by the percentage of generated molecules whose Tanimoto similarity to every molecule in the training set is < 0.7, and by the percentage of frames or generic frames not present in the training set. 
In this version we analyze the outputs generated when the CMD was set to output 20K molecules instead of 5K. 

In [None]:
##########################################################
## Analysis of the unconditioned output
##########################################################

# Start the wall-clock timer that the last cell of the notebook reads back
start = time.time()

# Training-set sizes under study: 50K up to 300K molecules, in steps of 50K
it = range(50000, 300001, 50000)

# Run the whole analysis once per training-set size.
# NOTE(review): "train" / "unc20k-" look like file-name prefixes for the
# training and unconditioned-output files — confirm in myfuncs.wholean.
df_un, cls_un = mf.wholean(
    it=it,
    name_train="train",
    name_pref="unc20k-"
)

Clustering time: 00:00:09
Diversity analysis time: 00:02:51
Clustering time: 00:00:03
Diversity analysis time: 00:00:59
Arenas creation time: 00:00:00
Novelty analysis time: 00:04:18
Clustering time: 00:00:25
Diversity analysis time: 00:06:43
Clustering time: 00:00:03
Diversity analysis time: 00:01:12
Arenas creation time: 00:00:00


In [None]:
# Display the results table of the unconditioned analysis
# (df_un is the first value returned by mf.wholean for the "unc20k-" run)

df_un

In [None]:
# Save the unconditioned-analysis results table to CSV in the current working directory.
# With pandas defaults the DataFrame index is written as the first column.

df_un.to_csv("analysis1-un-20000.csv")

In [None]:
# Plot the cluster distributions and cluster-size distributions of the unconditioned run
# NOTE(review): the meaning of the positional arguments 10 and 20 is defined in
# myfuncs.plotmulticlus and is not visible from this notebook — confirm there.
mf.plotmulticlus(cls_un, 10, 20)

In [None]:
# Training-set sizes: 50K up to 300K molecules, in steps of 50K.
# Same value as in the unconditioned-analysis cell; presumably repeated so the
# conditioned analysis below can be run without re-running that cell.
it = range(50000, 300001, 50000)

In [None]:
##########################################################
## Analysis of the conditioned output
##########################################################

# Same analysis as for the unconditioned output, now on the "con20k-" files.
# Relies on `it` (the training-set sizes) being defined by an earlier cell.
df_co, cls_co = mf.wholean(
    it=it,
    name_train="train",
    name_pref="con20k-"
)

In [None]:
# Display the results table of the conditioned analysis
# (df_co is the first value returned by mf.wholean for the "con20k-" run)

df_co

In [None]:
# Save the conditioned-analysis results table to CSV in the current working directory.
# With pandas defaults the DataFrame index is written as the first column.

df_co.to_csv("analysis1-co-20000.csv")

In [None]:
# Plot the cluster distributions and cluster-size distributions of the conditioned run
# NOTE(review): the meaning of the positional arguments 10 and 20 is defined in
# myfuncs.plotmulticlus and is not visible from this notebook — confirm there.

mf.plotmulticlus(cls_co, 10, 20)

In [None]:
### Plot one molecular-weight (MWt) histogram per training-set size (conditioned output)

# Training-set sizes: 50K up to 300K molecules, in steps of 50K
it = range(50000, 300001, 50000)

# One list of molecular weights per training-set size, in the same order as `it`
mwts = []

for size in it:
    # Read the conditioned output generated by the model trained on `size` molecules
    smis = mf.smif2smis('./con20k-' + str(size) + '.smi')

    # Only the third returned item (the SMILES list) is used below. The second
    # item used to be unpacked into `n`, silently overwriting the loop variable;
    # it now gets its own name so the loop variable stays intact.
    ncorr, ntot, smis, wrongsmis = mf.corrsmis(smis)

    # Build a DataFrame with a "smiles" column and let RDKit add the "ROMol" column
    smidf = mf.smis2smidf(smis)
    pt.AddMoleculeColumnToFrame(smidf, "smiles")

    # Molecular weight of every molecule; the molecule objects are dropped
    # once the weights have been computed
    smidf['mw'] = smidf['ROMol'].map(Descriptors.MolWt)
    del smidf["ROMol"]
    mwts.append(list(smidf['mw']))

# Legend entries are derived from `it` so they cannot drift out of sync with the
# sizes actually analysed (yields "# train=50K" ... "# train=300K").
leg = ["# train=" + str(k // 1000) + "K" for k in it]

# NOTE(review): the numeric arguments are passed positionally and their meaning is
# defined in myfuncs.paintmultihist (3 and 2 presumably form the subplot grid for
# the six histograms) — confirm there.
mf.paintmultihist(mwts, "MWt", 3, 2, 270, 300, 15, 15, 210, 400, leg)

In [None]:
# Stop the notebook-wide timer (started in the unconditioned-analysis cell)
end = time.time()
eltime = end - start

# Render the elapsed seconds as HH:MM:SS; because %H is an hour-of-day field,
# runs of 24 hours or longer wrap around.
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(eltime))
print('Exp1Analysis20000 execution time: ' + elapsed_hms)