In [40]:
import qml
import numpy as np
from parse_cube_files import CUBE
import itertools
import sys
sys.path.insert(0, '/home/misa/APDFT/prototyping/atomic_energies/')
import alchemy_tools as alch
import qml_interface as qi
import matplotlib
matplotlib.use('Qt5Agg')
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 22})

In [106]:
# Load the data set: collect the result files below the cspline directory,
# then read the per-molecule data and the size of every molecule.
paths = qi.wrapper_alch_data(
    path='/home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/cspline'
)
alchemy_data, molecule_size = qi.load_alchemy_data(paths)


In [107]:
# Re-use a fixed train/test split: the global molecule indices are read back
# from text files (the commented-out line shows how the test indices were saved).
# np.savetxt('/home/misa/APDFT/prototyping/atomic_energies/analysis/global_idc_test_analyse_errors.txt', global_idc[1])
# loadtxt returns floats, hence the cast to int so the values can be used as indices
g_tr = np.loadtxt('/home/misa/APDFT/prototyping/atomic_energies/analysis/global_idc_tr_analyse_errors.txt').astype(int)
g_test = np.loadtxt('/home/misa/APDFT/prototyping/atomic_energies/analysis/global_idc_test_analyse_errors.txt').astype(int)

In [108]:
# Select training and validation set.
# The split comes from the index files (g_tr, g_test) instead of
# qi.get_indices(len(alchemy_data), training_set_size); training_set_size is
# therefore not used by the code below.
training_set_size = 100
global_idc = g_tr, g_test
# translate global molecule indices into local (per-atom) indices
local_idc = qi.get_local_idx(global_idc[0], molecule_size), qi.get_local_idx(global_idc[1], molecule_size)

# build representations and labels for all atoms of the data set
local_reps = qi.generate_atomic_representations(alchemy_data, molecule_size)
local_labels = qi.generate_label_vector(alchemy_data, molecule_size.sum())

# optimize model
rep = local_reps[local_idc[0]], local_reps[local_idc[1]] # (training, validation) representations
labels = local_labels[local_idc[0]], local_labels[local_idc[1]] # (training, validation) labels
sigmas = np.logspace(-1, 4, 12) # list of hyperparameters for optimization
lams = np.logspace(-15, 0, 16)
opt_data, coeffs, errors = qi.optimize_hypar(rep, labels, sigmas, lams)

# Pick the row of opt_data with the smallest value in column 2 and read sigma
# and lambda from columns 0 and 1. np.argmin returns the first minimum, which is
# the same row the former np.where(... == np.amin(...))[0] lookup selected, but
# the search is done only once.
# NOTE(review): column 2 is presumably the validation error - confirm in qi.optimize_hypar.
best_row = opt_data[np.argmin(opt_data[:, 2])]
best_sigma = best_row[0]
best_lambda = best_row[1]

# The error in atomisation energy per molecule

In [110]:
# predicted atomic energies of the validation atoms
atomic_energies = qi.predict_labels(rep[1], rep[0], best_sigma, coeffs)

# true atomisation energies: per validation molecule, the sum over column 6 of its data
ref_atomisation_en = np.array(
    [alchemy_data[i][:, 6].sum() for i in global_idc[1]],
    dtype=float,
)

# molecule size of molecules in test set
molecule_size_test = molecule_size[global_idc[1]]

# error per molecule
# NOTE(review): later cells read `error_molecules`, which is only assigned in the
# commented-out line below - they depend on a value left over in the kernel.
# error_molecules = qi.calculate_error_atomisation_energy(atomic_energies, molecule_size_test, ref_atomisation_en)
error_molecules_cspline = qi.calculate_error_atomisation_energy(
    atomic_energies, molecule_size_test, ref_atomisation_en
)

error_molecules_cspline.mean()

0.6866674183169938

In [55]:
# Position (within the test set) of the molecule with the largest error;
# np.argmax returns the first maximum.
worst_test_pos = np.argmax(error_molecules)
# error_molecules is ordered like global_idc[1], whereas paths is indexed by the
# global molecule index, so the position has to be translated before the lookup.
# The path is printed because a bare expression that is not the last statement
# of a cell is silently discarded.
print(paths[global_idc[1][worst_test_pos]])
np.max(error_molecules)

4.078284197615917

In [22]:
# test calculate_error_atomisation_energy

# predict the energies for one molecule ('/home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/dsgdb9nsd_000275/atomic_energies.txt')
# ae = qi.predict_labels(local_reps[qi.get_local_idx([global_idc[1][2]], molecule_size)], rep[0], best_sigma, coeffs)
# # subtract the sum of the predicted energies from the sum of the correct atomic energies
# np.abs(ae.sum()-alchemy_data[4][:,6].sum())
# error_molecules
# Compare this value with the third element of error_molecules: for this random subset both values are the same,
# indicating that the scheme works correctly.

# Error Integration

In [23]:
# Keep an independent snapshot of the current per-molecule errors under the name
# trapz. A copy (instead of a second name for the same array) matches how the
# cspline errors are stored and protects the values against in-place changes
# of error_molecules.
trapz = error_molecules.copy()

In [111]:
# Independent copy of the per-molecule errors obtained for the cspline data set
cspline = error_molecules_cspline.copy()

In [112]:
# Mean error per molecule, first for cspline and then for trapz
for per_molecule_errors in (cspline, trapz):
    print(per_molecule_errors.mean())

0.6866674183169938
0.6918728401035154


In [117]:
# Difference of the two per-molecule error vectors against the position of the
# molecule in the test set, together with a black zero line for reference
molecule_ids = np.arange(len(trapz))
plt.plot(molecule_ids, trapz - cspline, 'o')
plt.plot(molecule_ids, np.zeros(len(trapz)), color='black')
plt.xlabel('Molecule ID')
plt.ylabel(r'Error(trapz) - Error(cspline)')


Text(0, 0.5, 'Error(trapz) - Error(cspline)')

# Error distribution per molecule

In [26]:
# histogram of the error per molecule with a kernel density estimate on top
import pandas as pd
import matplotlib
matplotlib.use('Qt5Agg')
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 22})

# bin width is a fifth of the standard deviation; the histogram uses twice as
# many bins as bins of that width fit into the data range
binwidth = error_molecules.std() / 5
total_width = np.amax(error_molecules) - np.amin(error_molecules)
n_bins = 2 * int(total_width / binwidth)

fhist, axhist = plt.subplots(1, 1)
axhist.hist(error_molecules, bins=n_bins, edgecolor='black', density=True)
s = pd.Series(error_molecules)
# draw the density estimate into the axes created above (named explicitly
# instead of relying on them being the current axes)
axhist = s.plot.kde(bw_method=0.07, ax=axhist)
axhist.set_xlabel('Error per molecule [Ha]')
axhist.set_ylabel('Frequency')

Text(0, 0.5, 'Frequency')

# Mean distance of validation molecules to training molecules

In [27]:
# For every validation atom (entries of local_idc[1]): the mean L2 distance
# between its representation and all training representations rep[0]
mean_d = np.array(
    [
        qml.distance.l2_distance(rep[0], np.array([local_reps[i]])).mean()
        for i in local_idc[1]
    ],
    dtype=float,
)

In [28]:
# Display the mean distances (one value per entry of local_idc[1])
mean_d

array([147.52750347, 144.95658528, 147.52750837, ..., 126.37090842,
       130.09914829, 130.48050634])

In [29]:
# average distance of validation molecules to training molecules

# molecule size of molecules in test set
molecule_size_test = molecule_size[global_idc[1]]

# mean_d is consumed in consecutive chunks, one chunk of `size` entries per test
# molecule; each chunk is averaged to give one value per molecule
chunk_stops = np.cumsum(molecule_size_test)
chunk_starts = chunk_stops - molecule_size_test
mean_d_molecule = np.array(
    [
        mean_d[lo:hi].sum() / size
        for lo, hi, size in zip(chunk_starts, chunk_stops, molecule_size_test)
    ],
    dtype=float,
)

In [60]:
# Error divided by molecule size, plotted against the molecule size (test set)
plt.plot(molecule_size_test, error_molecules/molecule_size_test, 'o')

[<matplotlib.lines.Line2D at 0x7f3d8423f2e8>]

In [31]:

# Atomic Coulomb-matrix representations of the molecule with global index 285
# (NOTE(review): the variable is called local_reps_89 although index 285 is used)
mol_285 = alchemy_data[285]
local_reps_89 = qml.representations.generate_atomic_coulomb_matrix(
    mol_285[:, 0],
    mol_285[:, [1, 2, 3]],
    size=np.amax(molecule_size),
    sorting='distance',
)

# average distance of validation to training representations
qml.distance.l2_distance(rep[0], np.array(local_reps_89)).mean()

143.1042598019563

In [32]:
# Rebuild the representations of the molecule with global index 285 and show
# the shape of the result
mol_285 = alchemy_data[285]
local_reps_89 = qml.representations.generate_atomic_coulomb_matrix(
    mol_285[:, 0],
    mol_285[:, [1, 2, 3]],
    size=np.amax(molecule_size),
    sorting='distance',
)

local_reps_89.shape

(11, 210)

# select similar molecules with different error

In [33]:
# select the molecules whose mean distance lies around 143, i.e. in (142.3, 143.5)
near_143 = np.where((mean_d_molecule < 143.5) & (mean_d_molecule > 142.3))
sel_y = mean_d_molecule[near_143]  # their mean distances
sel_x = trapz[near_143]            # their errors

In [34]:
# Test-set positions of the molecules whose mean distance lies in (142.3, 143.5)
np.where( (mean_d_molecule<143.5) & (mean_d_molecule>142.3) )

(array([  0,  27,  72,  79,  84, 130, 206, 229, 424, 429]),)

In [35]:
# Errors (entries of trapz) of the molecules inside the distance window
sel_x

array([3.70468391, 0.86805304, 1.15594204, 1.07574091, 1.4992409 ,
       0.95394156, 0.69862475, 0.05434429, 2.33831833, 1.88909508])

In [36]:
# Hand-picked test-set positions, four of the molecules found in the distance
# window around 143
idx=[0, 72, 229, 424]
# errors of the picked molecules
trapz[idx]

array([3.70468391, 1.15594204, 0.05434429, 2.33831833])

In [101]:
# select 3 molecules around distance = 128: one with a very small and two with a
# large error (the thresholds are applied to error_molecules as it is, i.e. not
# divided by the molecule size)
in_window_128 = (mean_d_molecule<129.5) & (mean_d_molecule>128)
large_error = np.where( in_window_128 & (error_molecules>2) )
small_error = np.where( in_window_128 & (error_molecules<0.01) )
# The former bare np.array([large_error[0], small_error[0]]) was removed: its
# result was discarded, and building an array from index lists of different
# length raises a ValueError on recent NumPy versions.
# Raises IndexError if fewer than one small-error or two large-error molecules match.
idx_128= np.array([small_error[0][0], large_error[0][0], large_error[0][1]])

In [102]:
# Scatter plot: error divided by molecule size (x) against the mean distance of
# the test molecule to the training data (y). The hand-picked molecules (idx,
# idx_128) are drawn again on top so that they appear in a different colour.
fig, ax = plt.subplots(1,1)
ax.plot(trapz/molecule_size_test,mean_d_molecule, 'o')
ax.plot(trapz[idx]/molecule_size_test[idx],mean_d_molecule[idx], 'o')
ax.plot(trapz[idx_128]/molecule_size_test[idx_128],mean_d_molecule[idx_128], 'o')

# ax.plot(cspline[idx], mean_d_molecule[idx], 'o', label='cspline integration')
# horizontal guide line at distance 143, drawn for x in [0, 0.5]
n_points = 20
h_high = np.full(n_points, 143.0)
# h_low is only needed by the commented-out plot call below
h_low = np.full(n_points, 1300.0)
ax.plot(np.linspace(0, 0.5, n_points), h_high, color='darkorange')
# ax.plot(np.linspace(-0,5, 20), h_low)
# The x values are errors divided by the molecule size, so the axis is labelled
# per atom (it used to read 'Error per molecule [Ha]').
ax.set_xlabel('Error per atom [Ha]')
ax.set_ylabel('Mean distance of test molecule to the training molecules')

Text(0, 0.5, 'Mean distance of test molecule to the training molecules')

In [182]:
# global index of molecules
# (looks up the test-set positions stored in idx in the array of test indices)
global_idc[1][idx]

array([  0,  89, 285, 515])

In [188]:
# Hand-picked test-set positions of the molecules to compare
idx=[0, 72, 229, 424]
# errors of the picked molecules
trapz[idx]

array([3.70468391, 1.15594204, 0.05434429, 2.33831833])

In [61]:
# test if selected molecules have indeed same distance but different errors
# (every quantity is recomputed from scratch for each molecule)
max_size = np.amax(molecule_size)
# global indices of the molecules to check
idx_selected = [285, 89, 515, 0]
for i in idx_selected:
    molecule = alchemy_data[i]
    # make rep: column 0 and columns 1-3 are passed as the first two arguments
    rep_selected = qml.representations.generate_atomic_coulomb_matrix(molecule[:,0], molecule[:,[1,2,3]], size=max_size, sorting='distance')
    # error per molecule: |sum of column 6 - sum of the predicted atomic energies|
    en_selected = qi.predict_labels(rep_selected, rep[0], best_sigma, coeffs).sum()
    error = np.abs(molecule[:,6].sum()-en_selected)
    # error normalized by molecule size
    error_norml = error/molecule_size[i]
    # mean distance to training data
    mean_dist = qml.distance.l2_distance(rep_selected, rep[0]).mean()

    # the compound name is the directory that contains the result file
    # (the label used to be misspelled 'Comopund')
    print('Compound {}'.format(paths[i].split('/')[-2]))
    print('Error = {}'.format(error))
    print('Error per atom = {}'.format(error_norml))
    print('Mean distance = {}'.format(mean_dist))
    print('\n')

Comopund dsgdb9nsd_002967
Error = 0.0543442929389677
Error per atom = 0.004940390267178882
Mean distance = 143.1042598019563


Comopund dsgdb9nsd_001212
Error = 1.155942037699334
Error per atom = 0.09632850314161116
Mean distance = 142.83199543243526


Comopund dsgdb9nsd_003886
Error = 2.3383183313433378
Error per atom = 0.23383183313433378
Mean distance = 142.41730418263174


Comopund dsgdb9nsd_000227
Error = 3.7046839098185984
Error per atom = 0.2315427443636624
Mean distance = 143.44335817649247




In [104]:
# molecule 128: same report for the molecules selected around distance 128

# translate the test-set positions in idx_128 into global molecule indices
idx_selected = global_idc[1][idx_128]
for i in idx_selected:
    molecule = alchemy_data[i]
    # make rep (max_size has to be defined already when this cell runs)
    rep_selected = qml.representations.generate_atomic_coulomb_matrix(molecule[:,0], molecule[:,[1,2,3]], size=max_size, sorting='distance')
    # error per molecule: |sum of column 6 - sum of the predicted atomic energies|
    en_selected = qi.predict_labels(rep_selected, rep[0], best_sigma, coeffs).sum()
    error = np.abs(molecule[:,6].sum()-en_selected)
    # error normalized by molecule size
    error_norml = error/molecule_size[i]
    # mean distance to training data
    mean_dist = qml.distance.l2_distance(rep_selected, rep[0]).mean()

    # the compound name is the directory that contains the result file
    # (the label used to be misspelled 'Comopund')
    print('Compound {}'.format(paths[i].split('/')[-2]))
    print('Error = {}'.format(error))
    print('Error per atom = {}'.format(error_norml))
    print('Mean distance = {}'.format(mean_dist))
    print('\n')

Comopund dsgdb9nsd_003335
Error = 0.007379768052611091
Error per atom = 0.0004919845368407394
Mean distance = 128.67216927871743


Comopund dsgdb9nsd_009030
Error = 2.451976110556201
Error per atom = 0.24519761105562007
Mean distance = 129.00704793136768


Comopund dsgdb9nsd_021325
Error = 4.078284198416049
Error per atom = 0.4078284198416049
Mean distance = 128.60324895674344




# Error vs mean distance from training data

In [67]:
# Error of every test molecule (y) against its mean distance to the training data (x)
fig, ax = plt.subplots()
ax.plot(mean_d_molecule, trapz, 'o')
ax.set_ylabel('Error per molecule [Ha]')
ax.set_xlabel('Mean distance of test molecule to the training molecules')

Text(0.5, 0, 'Mean distance of test molecule to the training molecules')

In [43]:
# Difference between the two per-molecule error vectors, plotted against the
# mean distance of each test molecule to the training data
err = trapz-cspline
# NOTE(review): x is not used by the plot below
x=np.arange(len(err))
plt.plot(mean_d_molecule, err, 'o')

[<matplotlib.lines.Line2D at 0x7f7aa5dfe240>]

# Molecules with biggest Error

In [12]:
# molecules with biggest error: global indices of all test molecules whose
# error exceeds 3.0
above_threshold = np.where(error_molecules > 3.0)
idx_largest_error = global_idc[1][above_threshold]

# print the global index and the result file of each of them
for i in idx_largest_error:
    print(i, paths[i])

0 /home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/dsgdb9nsd_000227/atomic_energies_cspline.txt
2 /home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/dsgdb9nsd_000272/atomic_energies_cspline.txt
209 /home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/dsgdb9nsd_002308/atomic_energies_cspline.txt
556 /home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/dsgdb9nsd_021325/atomic_energies_cspline.txt


# Error distribution per atom

In [13]:
# histogram of the values in `errors` with a kernel density estimate on top
import pandas as pd
import matplotlib
matplotlib.use('Qt5Agg')
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 22})

# bin width is a fifth of the standard deviation; the histogram uses twenty
# times as many bins as bins of that width fit into the data range
binwidth = errors.std() / 5
total_width = np.amax(errors) - np.amin(errors)
n_bins = 20 * int(total_width / binwidth)

fhist, axhist = plt.subplots(1, 1)
axhist.hist(errors, bins=n_bins, density=True)
s = pd.Series(errors)
# draw the density estimate into the axes created above (named explicitly
# instead of relying on them being the current axes)
axhist = s.plot.kde(bw_method=0.07, ax=axhist)
axhist.set_xlabel('Error per atom [Ha]')
axhist.set_ylabel('Frequency')

Text(0, 0.5, 'Frequency')