In [None]:
import qml
import numpy as np
import itertools
import sys
sys.path.insert(0, '/home/misa/APDFT/prototyping/atomic_energies/')
from parse_density_files import CUBE
import alchemy_tools as alch
import qml_interface as qi
import matplotlib
matplotlib.use('Qt5Agg')
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 22})

In [None]:
# Import data: collect the result paths, then load the per-molecule arrays and the molecule sizes
results_dir = '/home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/finished_abs'
paths = qi.wrapper_alch_data(path=results_dir)
alchemy_data, molecule_size = qi.load_alchemy_data(paths)


In [None]:
# Fixed train/test split: read the stored global indices back from disk as integers.
# The test indices were written with:
# np.savetxt('/home/misa/APDFT/prototyping/atomic_energies/analysis/global_idc_test_analyse_errors.txt', global_idc[1])
tr_idc_file = '/home/misa/APDFT/prototyping/atomic_energies/analysis/global_idc_tr_analyse_errors.txt'
test_idc_file = '/home/misa/APDFT/prototyping/atomic_energies/analysis/global_idc_test_analyse_errors.txt'
g_tr, g_test = (np.loadtxt(idc_file).astype(int) for idc_file in (tr_idc_file, test_idc_file))

In [None]:
# Select training and validation set.
# The split is fixed to the indices loaded from disk (g_tr, g_test); a fresh split could be
# drawn with qi.get_indices(len(alchemy_data), training_set_size) instead.
training_set_size = 100
global_idc = g_tr, g_test
# per-atom (local) indices belonging to the training / test molecules
local_idc = tuple(qi.get_local_idx(mol_idc, molecule_size) for mol_idc in global_idc)

# build representations and labels for every atom of every molecule
local_reps = qi.generate_atomic_representations(alchemy_data, molecule_size)
local_labels = qi.generate_label_vector(alchemy_data, molecule_size.sum())

# (training, test) pairs of representations and labels
rep = local_reps[local_idc[0]], local_reps[local_idc[1]]
labels = local_labels[local_idc[0]], local_labels[local_idc[1]]

# grid of hyperparameters scanned by the optimisation
sigmas = np.logspace(-1, 4, 12)
lams = np.logspace(-15, 0, 16)
opt_data, coeffs, errors = qi.optimize_hypar(rep, labels, sigmas, lams)

# Row of opt_data with the smallest value in column 2; columns 0 and 1 of that row are
# taken as sigma and lambda. argmin returns the first minimum, like the former
# np.where(... == np.amin(...))[0] lookup, but scans the column only once.
best_row = opt_data[np.argmin(opt_data[:, 2])]
best_sigma, best_lambda = best_row[0], best_row[1]

In [None]:
# Mean of the errors returned by the hyperparameter optimisation. It is printed explicitly
# because a notebook cell only displays its last expression, so a bare `errors.mean()`
# in front of `opt_data` would never be shown.
print(errors.mean())
# full optimisation table, displayed as the cell output
opt_data

# The error in atomisation energy per molecule

In [None]:
# predicted atomic energies for the test representations, using the training representations and coefficients
atomic_energies = qi.predict_labels(rep[1], rep[0], best_sigma, coeffs)

# true atomisation energies: column 6 summed over the atoms of every test molecule
ref_atomisation_en = np.array([alchemy_data[i][:, 6].sum() for i in global_idc[1]], dtype=float)

# molecule size of molecules in test set
molecule_size_test = molecule_size[global_idc[1]]

# error per molecule
error_molecules = qi.calculate_error_atomisation_energy(atomic_energies, molecule_size_test, labels[1])
# disabled alternative using the summed reference energies:
# error_molecules_cspline = qi.calculate_error_atomisation_energy(atomic_energies, molecule_size_test, ref_atomisation_en)

error_molecules.mean()

In [None]:
# NOTE(review): bare literal; presumably a mean error noted down from an earlier run — confirm or remove
0.4509086224586748

In [None]:
# Path of the test molecule with the largest error.
# error_molecules is ordered by position in the test set, whereas paths is indexed by the
# global molecule index, so the position has to be translated through global_idc[1] first
# (the same mapping the "molecules with biggest error" cell uses).
worst_test_pos = np.argmax(error_molecules)
paths[global_idc[1][worst_test_pos]]
# np.max(error_molecules_cspline)

In [None]:
# test calculate_error_atomisation_energy

# predict the energies for one molecule ('/home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/dsgdb9nsd_000275/atomic_energies.txt')
# ae = qi.predict_labels(local_reps[qi.get_local_idx([global_idc[1][2]], molecule_size)], rep[0], best_sigma, coeffs)
# # subtract the sum of the predicted energies from the sum of the correct atomic energies
# np.abs(ae.sum()-alchemy_data[4][:,6].sum())
# error_molecules
# Compare this value to the third element of error_molecules: for this random subset both values
# are the same, indicating that the scheme works correctly.

# Error Integration

In [None]:
# per-molecule errors of the current run under the name used in the integration comparison
# (this is an alias of error_molecules, not a copy)
trapz= error_molecules

In [None]:
# NOTE(review): error_molecules_cspline is not assigned anywhere in the visible notebook (its only
# assignment is commented out in the error-per-molecule cell), so this cell raises a NameError on a
# fresh kernel — presumably it was produced by an earlier run on differently integrated data; confirm.
cspline = error_molecules_cspline.copy()

In [None]:
# mean error per molecule: cspline first, then trapz
for error_set in (cspline, trapz):
    print(error_set.mean())

In [None]:
# difference between the two error sets for every test molecule, with a zero reference line
molecule_ids = np.arange(len(trapz))
error_difference = trapz - cspline
plt.plot(molecule_ids, error_difference, 'o')
plt.plot(molecule_ids, np.zeros(len(trapz)), color='black')
plt.xlabel('Molecule ID')
plt.ylabel(r'Error(trapz) - Error(cspline)')


# Error distribution per molecule

In [None]:
# histogram of the per-molecule errors with a kernel density estimate drawn on top
import pandas as pd
import matplotlib
matplotlib.use('Qt5Agg')
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 22})

# bin width: a fifth of the standard deviation; total width: full range of the errors
binwidth = error_molecules.std() / 5
total_width = np.amax(error_molecules) - np.amin(error_molecules)
n_bins = 2 * int(total_width / binwidth)

fhist, axhist = plt.subplots(1, 1)
axhist.hist(error_molecules, bins=n_bins, edgecolor='black', density=True)

# the KDE is drawn by pandas on the current axes, which are returned and labelled
s = pd.Series(error_molecules)
axhist = s.plot.kde(bw_method=0.07)
axhist.set_xlabel('Error per molecule [Ha]')
axhist.set_ylabel('Frequency')

# Mean distance of validation molecules to training molecules

In [None]:
# average l2 distance of every validation representation to all training representations
mean_d = np.array(
    [qml.distance.l2_distance(rep[0], np.array([local_reps[i]])).mean() for i in local_idc[1]],
    dtype=float,
)

In [None]:
# display the mean distances (one value per validation representation)
mean_d

In [None]:
# average distance of validation molecules to training molecules

# molecule size of molecules in test set
molecule_size_test = molecule_size[global_idc[1]]

# mean_d is a flat per-atom vector: consecutive slices of length `size` belong to one molecule,
# and each slice is averaged
block_ends = np.cumsum(molecule_size_test)
block_starts = block_ends - molecule_size_test
mean_d_molecule = np.array(
    [mean_d[first:last].sum() / size
     for first, last, size in zip(block_starts, block_ends, molecule_size_test)],
    dtype=float,
)

In [None]:
# error per atom (molecular error divided by the molecule size) against the molecule size
error_per_atom = error_molecules / molecule_size_test
plt.plot(molecule_size_test, error_per_atom, 'o')

In [None]:

# atomic Coulomb matrices of the molecule stored at index 285
# NOTE(review): the variable is called local_reps_89 although index 285 is used — confirm which molecule is meant
molecule_285 = alchemy_data[285]
local_reps_89 = qml.representations.generate_atomic_coulomb_matrix(
    molecule_285[:, 0], molecule_285[:, [1, 2, 3]],
    size=np.amax(molecule_size), sorting='distance')

# average distance of these representations to the training representations
qml.distance.l2_distance(rep[0], np.array(local_reps_89)).mean()

In [None]:
# rebuild the atomic Coulomb matrices of the molecule at index 285 and inspect their shape
molecule_285 = alchemy_data[285]
local_reps_89 = qml.representations.generate_atomic_coulomb_matrix(
    molecule_285[:, 0], molecule_285[:, [1, 2, 3]],
    size=np.amax(molecule_size), sorting='distance')

local_reps_89.shape

# Select similar molecules with different errors

In [None]:
# select four molecules around distance = 143 (strictly between 142.3 and 143.5)
near_143 = np.where((mean_d_molecule < 143.5) & (mean_d_molecule > 142.3))
sel_y = mean_d_molecule[near_143]
sel_x = trapz[near_143]

In [None]:
# positions (within the test set) of the molecules whose mean distance lies between 142.3 and 143.5
np.where( (mean_d_molecule<143.5) & (mean_d_molecule>142.3) )

In [None]:
# errors (trapz) of the molecules selected around distance 143
sel_x

In [None]:
# hard-coded positions within the test set; shows their errors
# NOTE(review): presumably copied from the np.where output for distance ~143 — confirm after any change of the split
idx=[0, 72, 229, 424]
trapz[idx]

In [None]:
# select 3 molecules around distance = 128: one with a small error and two with a large error
# NOTE(review): the original comment says "normalized error", but error_molecules is used undivided — confirm
near_128 = (mean_d_molecule < 129.5) & (mean_d_molecule > 128)
large_error = np.where(near_128 & (error_molecules > 2))
small_error = np.where(near_128 & (error_molecules < 0.01))
# The former bare expression np.array([large_error[0], small_error[0]]) was never displayed
# (it was not the last line) and raises on current NumPy when both index arrays differ in
# length (ragged array), so it has been removed.
# Needs at least one small-error and two large-error molecules, otherwise an IndexError is raised.
idx_128 = np.array([small_error[0][0], large_error[0][0], large_error[0][1]])

In [None]:
# mean distance to the training molecules against the error per molecule
fig, ax = plt.subplots(1, 1)
ax.plot(trapz, mean_d_molecule, 'o')
# disabled overlays of the selected molecules:
# ax.plot(trapz[idx]/molecule_size_test[idx],mean_d_molecule[idx], 'o')
# ax.plot(trapz[idx_128]/molecule_size_test[idx_128],mean_d_molecule[idx_128], 'o')
# ax.plot(cspline[idx], mean_d_molecule[idx], 'o', label='cspline integration')

# constant guide lines with 20 points each, at distance 143 and 1300
n_guide_points = len(np.linspace(-10, 10, 20))
h_high = np.full(n_guide_points, 143.0)
h_low = np.full(n_guide_points, 1300.0)
ax.plot(np.linspace(0, 0.5, 20), h_high, color='darkorange')
# ax.plot(np.linspace(-0,5, 20), h_low)

ax.set_xlabel('Error per molecule [Ha]')
ax.set_ylabel('Mean distance of test molecule to the training molecules')

In [None]:
# global index of molecules: translate the test-set positions in idx into indices into the full data set
global_idc[1][idx]

In [None]:
# same hard-coded test-set positions as used earlier in the notebook; shows their errors again
idx=[0, 72, 229, 424]
trapz[idx]

In [None]:
# test if selected molecules have indeed same distance but different errors
max_size = np.amax(molecule_size)
idx_selected = [285, 89, 515, 0]
for i in idx_selected:
    molecule = alchemy_data[i]
    # column 0 and columns 1-3 are the two inputs of the atomic Coulomb matrix
    charges, coordinates = molecule[:, 0], molecule[:, [1, 2, 3]]
    rep_selected = qml.representations.generate_atomic_coulomb_matrix(
        charges, coordinates, size=max_size, sorting='distance')

    # molecular prediction = sum of the predicted atomic values; reference = sum of column 6
    en_selected = qi.predict_labels(rep_selected, rep[0], best_sigma, coeffs).sum()
    error = np.abs(molecule[:, 6].sum() - en_selected)
    # error normalized by molecule size
    error_norml = error / molecule_size[i]
    # mean distance to training data
    mean_dist = qml.distance.l2_distance(rep_selected, rep[0]).mean()

    report_lines = (
        'Comopund {}'.format(paths[i].split('/')[-2]),
        'Error = {}'.format(error),
        'Error per atom = {}'.format(error_norml),
        'Mean distance = {}'.format(mean_dist),
        '\n',
    )
    for report_line in report_lines:
        print(report_line)

In [None]:
# molecule 128: same report for the molecules selected around distance 128

idx_selected = global_idc[1][idx_128]  # test-set positions -> global molecule indices
for i in idx_selected:
    molecule = alchemy_data[i]
    # make rep from column 0 and columns 1-3
    rep_selected = qml.representations.generate_atomic_coulomb_matrix(
        molecule[:, 0], molecule[:, [1, 2, 3]], size=max_size, sorting='distance')

    # error per molecule: summed prediction against summed column 6
    predicted_atomic = qi.predict_labels(rep_selected, rep[0], best_sigma, coeffs)
    en_selected = predicted_atomic.sum()
    reference_energy = molecule[:, 6].sum()
    error = np.abs(reference_energy - en_selected)
    # error normalized by molecule size
    error_norml = error / molecule_size[i]
    # mean distance to training data
    mean_dist = qml.distance.l2_distance(rep_selected, rep[0]).mean()

    compound_name = paths[i].split('/')[-2]
    print('Comopund {}'.format(compound_name))
    print('Error = {}'.format(error))
    print('Error per atom = {}'.format(error_norml))
    print('Mean distance = {}'.format(mean_dist))
    print('\n')

# Error vs mean distance from training data

In [None]:
# error per molecule against the mean distance of the molecule to the training data
fig, ax = plt.subplots(1, 1)
ax.plot(mean_d_molecule, trapz, 'o')
ax.set_xlabel('Mean distance of test molecule to the training molecules')
ax.set_ylabel('Error per molecule [Ha]')

In [None]:
# difference between the trapz and cspline errors against the mean distance to the training data
err = trapz - cspline
x = np.arange(err.shape[0])
plt.plot(mean_d_molecule, err, 'o')

# Molecules with biggest Error

In [None]:
# molecules with biggest error (error above 3.0)
above_threshold = np.where(error_molecules > 3.0)
# test-set positions -> global molecule indices
idx_largest_error = global_idc[1][above_threshold]

# print the global index together with the path of each of these molecules
for i in idx_largest_error:
    print(i, paths[i])

# Error distribution atomic energies

In [None]:
import qml
import numpy as np
import itertools
import sys
sys.path.insert(0, '/home/misa/APDFT/prototyping/atomic_energies/')
from parse_density_files import CUBE
import alchemy_tools as alch
import qml_interface as qi
import matplotlib
matplotlib.use('Qt5Agg')
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 22})

In [None]:
# Import data
results_dir = '/home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/finished_abs'
paths = qi.wrapper_alch_data(path=results_dir)
alchemy_data, molecule_size = qi.load_alchemy_data(paths)

# local data: one representation and one 'atomisation' label per atom
n_atoms_total = molecule_size.sum()
local_reps = qi.generate_atomic_representations(alchemy_data, molecule_size)
local_labels = qi.generate_label_vector(alchemy_data, n_atoms_total, value='atomisation')

# optimized hyperparameters (hard-coded); the disabled code below shows how they were read from file
# hypar = '/home/misa/APDFT/prototyping/atomic_energies/results/analyse_learning/optimized_hyperparameters.txt'
# with open(hypar, 'r') as f:
#     lines = f.readlines()
# sigma = float(lines[3].split('\t')[0]); lam = float(lines[3].split('\t')[1])
# sigma_h = float(lines[4].split('\t')[0])
# lam_h = float(lines[4].split('\t')[1])
sigma = 432.8761281083057
lam = 1e-7

In [None]:
# Repeated random train/validation splits: collect the signed atomic errors and, in the same
# order, the nuclear charges (column 0) of the validation atoms.
errors, nuc_charges = [], []
num_cross = 30
tr_set_size = 512

for i in range(num_cross):
    # split data into training and validation set (molecule indices, then per-atom indices)
    global_idc_tr, global_idc_val = qi.get_indices(len(molecule_size), tr_set_size)
    local_idc_tr = qi.get_local_idx(global_idc_tr, molecule_size)
    local_idc_val = qi.get_local_idx(global_idc_val, molecule_size)

    # remember the nuclear charge of every validation atom
    nuc_charges.extend(charge for idx in global_idc_val for charge in alchemy_data[idx][:, 0])

    # select the representations and the labels
    rep_tr, labels_tr = local_reps[local_idc_tr], local_labels[local_idc_tr]
    rep_test, labels_test = local_reps[local_idc_val], local_labels[local_idc_val]

    # train and predict
    coeffs = qi.train_kernel(rep_tr, labels_tr, sigma, lam)
    labels_pred = qi.predict_labels(rep_test, rep_tr, sigma, coeffs)

    # signed error: reference minus prediction
    errors.extend(labels_test - labels_pred)

In [None]:
# shape of the collected cross-validation errors
np.array(errors).shape

# Crossvalidation only Hydrogen

In [None]:
# # learn only hydrogen energies from hydrogen
# h_errors = []

# num_cv = 10
# tr_set_size = 500

# sigmas = np.logspace(-1, 4, 12).tolist()
# lams = np.logspace(-15, 0, 16).tolist()
# opt_data = np.zeros((num_cv, len(sigmas)*len(lams), 3))

# for i in range(0, num_cv):
#     # select random data points
#     # split data into training and validation set
#     global_idc_tr, global_idc_val = qi.get_indices(len(molecule_size), tr_set_size)
#     local_idc_tr, local_idc_val = qi.get_local_idx(global_idc_tr, molecule_size), qi.get_local_idx(global_idc_val, molecule_size)
    
#     # get indices of hydrogen in training and test set
#     nuc_charges_test = []
#     for idx in global_idc_val:
#         nuc_charges_test.extend(alchemy_data[idx][:,0])
#     nuc_charges_test = np.array(nuc_charges_test).astype(int)
    
#     nuc_charges_tr = []
#     for idx in global_idc_tr:
#         nuc_charges_tr.extend(alchemy_data[idx][:,0])
#     nuc_charges_tr = np.array(nuc_charges_tr).astype(int)
    
    
#     rep_tr, rep_test = local_reps[local_idc_tr], local_reps[local_idc_val] # select the representations
#     labels_tr, labels_test = local_labels[local_idc_tr], local_labels[local_idc_val] # select the labels
    
#     rep_tr_h, rep_test_h = rep_tr[np.where(nuc_charges_tr==1)], rep_test[np.where(nuc_charges_test==1)]
#     labels_tr_h, labels_test_h = labels_tr[np.where(nuc_charges_tr==1)], labels_test[np.where(nuc_charges_test==1)]
    
#     # optimize hyperparamters
#     results = qi.optimize_hypar((rep_tr_h, rep_test_h), (labels_tr_h, labels_test_h), sigmas, lams)
#     opt_data[i] = results[0]

In [None]:
# # find set of hyperparameters with minimum mean error
# mean_errors = opt_data.mean(axis=0)[:,2] # mean error for every set of hyper-paramters
# std = opt_data.std(axis=0)[:,2]
# min_error = np.amin(mean_errors) # minimum mean error
# idx_opt = np.where(mean_errors==min_error) # idx of set of hyperparameters with lowest mean error
# opt_sigma = opt_data[0][idx_opt][0,0] # sigma value for minimum error
# opt_lambda = opt_data[0][idx_opt][0,1] # lambda value for minimum error

In [None]:
# learn only hydrogen energies from hydrogen

# sigma_h and lam_h are otherwise only defined in commented-out code of this notebook, so this
# cell would fail with a NameError on a fresh kernel. They are read here exactly the way that
# commented-out code does: tab-separated values on line index 4 of the hyperparameter file.
# NOTE(review): confirm that the file still exists and keeps this layout.
hypar = '/home/misa/APDFT/prototyping/atomic_energies/results/analyse_learning/optimized_hyperparameters.txt'
with open(hypar, 'r') as f:
    lines = f.readlines()
sigma_h = float(lines[4].split('\t')[0])
lam_h = float(lines[4].split('\t')[1])

h_errors = []

num_cv = 10
tr_set_size = 500

for i in range(0, num_cv):
    # select random data points
    # split data into training and validation set
    global_idc_tr, global_idc_val = qi.get_indices(len(molecule_size), tr_set_size)
    local_idc_tr, local_idc_val = qi.get_local_idx(global_idc_tr, molecule_size), qi.get_local_idx(global_idc_val, molecule_size)

    # nuclear charges (column 0) of all atoms in the test and in the training molecules
    nuc_charges_test = np.array(
        [charge for idx in global_idc_val for charge in alchemy_data[idx][:, 0]]).astype(int)
    nuc_charges_tr = np.array(
        [charge for idx in global_idc_tr for charge in alchemy_data[idx][:, 0]]).astype(int)

    rep_tr, rep_test = local_reps[local_idc_tr], local_reps[local_idc_val] # select the representations
    labels_tr, labels_test = local_labels[local_idc_tr], local_labels[local_idc_val] # select the labels

    # keep only the hydrogen atoms (nuclear charge 1); each mask is computed once and reused
    is_h_tr = nuc_charges_tr == 1
    is_h_test = nuc_charges_test == 1
    rep_tr_h, rep_test_h = rep_tr[is_h_tr], rep_test[is_h_test]
    labels_tr_h, labels_test_h = labels_tr[is_h_tr], labels_test[is_h_test]

    # train and predict
    coeffs = qi.train_kernel(rep_tr_h, labels_tr_h, sigma_h, lam_h)
    labels_pred = qi.predict_labels(rep_test_h, rep_tr_h, sigma_h, coeffs)
    # calculate errors (reference minus prediction)
    h_errors.extend(labels_test_h - labels_pred)

In [None]:
# convert the collected hydrogen errors into a numpy array
h_errors = np.array(h_errors)

In [None]:
# array versions of the cross-validation results: signed atomic errors and the
# nuclear charges (as integers) of the corresponding validation atoms
atomic_errors = np.array(errors)
nuc_charges = np.array(nuc_charges).astype(int)

In [None]:
# histogram of the atomic errors: all atoms combined, then one histogram per element
import pandas as pd
import scipy as sp
import scipy.stats  # imported explicitly: `import scipy` alone does not guarantee that sp.stats (used by later cells) is loaded
import matplotlib
matplotlib.use('Qt5Agg')
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 22})

error_std = atomic_errors.std()
total_width = np.amax(atomic_errors) - np.amin(atomic_errors)

# complete graph: bin width is 1/20 of the standard deviation
binwidth = error_std / 20
fhist, axhist = plt.subplots(1, 1)
axhist.hist(atomic_errors, bins=int(total_width / binwidth), density=False, label='combined error')
axhist.set_xlabel('Error per atom [Ha]')
axhist.set_ylabel('Frequency')
axhist.set_xlim(-0.75, 0.75)
# axhist.set_ylim(0,20)

# split up by element
elements = sorted(set(nuc_charges))

# The per-element bin width (1/10 of the standard deviation of ALL errors) does not depend on
# the element, so it is computed once instead of in every iteration. binwidth and total_width
# keep these values afterwards; later cells read them.
binwidth = error_std / 10
n_bins_element = int(total_width / binwidth)
for el in elements:
    errors_element = atomic_errors[nuc_charges == el]
    axhist.hist(errors_element, bins=n_bins_element, density=False, label='Z = {}'.format(el))

axhist.legend()



In [None]:
# number of collected errors per element
# (nuc_charges is gathered from the validation molecules, although the message says "training")
for el in elements:
    n_points = len(atomic_errors[nuc_charges == el])
    print('Number training points Z = {}: {}'.format(el, n_points))

In [None]:
# Mean absolute error per element, followed by the hydrogen-only model
for el in elements:
    mean_abs_error = np.abs(atomic_errors[nuc_charges == el]).mean()
    print('Mean error Z = {} : {} '.format(el, mean_abs_error))

mean_abs_error_h = np.abs(h_errors).mean()
print('Mean error only hydrogen: {}'.format(mean_abs_error_h))

In [None]:
# only kde distribution function: all atoms first, then one curve per element on the same axes
fhist, axhist = plt.subplots(1, 1)
s = pd.Series(atomic_errors)
axhist = s.plot.kde(bw_method=0.1)
axhist.set_xlabel('Error per atom [Ha]')
axhist.set_ylabel('Frequency')
axhist.set_xlim(-0.15, 0.15)
axhist.set_ylim(0, 18)

# split up by element (pandas draws each curve on the current axes, default bandwidth)
for el in set(nuc_charges):
    s = pd.Series(atomic_errors[nuc_charges == el])
    axhist = s.plot.kde()

In [None]:
# export kdes: build the Gaussian KDE of all atomic errors and one KDE per element
import scipy as sc
import scipy.stats  # imported explicitly: `import scipy` alone does not guarantee that sc.stats is loaded

total_error_kde = sc.stats.gaussian_kde(pd.Series(atomic_errors))

# split up by element; kde_elements follows the ascending order of nuclear charges in list_c
list_c = sorted(set(nuc_charges))
kde_elements = [
    sc.stats.gaussian_kde(pd.Series(atomic_errors[nuc_charges == el]))
    for el in list_c
]


In [None]:
base_path = '/home/misa/APDFT/prototyping/atomic_energies/results/analyse_learning/error_distributions_atomic_atomisation_energies/kde_error_distribution_atomic_energy_'

# File suffix per nuclear charge. Each KDE is paired with the charge it was built for
# (kde_elements follows list_c), instead of relying on exactly H, C, N, O being present in
# that order. The output names go into kde_paths so that the global `paths` (the molecule
# result paths used by earlier cells) is no longer overwritten.
element_symbols = {1: 'H', 6: 'C', 7: 'N', 8: 'O'}
kde_grid = np.linspace(-1, 1, 1000)

kde_paths = []
for el, kde in zip(list_c, kde_elements):
    # elements without a known symbol get a 'Z<charge>' suffix instead of a wrong label
    suffix = element_symbols.get(int(el), 'Z{}'.format(int(el)))
    out_path = base_path + suffix + '.txt'
    kde_paths.append(out_path)
    # two tab-separated columns: grid point, KDE value
    np.savetxt(out_path, np.array([kde_grid, kde.evaluate(kde_grid)]).T, delimiter='\t')

In [None]:
# KDE of all atomic errors on 1000 points in [-1, 1]; two tab-separated columns (grid point, KDE value)
total_grid = np.linspace(-1, 1, 1000)
total_kde_table = np.array([total_grid, total_error_kde.evaluate(total_grid)]).T
np.savetxt(base_path+'total.txt', total_kde_table, delimiter='\t')

In [None]:
# mean of the signed atomic errors
atomic_errors.mean()

In [None]:
# cumulated error distribution function: all atoms combined and per element, each with a red
# cross where the curve reaches 0.5


def _normalized_cumfreq(sample, numbins):
    """Return the bin centres and the cumulative counts of `sample` scaled by their maximum."""
    out = sp.stats.cumfreq(sample, numbins)
    centres = np.arange(len(out.cumcount))*out.binsize+(out.lowerlimit+out.binsize/2)
    return centres, out.cumcount/np.amax(out.cumcount)


def _half_crossing(centres, normalized):
    """Return the x value where the curve reaches 0.5, by linear interpolation between the
    last bin below 0.5 and the following bin."""
    below = np.where(normalized<0.5)[0][-1]
    slope = (normalized[below+1]-normalized[below])/(centres[below+1]-centres[below])
    intercept = normalized[below]-slope*centres[below]
    return (0.5-intercept)/slope


# number of bins; total_width and binwidth come from the histogram cell
n_cum_bins = 20*int(total_width/binwidth)

atomic_errors_sorted = np.sort(atomic_errors)

fig_cum, ax_cum = plt.subplots(1, 1)
x, cdf_all = _normalized_cumfreq(atomic_errors_sorted, n_cum_bins)
ax_cum.plot(x, cdf_all, label='combined error')

# show center for ideal function: dashed guides meeting at (0.0, 0.5)
horizontal_x = np.linspace(np.amin(x), 0.0, 50)
horizontal_y = np.array([0.5]*len(horizontal_x))
ax_cum.plot(horizontal_x, horizontal_y, '--', color='grey')

vertical_y = np.linspace(0.0, 0.5, 50)
vertical_x = np.zeros(len(vertical_y))
ax_cum.plot(vertical_x, vertical_y, '--', color='grey')
ax_cum.scatter(0.0, 0.5, color='grey', marker='x')

# plt.xlim(-0.2,0.2)
# plt.ylim(0,1)

ax_cum.set_xlabel('Error (Ha)')
ax_cum.set_ylabel('CDF(Error)')

# show center of our function
ax_cum.scatter(_half_crossing(x, cdf_all), 0.5, color='red', marker='x')

# split up by element, with the same number of bins
elements = sorted(set(nuc_charges))
for el in elements:
    errors_element_sorted = np.sort(atomic_errors[nuc_charges == el])

    x, cdf_element = _normalized_cumfreq(errors_element_sorted, n_cum_bins)
    ax_cum.plot(x, cdf_element, label='Z = {}'.format(el))
    ax_cum.scatter(_half_crossing(x, cdf_element), 0.5, color='red', marker='x')

ax_cum.set_xlim(-0.2, 0.2)
ax_cum.set_ylim(0, 1.0)
ax_cum.legend()

In [None]:
# smallest and largest signed error of every element (one line each)
for el in set(nuc_charges):
    errors_of_element = atomic_errors[nuc_charges == el]
    print(el, np.amin(errors_of_element))
    print(el, np.amax(errors_of_element))

In [None]:
# Two figures of the atomic error distribution:
#   1. zoom in main peak: histogram plus KDE (bandwidth 0.07), x range +-0.1
#   2. only kde distribution function (bandwidth 0.1), x range +-0.3
figure_settings = (
    (True, 0.07, 0.1),
    (False, 0.1, 0.3),
)
for with_histogram, bandwidth, x_half_range in figure_settings:
    fhist, axhist = plt.subplots(1, 1)
    if with_histogram:
        # total_width and binwidth come from the histogram cell
        axhist.hist(atomic_errors, bins=20*int(total_width/binwidth), density=True)
    s = pd.Series(atomic_errors)
    axhist = s.plot.kde(bw_method=bandwidth)
    axhist.set_xlabel('Error per atom [Ha]')
    axhist.set_xlim(-x_half_range, x_half_range)
    axhist.set_ylabel('Frequency')
    axhist.set_ylim(0, 20)