In [1]:
import qml
import numpy as np

import sys
sys.path.insert(0, '/home/misa/APDFT/prototyping/atomic_energies/')
import alchemy_tools as alch
import qml_interface as qi

In [32]:
def optimize_hypar_cv(reps, labels, tr_set_size, molecule_size, num_cv=10):
    """
    grid search for sigma and lambda, averaged over num_cv random training/validation splits

    In every round a new split is drawn with qi.get_indices, optimize_hypar is evaluated on the
    full (sigma, lambda) grid and the validation errors are stored. The errors are then averaged
    over the rounds for every grid point and the grid point with the lowest mean error is returned.

    reps: all representations, indexed with the local indices returned by qi.get_local_idx
    labels: all labels, indexed with the global (per molecule) indices returned by qi.get_indices
    tr_set_size: size of the training set (forwarded to qi.get_indices)
    molecule_size: number of atoms in each molecule; its length is used as the number of molecules
    num_cv: number of random splits over which the errors are averaged

    returns (opt_sigma, opt_lambda, min_error, std)
    opt_sigma, opt_lambda: grid point with the lowest mean validation error
    min_error: the lowest mean validation error
    std: standard deviation over the rounds of the validation error for EVERY grid point
         (array of length len(sigmas)*len(lams)), not only for the optimum

    Grid points whose mean error is NaN are ignored when the optimum is selected; numpy raises a
    ValueError if the mean error is NaN for all grid points.
    """

    sigmas = np.logspace(-1, 4, 12).tolist()
    lams = np.logspace(-15, 0, 16).tolist()

    # opt_data[round, grid point] = (sigma, lambda, mean validation error)
    opt_data = np.zeros((num_cv, len(sigmas)*len(lams), 3))

    for idx in range(num_cv):
        # global indices select molecules, local indices select the matching rows of reps
        global_idc_tr, global_idc_val = qi.get_indices(len(molecule_size), tr_set_size)
        local_idc_tr = qi.get_local_idx(global_idc_tr, molecule_size)
        local_idc_val = qi.get_local_idx(global_idc_val, molecule_size)
        reps_splitted = reps[local_idc_tr], reps[local_idc_val]
        labels_splitted = labels[global_idc_tr], labels[global_idc_val]
        m_tr = molecule_size[global_idc_tr]
        m_val = molecule_size[global_idc_val]

        # optimize hyperparameters via grid search, keep only the (sigma, lambda, error) table
        results = optimize_hypar(reps_splitted, labels_splitted, sigmas, lams, m_tr, m_val)
        opt_data[idx] = results[0]

    # statistics over the rounds for every set of hyperparameters
    mean_errors = opt_data.mean(axis=0)[:, 2]
    std = opt_data.std(axis=0)[:, 2]

    # nanargmin instead of amin + where(==): a single NaN error would otherwise turn the minimum
    # into NaN, the equality lookup would be empty and the indexing below would fail
    idx_opt = np.nanargmin(mean_errors)
    min_error = mean_errors[idx_opt]
    # the grid is identical in every round, so sigma and lambda can be read from the first one
    opt_sigma = opt_data[0, idx_opt, 0]
    opt_lambda = opt_data[0, idx_opt, 1]

    return(opt_sigma, opt_lambda, min_error, std)
        

def optimize_hypar(rep, labels, sigmas, lams, m_tr, m_val):
    """
    finds the combination of sigma and lambda that yields the minimum prediction error
    for a given training and validation set

    @in:
    rep: tuple containing representations (training set, validation set)
    labels: tuple containing labels (training set, validation set)
    sigmas: list of sigmas that will be tried during optimization
    lams: list of lambdas that will be tried during optimization
    m_tr: molecule sizes of the training set, passed to the local kernel function
    m_val: molecule sizes of the validation set, passed to the local kernel function

    @out:
    mean_errors: array with one row (sigma, lambda, corresponding mean error) for all sigma, lambda
                 combinations; the lambdas vary fastest
    opt_coeffs: coefficients for the sigma, lambda values which yield the lowest mean error
    opt_errors: errors for the sigma, lambda values which yield the lowest mean error

    If several combinations share the lowest error the last one wins. Combinations with a NaN
    error are never selected; if no combination can be selected, zero arrays are returned for
    opt_coeffs and opt_errors.
    """

    # representations for training and validation
    rep_tr, rep_val = rep
    labels_tr, labels_val = labels

    # store validation results; every row is written in the loops below
    mean_errors = np.empty( (len(sigmas)*len(lams), 3) )

    # optimum coefficients, errors (placeholders until the first valid grid point is seen)
    opt_coeffs = np.zeros( len(rep_tr) )
    opt_errors = np.zeros( len(rep_val) )
    best_error = np.inf

    # build kernels for all sigmas at once
    tr_kernels = qml.kernels.get_local_kernels_gaussian(rep_tr, rep_tr, m_tr, m_tr, sigmas)
    val_kernels = qml.kernels.get_local_kernels_gaussian(rep_val, rep_tr, m_val, m_tr, sigmas)

    row = 0
    for idx_s, sigma in enumerate(sigmas):
        tr_kernel = tr_kernels[idx_s]
        val_kernel = val_kernels[idx_s]
        # the identity only depends on the kernel size, so build it once per sigma
        identity = np.identity(len(tr_kernel))

        for lam in lams:
            reg_kernel = tr_kernel + identity*lam
            coeffs = qml.math.cho_solve(reg_kernel, labels_tr)

            # validation
            val_errors = np.abs( np.dot(val_kernel, coeffs) - labels_val )
            val_err_mean = val_errors.mean()

            # evaluate validation and store data
            mean_errors[row] = sigma, lam, val_err_mean

            # running minimum; "<=" keeps the last of several equal minima and a NaN error
            # compares False, so it neither gets selected nor blocks later grid points
            if val_err_mean <= best_error:
                best_error = val_err_mean
                opt_coeffs = coeffs
                opt_errors = val_errors

            row += 1

    return( mean_errors, opt_coeffs, opt_errors )

In [44]:
# Build the inputs with the qi helpers (their implementation is not part of this notebook).
paths = qi.wrapper_alch_data()
alchemy_data, molecule_size = qi.load_alchemy_data(paths)
local_reps = qi.generate_atomic_representations(alchemy_data, molecule_size)
global_reps = qi.wrapper_global_representations(alchemy_data, molecule_size, rep_par='coulomb')
global_labels = qi.generate_label_vector(alchemy_data, len(global_reps), value='atomisation_global')

# Hyperparameter search on local representations with global labels:
# training sets of 512 molecules, errors averaged over 10 random splits.
opt_sigma, opt_lambda, min_error, std = optimize_hypar_cv(
    reps=local_reps,
    labels=global_labels,
    tr_set_size=512,
    molecule_size=molecule_size,
    num_cv=10,
)

In [45]:
opt_sigma

151.99110829529332

In [46]:
opt_lambda

0.1

In [47]:
min_error

0.02500539758767072

In [78]:
std.mean()

0.00989119535496144

In [55]:
def crossvalidate_loc_glob(reps, labels, molecule_size, tr_set_size, sigma, lam_val, num_cross=10):
    """
    repeated random sub-sampling validation for local representations combined with global labels

    For each of the num_cross rounds a training/validation split is drawn with qi.get_indices,
    a model is trained on the training part with the given sigma and lam_val, and the mean
    absolute error on the validation part is recorded.

    reps: representations of all data, indexed with the local indices from qi.get_local_idx
    labels: labels of all data, indexed with the indices from qi.get_indices
    molecule_size: the number of atoms for every molecule
    tr_set_size: the size of the training set
    sigma: the kernel width
    lam_val: the regularizer added to the diagonal of the training kernel
    num_cross: the number of rounds

    returns: (mean, standard deviation) of the mean absolute validation errors of all rounds
    """

    def local_gaussian(rep_a, rep_b, size_a, size_b):
        # the kernel function takes a collection of widths; only the single requested one is used
        return qml.kernels.get_local_kernels_gaussian(rep_a, rep_b, size_a, size_b, np.array([sigma]))[0]

    round_errors = np.zeros(num_cross)

    for round_idx in range(num_cross):
        # draw the molecules of this round and translate them into rows of reps
        mol_tr, mol_val = qi.get_indices(len(molecule_size), tr_set_size)
        rows_tr = qi.get_local_idx(mol_tr, molecule_size)
        rows_val = qi.get_local_idx(mol_val, molecule_size)

        x_tr, x_val = reps[rows_tr], reps[rows_val]
        y_tr, y_val = labels[mol_tr], labels[mol_val]
        size_tr, size_val = molecule_size[mol_tr], molecule_size[mol_val]

        # training: solve (K + lam_val*I) coeffs = y_tr
        train_kernel = local_gaussian(x_tr, x_tr, size_tr, size_tr)
        regularized = train_kernel + np.identity(len(train_kernel))*lam_val
        coeffs = qml.math.cho_solve(regularized, y_tr)

        # prediction for the validation molecules and their mean absolute error
        predicted = np.dot(local_gaussian(x_val, x_tr, size_val, size_tr), coeffs)
        round_errors[round_idx] = np.abs(predicted - y_val).mean()

    return round_errors.mean(), round_errors.std()

In [49]:
# NOTE(review): same loading steps as in the hyperparameter-search cell; kept so that this
# cell can be run on its own.
paths = qi.wrapper_alch_data()
alchemy_data, molecule_size = qi.load_alchemy_data(paths)
local_reps = qi.generate_atomic_representations(alchemy_data, molecule_size)
global_reps = qi.wrapper_global_representations(alchemy_data, molecule_size, rep_par='coulomb')
global_labels = qi.generate_label_vector(alchemy_data, len(global_reps), value='atomisation_global')

# training-set sizes for the learning curve: powers of two from 2**9 down to 2**0
set_sizes = np.logspace(start=9, stop=0, num=10, base=2).astype(int)

In [79]:
# Learning curve: mean and standard deviation of the validation error for every training-set
# size, each estimated from 50 random splits.
set_sizes = np.logspace(9, 0, 10, base=2).astype(int)
error = []
# NOTE: rebinds `std`, which held the per-grid-point std returned by optimize_hypar_cv
std = []

for tr_size in set_sizes:
    # use the optimised regulariser (opt_lambda) rather than a hard-coded copy of its value,
    # so that sigma and lambda always come from the same hyperparameter search
    e, s_ = crossvalidate_loc_glob(local_reps, global_labels, molecule_size, tr_size, opt_sigma, opt_lambda, num_cross=50)
    error.append(e)
    std.append(s_)
error = np.array(error)
std = np.array(std)

In [80]:
error


array([0.02385669, 0.02669428, 0.03098223, 0.03720507, 0.04843995,
       0.05626024, 0.0662674 , 0.06920137, 0.07642515, 0.09640895])

In [81]:
std

array([0.00239671, 0.00092529, 0.00160509, 0.00285816, 0.00815748,
       0.00991736, 0.00952142, 0.01244045, 0.01435878, 0.03822651])

In [82]:
set_sizes

array([512, 256, 128,  64,  32,  16,   8,   4,   2,   1])

In [83]:
# One row per training-set size with the columns (set size, mean error, std). The row order is
# reversed before writing, i.e. the last entry of set_sizes comes first in the file.
big_array = np.column_stack((set_sizes, error, std))[::-1]
fname = '/home/misa/APDFT/prototyping/atomic_energies/results/analyse_learning/l_curve_local_rep_global_label.tab'
np.savetxt(fname, big_array, delimiter='\t')