In [None]:
import numpy as np
import qml
import sys
sys.path.insert(0, '/home/misa/git_repositories/APDFT/prototyping/atomic_energies/')
import qml_interface as qmi
import sklearn.model_selection as sk
import pickle

def crossvalidate(reps, labels, tr_size, sigma, lam, num_cv):
    """
    estimate the prediction error for one training set size from repeated random splits

    reps: representations of all samples
    labels: labels of all samples, same order as reps
    tr_size: training set size, forwarded as train_size to sklearn's train_test_split
    sigma: kernel width
    lam: regularizer
    num_cv: number of random train/test splits

    returns: (mean, standard deviation) over the splits of the mean absolute
             error on the respective test set
    """
    errors = []
    for _ in range(num_cv):
        reps_tr, reps_test, labels_tr, labels_test = sk.train_test_split(reps, labels, train_size=tr_size)
        # regularize with the `lam` argument, not with a notebook-level global
        coeffs = qmi.train_kernel(reps_tr, labels_tr, sigma, lam)
        labels_predicted = qmi.predict_labels(reps_test, reps_tr, sigma, coeffs)
        errors.append((np.abs(labels_predicted - labels_test)).mean())
    errors = np.array(errors)
    return(errors.mean(), errors.std())

def train_fchl_kernel(rep_tr, labels_tr, sigma, lam_val, molecule_size_selection):
    """
    fit regression coefficients on the atomic FCHL kernel of the training set

    rep_tr: training representations
    labels_tr: training labels
    sigma: kernel width
    lam_val: regularizer, added to the diagonal of the training kernel
    molecule_size_selection: forwarded to generate_fchl_atomic_kernel as its
                             molecule_size argument

    returns: coefficients as computed by qml.math.cho_solve
    """
    kernel_matrix = generate_fchl_atomic_kernel(rep_tr, molecule_size_selection, sigma)
    n_rows = len(kernel_matrix)
    # K + lambda * identity
    regularized = kernel_matrix + np.identity(n_rows) * lam_val
    return qml.math.cho_solve(regularized, labels_tr)

def generate_fchl_atomic_kernel(reps, molecule_size, sigma):
    """
    produces atomic kernel for all atoms in all molecules

    reps: list of representations for all molecules
    molecule_size: number of atoms of every molecule, same order as reps
    sigma: kernel width handed to qml's get_atomic_kernels

    returns: symmetric array of shape (total atom number, total atom number);
             the atoms of molecule i occupy the rows/columns starting at
             sum(molecule_size[:i])

    raises: ValueError if reps and molecule_size differ in length or if a pair
            kernel is smaller than the two molecules it has to cover (in both
            cases parts of the result would otherwise stay uninitialized)
    """
    sizes = np.asarray(molecule_size, dtype=int)
    if sizes.ndim != 1 or len(sizes) != len(reps):
        raise ValueError("molecule_size must be one-dimensional and contain one entry per representation")

    # offsets[i] is the first row/column that belongs to molecule i
    offsets = np.concatenate(([0], np.cumsum(sizes)))
    tot_atom_number = int(offsets[-1])

    atomic_kernel = np.empty((tot_atom_number, tot_atom_number))

    for idx1 in range(len(reps)):
        n_rows = int(sizes[idx1])
        rows = slice(int(offsets[idx1]), int(offsets[idx1 + 1]))

        # build atomic kernel elements with all representations > idx1-1,
        # the remaining blocks follow from symmetry
        for idx2 in range(idx1, len(reps)):
            n_cols = int(sizes[idx2])
            cols = slice(int(offsets[idx2]), int(offsets[idx2 + 1]))

            two_particle_kernel = qml.fchl.fchl_scalar_kernels.get_atomic_kernels(reps[idx1], reps[idx2], alchemy='off', kernel_args={"sigma":[sigma],})[0]
            if two_particle_kernel.shape[0] < n_rows or two_particle_kernel.shape[1] < n_cols:
                raise ValueError(f"kernel of molecules {idx1} and {idx2} has shape {two_particle_kernel.shape}, expected at least ({n_rows}, {n_cols})")

            # the pair kernel may be larger than the two molecules (presumably
            # because the representations are padded -- TODO confirm); only
            # its leading n_rows x n_cols entries are used
            block = two_particle_kernel[:n_rows, :n_cols]
            atomic_kernel[rows, cols] = block

            if idx1 != idx2:
                atomic_kernel[cols, rows] = block.T
            else:
                # diagonal block: mirror the lower triangle onto the upper one
                # so that the result is exactly symmetric
                diag_block = atomic_kernel[rows, cols]
                upper = np.triu_indices(n_rows, k=1)
                diag_block[upper] = diag_block.T[upper]
    return(atomic_kernel)

def predict_labels_fchl(rep_test, rep_tr, sigma, coeffs, molecule_size_selection, molecule_size_tr=None):
    """
    predict the labels for given coefficents and training, validation/test set

    rep_test: representations of the molecules whose atomic labels are predicted
    rep_tr: training representations
    sigma: kernel width
    coeffs: regression coefficients, one per training atom
    molecule_size_selection: number of atoms of every molecule in rep_test
    molecule_size_tr: number of atoms of every molecule in rep_tr

    returns: predicted labels, one per atom of the test molecules

    raises: ValueError if molecule_size_tr is not given; the kernel between
            test and training atoms cannot be laid out without the sizes of
            both sets

    NOTE(review): which set molecule_size_selection was meant to describe is not
    visible in this notebook; it is interpreted as the test set here -- confirm.
    """
    if molecule_size_tr is None:
        raise ValueError("molecule_size_tr (atoms per training molecule) is required to build the test/training kernel")
    kernel = _generate_fchl_cross_kernel(rep_test, rep_tr, molecule_size_selection, molecule_size_tr, sigma)
    prediction = np.dot(kernel, coeffs)
    return(prediction)


def _generate_fchl_cross_kernel(reps_rows, reps_cols, sizes_rows, sizes_cols, sigma):
    """
    atomic kernel between the atoms of two sets of molecules (rows x columns)
    """
    sizes_rows = np.asarray(sizes_rows, dtype=int)
    sizes_cols = np.asarray(sizes_cols, dtype=int)
    if sizes_rows.ndim != 1 or len(sizes_rows) != len(reps_rows):
        raise ValueError("need one molecule size per test representation")
    if sizes_cols.ndim != 1 or len(sizes_cols) != len(reps_cols):
        raise ValueError("need one molecule size per training representation")

    # first row/column of every molecule in the rectangular kernel
    row_offsets = np.concatenate(([0], np.cumsum(sizes_rows)))
    col_offsets = np.concatenate(([0], np.cumsum(sizes_cols)))
    kernel = np.empty((int(row_offsets[-1]), int(col_offsets[-1])))

    for idx1 in range(len(reps_rows)):
        n_rows = int(sizes_rows[idx1])
        rows = slice(int(row_offsets[idx1]), int(row_offsets[idx1 + 1]))
        for idx2 in range(len(reps_cols)):
            n_cols = int(sizes_cols[idx2])
            cols = slice(int(col_offsets[idx2]), int(col_offsets[idx2 + 1]))
            two_particle_kernel = qml.fchl.fchl_scalar_kernels.get_atomic_kernels(reps_rows[idx1], reps_cols[idx2], alchemy='off', kernel_args={"sigma":[sigma],})[0]
            if two_particle_kernel.shape[0] < n_rows or two_particle_kernel.shape[1] < n_cols:
                raise ValueError(f"kernel of test molecule {idx1} and training molecule {idx2} has shape {two_particle_kernel.shape}, expected at least ({n_rows}, {n_cols})")
            # only the leading n_rows x n_cols entries belong to real atoms
            # (presumably the rest is padding -- TODO confirm)
            kernel[rows, cols] = two_particle_kernel[:n_rows, :n_cols]
    return(kernel)

def get_molecule_size_selection():
    """
    placeholder, not implemented yet

    NOTE(review): only the function header existed (without colon and body),
    which made the whole cell fail with a SyntaxError. The intended behaviour is
    not visible in this notebook -- presumably it should return the number of
    atoms of the selected molecules; confirm and implement.
    """
    raise NotImplementedError("get_molecule_size_selection is not implemented yet")

def save_obj(obj, fname ):
    """
    pickle obj into the file fname (opened in binary write mode, existing
    content is overwritten) using the highest available pickle protocol
    """
    with open(fname, mode='wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# data preparation
# load the data set together with the per-molecule sizes returned by qml_interface
data, molecule_size = qmi.load_alchemy_data(qmi.wrapper_alch_data())
# label vector of the 'alch_pot' values; molecule_size.sum() is passed as the vector length
# (presumably one label per atom -- TODO confirm against qml_interface)
alch_pots = qmi.generate_label_vector(data, molecule_size.sum(), value='alch_pot')

# NOTE(review): the right-hand side of this assignment is missing, so this cell
# raises a SyntaxError. The FCHL representations still have to be generated here.
# The learning-curve cell reads `all_local_reps`, which is not defined in this
# cell either -- confirm which representation is intended.
all_global_fchl_reps = 

In [None]:
# calculation of learning curves: cross-validated MAE versus training set size,
# one curve per kernel width
lam_val = 1e-5  # regularizer
num_cv = 3      # random train/test splits per training set size

# NOTE(review): 11 samples over the exponents -1..10 give non-integer exponents
# (step 1.1); use 12 samples if exact powers of two were intended
sigmas = np.logspace(-1, 10, 11, base=2)

# define number of training points for which MAE is calculated
set_sizes = np.logspace(0, 11, 12, base=2).astype(int)

lcurves = {}

# NOTE(review): all_local_reps is not defined in the visible cells -- confirm
for sigma in sigmas:
    # (mean error, std of error) for every training set size
    cv_results = [
        crossvalidate(all_local_reps, alch_pots, tr_size, sigma, lam_val, num_cv)
        for tr_size in set_sizes
    ]
    error_cv = [result[0] for result in cv_results]
    error_std = [result[1] for result in cv_results]

    # columns: training set size, mean error, std of error
    lcurves[f'sig_{sigma}'] = np.array([set_sizes, error_cv, error_std]).T

In [None]:
# save best learning curve
# "best" = the curve whose smallest error (column 1) is the lowest of all sigmas;
# the first such curve wins on ties
curve_minima = {key: np.amin(curve[:, 1]) for key, curve in lcurves.items()}
best_key = min(curve_minima, key=curve_minima.get, default=None)
lowest_error = (best_key, curve_minima.get(best_key))
save_data = lcurves[lowest_error[0]]

path = '/home/misa/APDFT/prototyping/atomic_energies/results/analyse_learning/lcurves_alch_pot/fchl/best_all_alchpots_small_sigmas.txt'
# recover the sigma value from the dictionary key 'sig_<sigma>' for the file header
sig_val = lowest_error[0].split('_')[1]
header = f'sigma = {sig_val}, lambda = {lam_val}, number cv = {num_cv}'
np.savetxt(path, save_data, delimiter='\t', header=header)

In [None]:
# save dictionary of learning curves at all sigmas
# save_obj pickles the object, so the file content is binary although the name ends in .txt
fname = '/home/misa/APDFT/prototyping/atomic_energies/results/analyse_learning/lcurves_alch_pot/fchl/all_sigma_all_alchpots_small_sigmas.txt'
save_obj(obj=lcurves, fname=fname)