In [1]:
import numpy as np
import qml
import sys
sys.path.insert(0, '/home/misa/git_repositories/APDFT/prototyping/atomic_energies/')
import qml_interface as qmi
import utils_qm as uq

import sklearn.model_selection as sk
import pickle

from matplotlib import pyplot as plt

# def crossvalidate(reps, labels, tr_size, sigma, lam, num_cv):
#     errors = []
#     for cv in range(num_cv):
#         reps_tr, reps_test, labels_tr, labels_test = sk.train_test_split(reps,labels,train_size=tr_size)
#         coeffs = qmi.train_kernel(reps_tr, labels_tr, sigma, lam_val)
#         labels_predicted = qmi.predict_labels(reps_test, reps_tr, sigma, coeffs)
#         errors.append((np.abs(labels_predicted - labels_test)).mean())
#     errors = np.array(errors)
#     return(errors.mean(), errors.std())

# def save_obj(obj, fname ):
#     with open(fname, 'wb') as f:
#         pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [2]:
def wrapper_local_idx(global_idx, molecule_sizes):
    """
    Forward a single index to ``qmi.get_local_idx``.

    global_idx: one index; it is wrapped in a one-element list before the call
    molecule_sizes: passed through unchanged as the second argument

    Returns whatever ``qmi.get_local_idx`` returns for that one-element list
    (presumably the atom indices belonging to that molecule -- confirm in qml_interface).
    """
    return qmi.get_local_idx([global_idx], molecule_sizes)

def get_mol_atom_map(molecule_sizes):
    """
    Build a lookup table from molecule index to the result of ``wrapper_local_idx``.

    molecule_sizes: sequence with one entry per molecule; only its length and the
                    values forwarded to ``wrapper_local_idx`` are used here

    Returns a dict whose keys are 0 .. len(molecule_sizes)-1 (numpy integers) and whose
    values are the items returned by ``wrapper_local_idx`` for the respective key.
    """
    return {
        mol: wrapper_local_idx(mol, molecule_sizes)
        for mol in np.arange(len(molecule_sizes))
    }

def split_train_test_indices(molecule_sizes, tr_size):
    """
    Split atom indices into a training and a test set molecule by molecule, so that
    all atoms of one molecule end up together in either the training or the test set.

    molecule_sizes: sequence with one entry per molecule
    tr_size: forwarded as ``train_size`` to ``sk.train_test_split`` on the molecule indices

    Returns (indices_tr, indices_test, mol_id_test): two flat lists of atom indices and
    the molecule indices that were assigned to the test set.
    """
    # the random split is performed on molecule indices, not on atoms
    all_mols = np.arange(len(molecule_sizes))
    parts = sk.train_test_split(all_mols, all_mols, train_size=tr_size)
    mol_id_tr, mol_id_test = parts[0], parts[1]

    # molecule index -> atom indices of that molecule
    atoms_of = get_mol_atom_map(molecule_sizes)

    # flatten the per-molecule atom indices, keeping the order of the shuffled molecules
    indices_tr = [atom for mol in mol_id_tr for atom in atoms_of[mol]]
    indices_test = [atom for mol in mol_id_test for atom in atoms_of[mol]]
    return indices_tr, indices_test, mol_id_test

def split_molecule_wise(reps, labels, tr_size, molecule_sizes):
    """
    Split representations and labels into training and test parts without separating
    the atoms of a molecule.

    reps, labels: indexed with lists of atom indices (so they need to support
                  list-based indexing, e.g. numpy arrays)
    tr_size: forwarded to ``split_train_test_indices``
    molecule_sizes: sequence with one entry per molecule

    Returns (reps_tr, reps_test, labels_tr, labels_test, mol_id_test).
    """
    # only indices are drawn here; the data itself is sliced afterwards
    atoms_tr, atoms_test, mol_id_test = split_train_test_indices(molecule_sizes, tr_size)

    return (
        reps[atoms_tr],
        reps[atoms_test],
        labels[atoms_tr],
        labels[atoms_test],
        mol_id_test,
    )

def predict_labels_mol(reps_tr,sigma, coeffs, mol_id_test, mol_atom_map, reps):
    """
    Predict one value per test molecule by summing the predictions of its atoms.

    reps_tr, sigma, coeffs: forwarded to ``qmi.predict_labels``
    mol_id_test: iterable of molecule indices to predict
    mol_atom_map: mapping molecule index -> atom indices into ``reps``
    reps: representations of all atoms, indexed with the entries of ``mol_atom_map``

    Returns a list with one summed prediction per entry of ``mol_id_test``, in the same order.
    """
    return [
        qmi.predict_labels(reps[mol_atom_map[mol]], reps_tr, sigma, coeffs).sum()
        for mol in mol_id_test
    ]
        
def get_labels_mol_test(labels, mol_id_test, mol_atom_map):
    """
    Sum the atomic labels of every test molecule.

    labels: array of per-atom labels, indexed with the entries of ``mol_atom_map``
    mol_id_test: iterable of molecule indices
    mol_atom_map: mapping molecule index -> atom indices into ``labels``

    Returns a numpy array with one summed label per entry of ``mol_id_test``, in the same order.
    """
    per_mol_sums = [labels[mol_atom_map[mol]].sum() for mol in mol_id_test]
    return np.array(per_mol_sums)
        

def crossvalidate_moleculewise(reps, labels, tr_size, sigma, lam, num_cv, molecule_sizes):
    """
    Repeated random-split validation where all atoms of a molecule stay in the same set.

    For each of ``num_cv`` repetitions the data is split with ``split_molecule_wise``,
    a kernel model is trained with ``qmi.train_kernel`` and two mean absolute errors are
    recorded: one over the individual test atoms and one over per-molecule sums.

    reps: representations of all atoms
    labels: per-atom labels
    tr_size: forwarded to ``split_molecule_wise`` (used as ``train_size`` of the molecule split)
    sigma: kernel width forwarded to ``qmi.train_kernel`` / ``qmi.predict_labels``
    lam: regularisation value forwarded to ``qmi.train_kernel``
    num_cv: number of random splits
    molecule_sizes: sequence with one entry per molecule

    Returns (mean atomic MAE, std atomic MAE, mean molecular MAE, std molecular MAE),
    where mean/std are taken over the ``num_cv`` repetitions.
    """
    # the molecule -> atom map depends only on molecule_sizes, so build it once
    mol_atom_map = get_mol_atom_map(molecule_sizes)

    errors = []
    errors_per_mol = []
    for _ in range(num_cv):
        reps_tr, reps_test, labels_tr, labels_test, mol_id_test = split_molecule_wise(reps, labels, tr_size, molecule_sizes)

        # use the `lam` argument; the previous code read the notebook global `lam_val`,
        # which ignored the parameter and failed if that global was not defined
        coeffs = qmi.train_kernel(reps_tr, labels_tr, sigma, lam)

        # error per atom
        labels_predicted = qmi.predict_labels(reps_test, reps_tr, sigma, coeffs)
        errors.append((np.abs(labels_predicted - labels_test)).mean())

        # error per molecule (atomic contributions summed up for every test molecule)
        labels_predicted_mol = predict_labels_mol(reps_tr,sigma, coeffs, mol_id_test, mol_atom_map, reps)
        labels_mol_test = get_labels_mol_test(labels, mol_id_test, mol_atom_map)
        errors_per_mol.append( (np.abs(labels_predicted_mol-labels_mol_test)).mean() )

    errors = np.array(errors)
    errors_per_mol = np.array(errors_per_mol)
    return(errors.mean(), errors.std(), errors_per_mol.mean(), errors_per_mol.std())

### Learning curves calculation

In [6]:
# data preparation
# load the data set and the number of atoms per molecule via the qml_interface helpers
data, molecule_sizes = qmi.load_alchemy_data(qmi.wrapper_alch_data())
# one label per atom (molecule_sizes.sum() entries requested); value='atomisation' selects the label type
labels = qmi.generate_label_vector(data, molecule_sizes.sum(), value='atomisation')

# representations for all atoms of all molecules
all_local_reps = qmi.generate_atomic_representations(data, molecule_sizes)

In [15]:
# kernel hyperparameters and number of random splits per training set size
# NOTE(review): opt_sigma equals 2**7.8, which is one of the grid points of the sigma scan
# in the hyperparameter optimization section -- presumably it was selected there; confirm.
opt_sigma = 222.8609442038079
lam_val = 1e-5
num_cv = 10

# define number of training points for which MAE is calculated
# (powers of two from 2**0 to 2**9, plus 900)
set_sizes = np.logspace(0, 9, 10, base=2).astype(int)
set_sizes = np.concatenate((set_sizes, np.array([900])))

# mean and standard deviation of the per-atom error, one entry per training set size
error_cv = []
error_std = []

# mean and standard deviation of the per-molecule error, one entry per training set size
error_cv_mol = []
error_std_mol = []

# calculate error for every training point size
# tr_size is used as train_size of the molecule-level split, i.e. it counts molecules, not atoms
for idx, tr_size in enumerate(set_sizes):
    err, err_std, err_mol, err_std_mol = crossvalidate_moleculewise(all_local_reps, labels, tr_size, opt_sigma, lam_val, num_cv, molecule_sizes)
    error_cv.append(err)
    error_std.append(err_std)
    error_cv_mol.append(err_mol)
    error_std_mol.append(err_std_mol)
    
# learning curves as arrays with columns: training set size, mean error, std of error
lcurves = np.array([set_sizes, error_cv, error_std]).T
lcurves_mol = np.array([set_sizes, error_cv_mol, error_std_mol]).T

In [22]:
# write both learning curves (per atom and per molecule) as plain text files
# NOTE(review): absolute, user-specific paths -- adjust before running on another machine
fname = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/opt_sigma_all_atomic_atomisation.txt'
np.savetxt(fname, lcurves)
fname_mol = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/opt_sigma_all_atomic_atomisation_per_molecule.txt'
np.savetxt(fname_mol, lcurves_mol)

### Hyperparameter optimization

In [None]:
# data preparation
# NOTE(review): same statements as the data preparation cell of the learning curve section;
# they rebind data, molecule_sizes, labels and all_local_reps and are only needed if that
# cell has not been executed in the current kernel.
data, molecule_sizes = qmi.load_alchemy_data(qmi.wrapper_alch_data())
# one label per atom (molecule_sizes.sum() entries requested); value='atomisation' selects the label type
labels = qmi.generate_label_vector(data, molecule_sizes.sum(), value='atomisation')

# representations for all atoms of all molecules
all_local_reps = qmi.generate_atomic_representations(data, molecule_sizes)

In [None]:
# kernel widths to scan: 11 values between 2**-1 and 2**10 (exponents are spaced by 1.1)
sigmas = np.logspace(-1, 10, 11, base=2)
lam_val = 1e-5
num_cv = 3

# results per sigma, keyed by 'sig_<sigma>'
# NOTE: this rebinds lcurves / lcurves_mol, which are numpy arrays in the learning curve section
lcurves = dict()
lcurves_mol = dict()

# define number of training points for which MAE is calculated
# (only a single training set size is used for the scan; the full list is disabled below)
#set_sizes = #np.logspace(0, 9, 10, base=2).astype(int)
set_sizes = [900]#np.concatenate((set_sizes, np.array([900])))

for sigma in sigmas:
    # per-atom error statistics for this sigma
    error_cv = []
    error_std = []
    
    # per-molecule error statistics for this sigma
    error_cv_mol = []
    error_std_mol = []
    
    # calculate error for every training point size
    # tr_size is used as train_size of the molecule-level split, i.e. it counts molecules, not atoms
    for idx, tr_size in enumerate(set_sizes):
        err, err_std, err_mol, err_std_mol = crossvalidate_moleculewise(all_local_reps, labels, tr_size, sigma, lam_val, num_cv, molecule_sizes)
        error_cv.append(err)
        error_std.append(err_std)
        error_cv_mol.append(err_mol)
        error_std_mol.append(err_std_mol)
    
    # columns: training set size, mean error, std of error
    lcurves[f'sig_{sigma}'] = np.array([set_sizes, error_cv, error_std]).T
    lcurves_mol[f'sig_{sigma}'] = np.array([set_sizes, error_cv_mol, error_std_mol]).T

In [None]:
# store the dictionaries of the sigma scan with the utils_qm helper
# (presumably pickled, cf. the commented-out save_obj at the top of the notebook -- confirm)
# NOTE(review): absolute, user-specific paths -- adjust before running on another machine
fname = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/all_sigmas_all_atomic_atomisation'
uq.save_obj(lcurves, fname)

fname_mol = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/all_sigmas_all_atomic_atomisation_per_molecule'
uq.save_obj(lcurves_mol, fname_mol)