In [None]:
import numpy as np
import qml
import sys
sys.path.insert(0, '/home/misa/git_repositories/APDFT/prototyping/atomic_energies/')
import qml_interface as qmi
import utils_qm as uq

import sklearn.model_selection as sk
import pickle

from matplotlib import pyplot as plt

# def crossvalidate(reps, labels, tr_size, sigma, lam, num_cv):
#     errors = []
#     for cv in range(num_cv):
#         reps_tr, reps_test, labels_tr, labels_test = sk.train_test_split(reps,labels,train_size=tr_size)
#         coeffs = qmi.train_kernel(reps_tr, labels_tr, sigma, lam_val)
#         labels_predicted = qmi.predict_labels(reps_test, reps_tr, sigma, coeffs)
#         errors.append((np.abs(labels_predicted - labels_test)).mean())
#     errors = np.array(errors)
#     return(errors.mean(), errors.std())

# def save_obj(obj, fname ):
#     with open(fname, 'wb') as f:
#         pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [None]:
def wrapper_local_idx(global_idx, molecule_sizes):
    """
    Look up the atom indices that belong to a single molecule.

    global_idx: index of the molecule
    molecule_sizes: passed on unchanged to qmi.get_local_idx

    The molecule index is wrapped in a one-element list because qmi.get_local_idx
    is called with a list of molecule indices; its result is returned as is.
    """
    return qmi.get_local_idx([global_idx], molecule_sizes)

def get_mol_atom_map(molecule_sizes):
    """
    Build a dictionary key: molecule index, item: result of wrapper_local_idx for that molecule.

    The keys are the entries of np.arange(len(molecule_sizes)), i.e. one key per
    entry of molecule_sizes.
    """
    return {
        mol_id: wrapper_local_idx(mol_id, molecule_sizes)
        for mol_id in np.arange(len(molecule_sizes))
    }

def split_train_test_indices(molecule_sizes, tr_size):
    """
    Split atom indices into a training and a test part such that all atoms of a
    molecule end up in the same part.

    molecule_sizes: one entry per molecule; forwarded to get_mol_atom_map
    tr_size: passed as train_size to sklearn's train_test_split (applied to molecule indices)

    returns: (atom indices of the training molecules, atom indices of the test molecules,
              indices of the test molecules)
    """
    all_mol_ids = np.arange(len(molecule_sizes))
    # the id array is handed over twice; only the first two returned arrays
    # (training and test part of the first argument) are needed
    mol_id_tr, mol_id_test = sk.train_test_split(all_mol_ids, all_mol_ids, train_size=tr_size)[:2]

    # dictionary key: molecule index, item: atom indices
    atoms_of_mol = get_mol_atom_map(molecule_sizes)

    indices_tr = [atom for mol in mol_id_tr for atom in atoms_of_mol[mol]]
    indices_test = [atom for mol in mol_id_test for atom in atoms_of_mol[mol]]
    return(indices_tr, indices_test, mol_id_test)

def split_molecule_wise(reps, labels, tr_size, molecule_sizes):
    """
    Split representations and labels into training and test data molecule by molecule.

    reps, labels: indexed with lists of atom indices (reps[indices], labels[indices])
    tr_size: forwarded to split_train_test_indices
    molecule_sizes: numpy 1D-array; contains the lengths of the molecules in the training data

    returns: (reps_tr, reps_test, labels_tr, labels_test, mol_id_test)
    """
    # the split itself is done on atom indices only
    atoms_tr, atoms_test, mol_id_test = split_train_test_indices(molecule_sizes, tr_size)

    # pick the matching rows of reps and labels
    return(
        reps[atoms_tr],
        reps[atoms_test],
        labels[atoms_tr],
        labels[atoms_test],
        mol_id_test,
    )

def predict_labels_mol(reps_tr,sigma, coeffs, mol_id_test, mol_atom_map, reps):
    """
    Predict one label per test molecule as the sum of the predictions for its atoms.

    reps_tr, sigma, coeffs: forwarded to qmi.predict_labels
    mol_id_test: indices of the molecules to predict
    mol_atom_map: dictionary key: molecule index, item: atom indices
    reps: representations of all atoms, indexed with the atom indices of a molecule

    returns: list with the summed prediction of every molecule in mol_id_test
    """
    return [
        qmi.predict_labels(reps[mol_atom_map[mol_id]], reps_tr, sigma, coeffs).sum()
        for mol_id in mol_id_test
    ]
        
def get_labels_mol_test(labels, mol_id_test, mol_atom_map):
    """
    Sum the atomic labels of every test molecule.

    labels: array of atomic labels, indexed with the atom indices of a molecule
    mol_id_test: indices of the test molecules
    mol_atom_map: dictionary key: molecule index, item: atom indices

    returns: numpy array with one summed label per molecule, ordered like mol_id_test
    """
    summed_labels = [labels[mol_atom_map[mol_id]].sum() for mol_id in mol_id_test]
    return(np.array(summed_labels))
        

def crossvalidate_moleculewise(reps, labels, tr_size, sigma, lam, num_cv, molecule_sizes):
    """
    Repeat a molecule-wise train/test cycle num_cv times and average the errors.

    reps: representations of all atoms
    labels: atomic labels, same ordering as reps
    tr_size: forwarded to split_molecule_wise (train_size of the molecule split)
    sigma: kernel width handed to qmi.train_kernel and qmi.predict_labels
    lam: regularizer handed to qmi.train_kernel
    num_cv: number of random train/test splits
    molecule_sizes: one entry per molecule

    returns: (mean, std) of the per-atom MAE followed by (mean, std) of the
             per-molecule MAE, each taken over the num_cv splits
    """
    # the map depends only on molecule_sizes, so it is built once for all splits
    mol_atom_map = get_mol_atom_map(molecule_sizes)

    errors = []
    errors_per_mol = []
    for cv in range(num_cv):
        reps_tr, reps_test, labels_tr, labels_test, mol_id_test = split_molecule_wise(reps, labels, tr_size, molecule_sizes)

        # the regularizer is the lam argument of this function, not a notebook-level global
        coeffs = qmi.train_kernel(reps_tr, labels_tr, sigma, lam)

        # error per atom
        labels_predicted = qmi.predict_labels(reps_test, reps_tr, sigma, coeffs)
        errors.append((np.abs(labels_predicted - labels_test)).mean())

        # error per molecule: atomic predictions and labels are summed for every test molecule
        labels_predicted_mol = predict_labels_mol(reps_tr,sigma, coeffs, mol_id_test, mol_atom_map, reps)
        labels_mol_test = get_labels_mol_test(labels, mol_id_test, mol_atom_map)
        errors_per_mol.append( (np.abs(labels_predicted_mol-labels_mol_test)).mean() )

    errors = np.array(errors)
    errors_per_mol = np.array(errors_per_mol)
    return(errors.mean(), errors.std(), errors_per_mol.mean(), errors_per_mol.std())

### Elementwise learning

In [1]:
import numpy as np
import sklearn.model_selection as sk

import sys
sys.path.insert(0, '/home/misa/git_repositories/APDFT/prototyping/atomic_energies/')

import qml_interface as qmi

In [2]:
def split_molecule_indices(num_data_pts, tr_size):
    """
    Randomly split the molecule indices 0 ... num_data_pts-1 into a training and a test part.

    tr_size: passed as train_size to sklearn's train_test_split

    returns: (training molecule indices, test molecule indices)
    """
    mol_ids = np.arange(num_data_pts)
    # the indices are handed over twice; the split of the second copy is not needed
    tr_indices_mol, test_indices_mol = sk.train_test_split(mol_ids, mol_ids, train_size=tr_size)[:2]
    return(tr_indices_mol, test_indices_mol)

def sort_indices_by_element(indices, charges):
    """
    Group atom indices by the charge of the atom.

    indices: atom indices that will be grouped
    charges: list of charges; charges[i] is the charge of the atom with index i

    returns: dictionary with one key per distinct value in charges (inserted in
             ascending order); the item is the list of those entries of indices that
             carry this charge, in the order in which they appear in indices.
             Charges that none of the indices has keep an empty list.
    """
    indices_by_charge = {}
    for charge in sorted(set(charges)):
        indices_by_charge[charge] = []

    for atom_index in indices:
        indices_by_charge[charges[atom_index]].append(atom_index)
    return(indices_by_charge)

def get_error_per_molecule(charges, labels_test_by_charge, molecule_sizes, prediction_by_charge, test_indices_by_charge, test_indices_mol):
    """
    Collect the atomic predictions and the atomic reference labels of every test molecule.

    Despite its name the function does not compute an error itself; it regroups the
    charge-wise test results molecule by molecule so that the caller can do so.

    charges: charges[i] is the charge of the atom with index i
    labels_test_by_charge: dictionary key: charge, item: array of test labels,
                           ordered like test_indices_by_charge[charge]
    molecule_sizes: forwarded to qmi.get_local_idx
    prediction_by_charge: dictionary key: charge, item: array of predictions,
                          ordered like test_indices_by_charge[charge]
    test_indices_by_charge: dictionary key: charge, item: atom indices of the test atoms
    test_indices_mol: indices of the test molecules

    returns: (prediction_per_molecule, label_per_molecule); two lists with one numpy
             array per test molecule. Inside an array the atoms are ordered by ascending
             charge and, for equal charge, in the order returned by qmi.get_local_idx.

    raises: ValueError if an atom of a test molecule is not listed in test_indices_by_charge
    """
    # the set of charges is the same for every molecule, so it is determined only once
    unique_charges = sorted(set(charges))

    # atom index -> position in the charge-wise test arrays; a dictionary lookup replaces
    # a linear list search per atom. setdefault keeps the first position of an index,
    # which is what list.index would return.
    position_in_test = dict()
    for charge, test_indices in test_indices_by_charge.items():
        lookup = dict()
        for position, atom_index in enumerate(test_indices):
            lookup.setdefault(atom_index, position)
        position_in_test[charge] = lookup

    prediction_per_molecule = []
    label_per_molecule = []

    for mol_id in test_indices_mol:
        # atomic indices for atoms in test molecule
        atomic_indices_molecule = qmi.get_local_idx([mol_id], molecule_sizes)

        # positions of the atoms of this molecule in the *_by_charge arrays
        positions_by_charge = {k: [] for k in unique_charges}
        for atom_index in atomic_indices_molecule:
            charge = charges[atom_index]
            lookup = position_in_test[charge]
            if atom_index not in lookup:
                raise ValueError(f'atom index {atom_index} is not in test_indices_by_charge[{charge}]')
            positions_by_charge[charge].append(lookup[atom_index])

        # get the prediction and the label, charge after charge
        atomic_atomisation_energies_predicted = []
        atomic_atomisation_energies_true = []
        for k in unique_charges:
            positions = positions_by_charge[k]
            atomic_atomisation_energies_predicted.extend(prediction_by_charge[k][positions])
            atomic_atomisation_energies_true.extend(labels_test_by_charge[k][positions])

        prediction_per_molecule.append(np.array(atomic_atomisation_energies_predicted))
        label_per_molecule.append(np.array(atomic_atomisation_energies_true))
    return(prediction_per_molecule, label_per_molecule)

def train_test_cycle_by_charge(all_local_reps, charges, labels, lam_val_by_charge, molecule_sizes, sigma_by_charge, tr_size):
    """
    Calculate the MAE for one randomly selected training set and the given hyperparameters,
    training a separate model for every charge.

    all_local_reps: representations of all atoms
    charges: charges[i] is the charge of the atom with index i
    labels: atomic labels, same ordering as all_local_reps
    lam_val_by_charge: dictionary key: charge, item: regularizer for qmi.train_kernel
    molecule_sizes: one entry per molecule
    sigma_by_charge: dictionary key: charge, item: kernel width
    tr_size: train_size of the molecule-wise split

    returns: (mae_molecule, error_by_charge); the MAE of the per-molecule sums and a
             dictionary key: charge, item: per-atom MAE for this charge
    """
    # ---------------------------------------------------------------
    # split data in test and training
    # ---------------------------------------------------------------

    # molecules are split first, so that all atoms of a molecule share one set
    tr_indices_mol, test_indices_mol = split_molecule_indices(len(molecule_sizes), tr_size)

    # atom indices that belong to the selected molecules
    tr_indices_atom = qmi.get_local_idx(tr_indices_mol, molecule_sizes)
    test_indices_atom = qmi.get_local_idx(test_indices_mol, molecule_sizes)

    # atom indices grouped by charge
    tr_indices_by_charge = sort_indices_by_element(tr_indices_atom, charges)
    test_indices_by_charge = sort_indices_by_element(test_indices_atom, charges)

    # debug: number of training atoms per charge
    for tr_indices in tr_indices_by_charge.values():
        print(len(tr_indices))

    # representations and labels grouped by charge
    reps_tr_by_charge = {k: all_local_reps[idc] for k, idc in tr_indices_by_charge.items()}
    reps_test_by_charge = {k: all_local_reps[idc] for k, idc in test_indices_by_charge.items()}
    labels_tr_by_charge = {k: labels[idc] for k, idc in tr_indices_by_charge.items()}
    labels_test_by_charge = {k: labels[idc] for k, idc in test_indices_by_charge.items()}

    # ---------------------------------------------------------------
    # train and test individual elements
    # ---------------------------------------------------------------

    # one set of regression coefficients per charge
    coeffs_by_charge = {
        k: qmi.train_kernel(reps_tr_by_charge[k], labels_tr_by_charge[k], sigma_by_charge[k], lam_val_by_charge[k])
        for k in reps_tr_by_charge
    }

    # predictions for the test atoms of every charge
    prediction_by_charge = {
        k: qmi.predict_labels(reps_test_by_charge[k], reps_tr_by_charge[k], sigma_by_charge[k], coeffs_by_charge[k])
        for k in reps_tr_by_charge
    }

    # test error per atom
    error_by_charge = {
        k: (np.abs(prediction_by_charge[k]-labels_test_by_charge[k])).mean()
        for k in labels_test_by_charge
    }

    # ---------------------------------------------------------------
    # test per molecule
    # ---------------------------------------------------------------

    # atomic predictions and labels regrouped molecule by molecule
    prediction_per_molecule, label_per_molecule = get_error_per_molecule(charges, labels_test_by_charge, molecule_sizes, prediction_by_charge, test_indices_by_charge, test_indices_mol)

    # the error of a molecule is the sum of the signed atomic errors
    error_per_molecule = np.array([(p-t).sum() for p, t in zip(prediction_per_molecule, label_per_molecule)])
    mae_molecule = np.abs(error_per_molecule).mean()

    return(mae_molecule, error_by_charge)
    

In [3]:
# data preparation
# list of input paths as returned by qmi.wrapper_alch_data()
paths = qmi.wrapper_alch_data()
# this results file is left out of the data set;
# list.remove raises a ValueError if the path is not contained in paths
exclude='/home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/dsgdb9nsd_000829/atomic_energies_with_mic.txt'
paths.remove(exclude)
data, molecule_sizes = qmi.load_alchemy_data(paths)
# label vectors for the properties 'atomisation' and 'charge';
# molecule_sizes.sum() is passed as length argument (presumably the total number of atoms - TODO confirm)
labels = qmi.generate_label_vector(data, molecule_sizes.sum(), value='atomisation')
charges = qmi.generate_label_vector(data, molecule_sizes.sum(), value='charge')

# atomic representations; indexed with atom indices further below
all_local_reps = qmi.generate_atomic_representations(data, molecule_sizes)

In [4]:
# hyperparameters per charge (keys 1.0, 6.0, 7.0, 8.0):
# regularizer and kernel width handed to qmi.train_kernel / qmi.predict_labels
lam_val_by_charge = {1.0:1e-5, 6.0:1e-5, 7.0:1e-5, 8.0:1e-5}
sigma_by_charge = {1.0:104, 6.0:104, 7.0:223, 8.0:223}
# training set sizes; each is used as train_size of the molecule-wise split
tr_sizes = [64, 128, 256, 512, 900]
# number of random train/test splits per training set size
num_cv = 3

In [5]:
# single train/test cycle with the last entry of tr_sizes; prints the number of training atoms
# per charge and displays (mae_molecule, error_by_charge)
train_test_cycle_by_charge(all_local_reps, charges, labels, lam_val_by_charge, molecule_sizes, sigma_by_charge, tr_sizes[-1])

5974
4313
1116
899


(0.11950821502280308,
 {1.0: 0.019850596898851908,
  6.0: 0.021008447188757908,
  7.0: 0.032860571875822704,
  8.0: 0.03372285004064851})

In [None]:
# learning curves as lists of rows [tr_size, mean MAE, std of MAE]:
# one curve for the error per molecule and one curve per charge
lcurve_mol = []

# the distinct charges do not depend on the training set size, so they are determined once
unique_charges = list(set(charges))
unique_charges.sort()
lcurve_by_charge = {k: [] for k in unique_charges}

for tr_size in tr_sizes:
    # errors of the individual train/test cycles for this training set size
    error_mol = []
    error_by_charge = {k: [] for k in unique_charges}

    for cv in range(num_cv):
        error_mol_tmp, error_by_charge_tmp = train_test_cycle_by_charge(all_local_reps, charges, labels, lam_val_by_charge, molecule_sizes, sigma_by_charge, tr_size)
        error_mol.append(error_mol_tmp)
        for k in error_by_charge_tmp.keys():
            error_by_charge[k].append(error_by_charge_tmp[k])

    # mean and standard deviation over the num_cv cycles
    mae_mol = (np.array(error_mol)).mean()
    std_mol = (np.array(error_mol)).std()
    mae_by_charge = dict()
    std_by_charge = dict()
    for k in error_by_charge.keys():
        errors_of_charge = np.array(error_by_charge[k])
        mae_by_charge[k] = errors_of_charge.mean()
        std_by_charge[k] = errors_of_charge.std()

    lcurve_mol.append([tr_size, mae_mol, std_mol])
    for k in lcurve_by_charge.keys():
        lcurve_by_charge[k].append([tr_size, mae_by_charge[k], std_by_charge[k]])

In [None]:
# write the per-molecule learning curve (rows: tr_size, mean MAE, std of MAE) to a text file
path_mol = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/atomic_atomisation_elementwise_per_molecule.txt'
np.savetxt(path_mol, lcurve_mol)

In [None]:
# write one text file per charge; the charge k becomes part of the file name (..._Z_{k}.txt)
for k in lcurve_by_charge.keys():
    path_atomic = f'/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/atomic_atomisation_elementwise_Z_{k}.txt'
    np.savetxt(path_atomic, lcurve_by_charge[k])

### Learning curves calculation

In [None]:
# data preparation
# NOTE(review): all paths from qmi.wrapper_alch_data() are loaded here, whereas the data preparation
# of the elementwise learning removes one results file first - confirm that this is intended
data, molecule_sizes = qmi.load_alchemy_data(qmi.wrapper_alch_data())
# label vector for the property 'atomisation'
labels = qmi.generate_label_vector(data, molecule_sizes.sum(), value='atomisation')

# atomic representations, indexed with atom indices during the molecule-wise split
all_local_reps = qmi.generate_atomic_representations(data, molecule_sizes)

In [None]:
# hyperparameters: kernel width, regularizer and number of random splits per training set size
# NOTE(review): opt_sigma equals 2**7.8, a point of the grid np.logspace(-1, 10, 11, base=2) used in the
# hyperparameter optimization - presumably it was selected from that scan, confirm
opt_sigma = 222.8609442038079
lam_val = 1e-5
num_cv = 10

# define number of training points for which MAE is calculated: 2**0 ... 2**9 and additionally 900
# (tr_size is used as train_size of the molecule-wise split, i.e. it counts molecules)
set_sizes = np.logspace(0, 9, 10, base=2).astype(int)
set_sizes = np.concatenate((set_sizes, np.array([900])))

# mean and standard deviation of the MAE per atom, one entry per training set size
error_cv = []
error_std = []

# mean and standard deviation of the MAE per molecule, one entry per training set size
error_cv_mol = []
error_std_mol = []

# calculate error for every training point size (idx is not used inside the loop)
for idx, tr_size in enumerate(set_sizes):
    err, err_std, err_mol, err_std_mol = crossvalidate_moleculewise(all_local_reps, labels, tr_size, opt_sigma, lam_val, num_cv, molecule_sizes)
    error_cv.append(err)
    error_std.append(err_std)
    error_cv_mol.append(err_mol)
    error_std_mol.append(err_std_mol)
    
# learning curves as arrays with the columns: training set size, mean MAE, std of MAE
lcurves = np.array([set_sizes, error_cv, error_std]).T
lcurves_mol = np.array([set_sizes, error_cv_mol, error_std_mol]).T

In [None]:
# write the per-atom learning curve to a text file
fname = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/opt_sigma_all_atomic_atomisation.txt'
np.savetxt(fname, lcurves)
# write the per-molecule learning curve to a text file
fname_mol = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/opt_sigma_all_atomic_atomisation_per_molecule.txt'
np.savetxt(fname_mol, lcurves_mol)

### Hyperparameter optimization

In [None]:
# data preparation
# NOTE(review): all paths from qmi.wrapper_alch_data() are loaded here, whereas the data preparation
# of the elementwise learning removes one results file first - confirm that this is intended
data, molecule_sizes = qmi.load_alchemy_data(qmi.wrapper_alch_data())
# label vector for the property 'atomisation'
labels = qmi.generate_label_vector(data, molecule_sizes.sum(), value='atomisation')

# atomic representations, indexed with atom indices during the molecule-wise split
all_local_reps = qmi.generate_atomic_representations(data, molecule_sizes)

In [None]:
# kernel widths to scan: 11 logarithmically spaced values from 2**-1 to 2**10
# NOTE(review): with num=11 the exponents are spaced by 1.1, so the grid points are not integer powers
# of two (that would need num=12) - confirm that this is intended
sigmas = np.logspace(-1, 10, 11, base=2)
lam_val = 1e-5
num_cv = 3

# results: key 'sig_<sigma>', item: array with the columns training set size, mean MAE, std of MAE
lcurves = dict()
lcurves_mol = dict()

# define number of training points for which MAE is calculated
# (only a single size is scanned; the full list of sizes is left commented out)
#set_sizes = #np.logspace(0, 9, 10, base=2).astype(int)
set_sizes = [900]#np.concatenate((set_sizes, np.array([900])))

for sigma in sigmas:
    # per-atom errors for this sigma
    error_cv = []
    error_std = []
    
    # per-molecule errors for this sigma
    error_cv_mol = []
    error_std_mol = []
    
    # calculate error for every training point size (idx is not used inside the loop)
    for idx, tr_size in enumerate(set_sizes):
        err, err_std, err_mol, err_std_mol = crossvalidate_moleculewise(all_local_reps, labels, tr_size, sigma, lam_val, num_cv, molecule_sizes)
        error_cv.append(err)
        error_std.append(err_std)
        error_cv_mol.append(err_mol)
        error_std_mol.append(err_std_mol)
    
    lcurves[f'sig_{sigma}'] = np.array([set_sizes, error_cv, error_std]).T
    lcurves_mol[f'sig_{sigma}'] = np.array([set_sizes, error_cv_mol, error_std_mol]).T

In [None]:
# store the dictionary with the per-atom results of all sigmas via uq.save_obj
fname = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/all_sigmas_all_atomic_atomisation'
uq.save_obj(lcurves, fname)

# store the dictionary with the per-molecule results of all sigmas via uq.save_obj
fname_mol = '/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/all_sigmas_all_atomic_atomisation_per_molecule'
uq.save_obj(lcurves_mol, fname_mol)

### Elementwise prediction

In [None]:
import numpy as np
import qml
import sys
sys.path.insert(0, '/home/misa/git_repositories/APDFT/prototyping/atomic_energies/')
import qml_interface as qmi
import sklearn.model_selection as sk
import pickle
import os

def crossvalidate(reps, labels, tr_size, sigma, lam, num_cv):
    """
    Repeat a random train/test cycle num_cv times and average the MAE.

    reps: representations of the data points
    labels: labels, same ordering as reps
    tr_size: passed as train_size to sklearn's train_test_split
    sigma: kernel width handed to qmi.train_kernel and qmi.predict_labels
    lam: regularizer handed to qmi.train_kernel
    num_cv: number of random train/test splits

    returns: (mean, standard deviation) of the MAE over the num_cv splits
    """
    errors = []
    for cv in range(num_cv):
        reps_tr, reps_test, labels_tr, labels_test = sk.train_test_split(reps,labels,train_size=tr_size)
        # the regularizer is the lam argument of this function, not a notebook-level global
        coeffs = qmi.train_kernel(reps_tr, labels_tr, sigma, lam)
        labels_predicted = qmi.predict_labels(reps_test, reps_tr, sigma, coeffs)
        errors.append((np.abs(labels_predicted - labels_test)).mean())
    errors = np.array(errors)
    return(errors.mean(), errors.std())

def save_obj(obj, fname ):
    """
    Pickle obj into the file fname (opened in binary write mode) using the
    highest pickle protocol available.
    """
    with open(fname, mode='wb') as out_file:
        pickle.dump(obj, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    
def get_tr_size(data_size):
    """
    Training set sizes 1, 2, 4, ... up to the largest power of 2 that is <= 90% of data_size.

    data_size: total number of data points
    returns: 1D integer numpy array with the powers of 2 in ascending order
    """
    max_exponent = int(np.log2(data_size*0.9))
    powers_of_two = np.logspace(start=0, stop=max_exponent, num=max_exponent+1, base=2)
    return(powers_of_two.astype(int))

def get_element_symbol(Z):
    """
    Return the element symbol for the charge Z.

    Z is converted with int() before the lookup; symbols are available for the
    charges 1 (H), 6 (C), 7 (N), 8 (O) and 9 (F).

    raises: ValueError for every other charge
    """
    symbol_by_charge = {1: 'H', 6: 'C', 7: 'N', 8: 'O', 9: 'F'}
    charge = int(Z)
    if charge not in symbol_by_charge:
        raise ValueError('Symbol for given charge not available')
    return(symbol_by_charge[charge])

#######################################################################################################
# CHECK BEFORE RUNNING 
#######################################################################################################

# property which will be predicted; used as value argument of qmi.generate_label_vector
# and as part of the output file name
PROPERTY_TO_LEARN = 'atomisation'
# hyperparameter values: for every charge a list of kernel widths that will be scanned
sigmas = {1.0:[104], 6.0:[104], 7.0:[223], 8.0:[223]}
lam_val = 1e-5 # regularizer, no list possible right now
num_cv = 3 # number crossvalidations

# path where best sigma will be saved to
path_best = f'/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/'
# path where all sigmas will be saved to (not used in the code below)
path_all = f'/home/misa/projects/Atomic-Energies/data/lcurves/lcurves_atomisation/'

#######################################################################################################

# data preparation
# one results file is left out of the data set;
# list.remove raises a ValueError if the path is not contained in paths
paths = qmi.wrapper_alch_data()
exclude='/home/misa/APDFT/prototyping/atomic_energies/results/slice_ve38/dsgdb9nsd_000829/atomic_energies_with_mic.txt'
paths.remove(exclude)
data, molecule_size = qmi.load_alchemy_data(paths)

# labels for PROPERTY_TO_LEARN; despite the variable name these are alchemical potentials
# only if PROPERTY_TO_LEARN is set accordingly
alch_pots = qmi.generate_label_vector(data, molecule_size.sum(), value=PROPERTY_TO_LEARN)

# atomic representations, same ordering as the label vector is assumed - TODO confirm
all_local_reps = qmi.generate_atomic_representations(data, molecule_size)

# split up representations and labels by element:
# idc_by_charge maps a charge to the indices used to select the entries of this element
charges = qmi.generate_label_vector(data, molecule_size.sum(), value='charge')
idc_by_charge = qmi.partition_idx_by_charge(charges)

# dictionaries key: charge, item: representations / labels of the atoms with this charge
el_reps =dict()
el_alch_pots = dict()
for k in idc_by_charge.keys():
    el_reps[k] = all_local_reps[idc_by_charge[k]]
    el_alch_pots[k] = alch_pots[idc_by_charge[k]]
    
# learning curves for every element: scan all sigmas of the element, then save the curve of the
# sigma that reaches the lowest mean MAE
for charge in el_reps.keys():
    # key 'sig_<sigma>', item: array with the columns training set size, mean MAE, std of MAE
    lcurves = dict()

    # define number of training points for which MAE is calculated:
    # powers of 2 from get_tr_size plus 90% of the data points of this element
    set_sizes = np.concatenate((get_tr_size(len(el_alch_pots[charge])), np.array([int(len(el_alch_pots[charge])*0.9)])) )
    print(set_sizes, flush=True)
    
    # special for H
#     set_sizes = np.concatenate((set_sizes, np.array([3300])))

    for sigma in sigmas[charge]:
        error_cv = []
        error_std = []
        # calculate error for every training point size
        for tr_size in set_sizes:
            err, err_std = crossvalidate(el_reps[charge], el_alch_pots[charge], tr_size, sigma, lam_val, num_cv)
            error_cv.append(err)
            error_std.append(err_std)

        lcurves[f'sig_{sigma}'] = np.array([set_sizes, error_cv, error_std]).T
        
    
    # save best learning curve: (key, lowest mean MAE) of the best sigma found so far;
    # None marks that no curve has been looked at yet
    lowest_error = (None, None)
    for k in lcurves.keys():
        min_error = np.amin(lcurves[k][:,1])
        if lowest_error[1] is None or lowest_error[1] > min_error:
            lowest_error = (k, min_error)
    save_data = lcurves[lowest_error[0]]

    # filename
    el_symbol = get_element_symbol(charge)
    path = os.path.join(path_best, f'best_{PROPERTY_TO_LEARN}_{el_symbol}_b2.txt')

    # the sigma value is recovered from the dictionary key 'sig_<sigma>'
    sig_val = lowest_error[0].split('_')[1]
    header = f'sigma = {sig_val}, lambda = {lam_val}, number cv = {num_cv}'
    np.savetxt(path, save_data, delimiter='\t', header=header)