In [5]:
import glob
import cclib
import pandas as pd
from qml.fchl import generate_representation
import numpy as np
import re

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

In [8]:
reps = []
energies = []
for out in sorted(glob.iglob('molecules/*/Orca/*.out'), key=numericalSort):
    name = out.split('es/')[1].split('/Orca')[0]
    pt = out.split('Orca/')[1].split('/orca')[0]
    data = cclib.io.ccread(out)
    at_num = data.atomnos
    coords = data.atomcoords[-1]
#         coords = coords.tolist()
    rep = generate_representation(coords, at_num)
#     print(rep)
    actE = data.scfenergies[-1]
    reps.append(rep)
    energies.append(actE)

In [10]:
X = np.array(reps)
y = np.array(energies)

In [12]:
X_train = X[:300]
y_train = y[:300]

X_test = X[301:-1]
y_test = y[301:-1]

In [53]:
from qml.fchl import get_local_kernels

# You can get kernels for multiple kernel-widths
sigma = [2.5, 100]

# Calculate the kernel-matrices for each sigma
K = get_local_kernels(X_train, X_train, sigma, cut_distance=10.0)[0]

print(K.shape)

(300, 300)


In [38]:
from qml.math import cho_solve
# Add a small lambda to the diagonal of the kernel matrix
K[np.diag_indices_from(K)] += 1e-8

# Use the built-in Cholesky-decomposition to solve
alpha = cho_solve(K, y_train)

print(alpha)

[ 5.73996041e+05 -2.70159765e+06  4.12111616e+06 -9.91342963e+05
 -3.03672353e+06  5.88979963e+05  5.02678402e+06 -4.55586595e+06
 -1.82844415e+06  2.26189467e+06 -3.14906973e+06  1.23225790e+06
 -6.31106841e+03 -7.08614261e+04 -1.14509512e+05 -1.47833737e+05
  9.72505976e+05 -1.28334344e+06  5.52437185e+05 -3.09259834e+04
  2.64837812e+05 -6.73353418e+05  1.47413493e+06 -1.79661890e+06
  6.48762763e+04  1.56058969e+06 -5.29231917e+04  1.26585215e+03
 -5.23374930e+06  5.80286137e+06  3.93420586e+06 -1.25264225e+07
  1.07096475e+07 -3.58715602e+06  1.13483737e+06 -6.14935634e+05
  3.52047376e+05 -5.20369999e+05 -4.80416308e+06  1.20456958e+07
 -1.03256305e+07  9.55412375e+05  6.42212768e+06 -5.26333520e+06
  6.31651442e+05 -3.69042815e+05  1.68789662e+06 -2.99322164e+06
  2.07307709e+06  6.65665341e+05 -1.05775280e+06  9.44545968e+05
  5.99761006e+06 -8.87404997e+06  6.91408250e+06 -2.17568437e+06
 -1.93506463e+06  3.20495212e+06 -1.35681165e+06 -2.60821881e+06
  8.01847037e+06 -1.33015

In [39]:
# Calculate the kernel-matrices for each sigma
K_pred = get_local_kernels(X_test, X_train, sigma, cut_distance=10.0)[0]
pred = np.dot(K_pred, alpha)

In [41]:
len(pred)

283

In [43]:
len(y_test)

283

In [45]:
mae = np.mean(np.abs(y_test - pred))
print(mae)

232.48559466640114


### Different K

In [57]:
# Calculate the kernel-matrices for each sigma
K = get_local_kernels(X_train, X_train, sigma, cut_distance=10.0)[0]
print(K.shape)

(300, 300)


In [58]:
# Add a small lambda to the diagonal of the kernel matrix
K[np.diag_indices_from(K)] += 1e-8

# Use the built-in Cholesky-decomposition to solve
alpha = cho_solve(K, y_train)

# Calculate the kernel-matrices for each sigma
K_pred = get_local_kernels(X_test, X_train, sigmas, cut_distance=10.0)[0]
pred = np.dot(K_pred, alpha)

mae = np.mean(np.abs(y_test - pred))
print(mae)

232.48559426675516


In [62]:
df = pd.DataFrame({'Energy': y_test, 'Pred': pred})
df

Unnamed: 0,Energy,Pred
0,-2128.035514,-2123.241943
1,-2128.032858,-2115.036545
2,-2128.028518,-2105.388748
3,-2128.037976,-2102.246765
4,-2128.026577,-2107.732239
...,...,...
278,-1518.308108,-1859.406273
279,-1518.240095,-1857.794182
280,-1518.180197,-1858.943657
281,-1518.117057,-1860.307114


In [60]:
import pandas as pd