In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys,os
# put the parent directory first on the module search path
# (presumably where the `ml_tools` package imported below lives — confirm against the repo layout)
sys.path.insert(0,'../')

In [None]:
from ml_tools.descriptors import RawSoapInternal
from ml_tools.models.KRR import KRR,TrainerCholesky,KRRFastCV
from ml_tools.kernels import KernelPower,KernelSum
from ml_tools.utils import get_mae,get_rmse,get_sup,get_spearman,get_score,load_pck,tqdm_cs
from ml_tools.split import KFold,LCSplit,ShuffleSplit
from ml_tools.compressor import FPSFilter

In [None]:
import numpy as np
from ase.io import read,write
from ase.visualize import view

# Build a kernel matrix

In [None]:
# read every structure stored in the dataset file
frames = read('data/dft-smiles_500.xyz', ':')
# distinct atomic numbers found across all structures
global_species = np.unique(
    [number for frame in frames for number in frame.get_atomic_numbers()]
)

# the first 300 structures are the training set, the remainder the test set
frames_train, frames_test = frames[:300], frames[300:]

In [None]:
# SOAP parameters forwarded as keyword arguments to the descriptor
soap_params = {
    'rc': 3.5,
    'nmax': 6,
    'lmax': 6,
    'awidth': 0.4,
    'global_species': global_species,
    'nocenters': [],
}
representation = RawSoapInternal(**soap_params)

# kernel: a power kernel (zeta=2) wrapped in a KernelSum
kernel = KernelSum(KernelPower(zeta=2), chunk_shape=[100, 100])


In [None]:
# SOAP vectors of the training structures; strides are read after the transform
rawsoaps = representation.transform(frames_train)
X_train = {'feature_matrix': rawsoaps, 'strides': representation.strides}

# SOAP vectors of the test structures (this rebinds `rawsoaps`)
rawsoaps = representation.transform(frames_test)
X_test = {'feature_matrix': rawsoaps, 'strides': representation.strides}

In [None]:
# compute the square kernel matrix: single-argument call on the training features
Kmat = kernel.transform(X_train)

In [None]:
# compute a rectangular kernel matrix from the test features (first argument)
# and the training features (second argument)
Kmat_rect = kernel.transform(X_test,X_train)

# FPS selection of the samples

In [None]:
# read the structures selected by the ':300' index string
frames = read('data/dft-smiles_500.xyz', ':300')
# distinct atomic numbers found across the loaded structures
global_species = np.unique(
    [number for frame in frames for number in frame.get_atomic_numbers()]
)

In [None]:
# SOAP parameters forwarded as keyword arguments to the descriptor
soap_params = {
    'rc': 3.5,
    'nmax': 6,
    'lmax': 6,
    'awidth': 0.4,
    'global_species': global_species,
    'nocenters': [],
}
representation = RawSoapInternal(**soap_params)

# kernel: a power kernel (zeta=2) wrapped in a KernelSum
kernel = KernelSum(KernelPower(zeta=2), chunk_shape=[100, 100])


In [None]:
# SOAP vectors of the loaded structures, bundled with the descriptor's strides
# (strides are read after the transform has run)
rawsoaps = representation.transform(frames)
X = {'feature_matrix': rawsoaps, 'strides': representation.strides}

In [None]:
# run the fps selection on the set and plot the minmax distance
# number of samples handed to the FPS filter at construction
Nselect = 250
# act_on='sample': the filter operates on samples; kernel precomputation is
# requested and the progress bar is disabled
compressor = FPSFilter(Nselect,kernel,act_on='sample',precompute_kernel=True,disable_pbar=True)
# fit in dry-run mode, then draw the filter's diagnostic plot
compressor.fit(X,dry_run=True)
compressor.plot()

In [None]:
# choose how many samples to keep (here the same value that was passed at construction)
compressor.Nselect = 250
# and compress the feature set accordingly
X_compressed = compressor.transform(X)

In [None]:
# display the first Nselect entries of the ids selected by the compressor
compressor.selected_ids[:compressor.Nselect]

In [None]:
# shape of the feature matrix before compression
X['feature_matrix'].shape

In [None]:
# shape of the feature matrix after compression
X_compressed['feature_matrix'].shape

In [None]:
# shape of the strides entry after compression
X_compressed['strides'].shape

# FPS selection of the features

In [None]:
# read the structures selected by the ':300' index string
frames = read('data/dft-smiles_500.xyz', ':300')
# distinct atomic numbers found across the loaded structures
global_species = np.unique(
    [number for frame in frames for number in frame.get_atomic_numbers()]
)

In [None]:
# SOAP parameters forwarded as keyword arguments to the descriptor
soap_params = {
    'rc': 3.5,
    'nmax': 6,
    'lmax': 6,
    'awidth': 0.4,
    'global_species': global_species,
    'nocenters': [],
}
representation = RawSoapInternal(**soap_params)

# kernel: a bare power kernel (zeta=2), not wrapped in a KernelSum here
kernel = KernelPower(zeta=2)


In [None]:
# compute the soap vectors; the transform output is used directly as X here
# (no feature_matrix/strides dict, unlike the sample-selection section)
X = representation.transform(frames)

In [None]:
# run the fps selection on the set and plot the minmax distance
# number of features handed to the FPS filter at construction
Nselect = 250
# act_on='feature': the filter operates on features; kernel precomputation is
# requested and the progress bar is disabled
compressor = FPSFilter(Nselect,kernel,act_on='feature',precompute_kernel=True,disable_pbar=True)
# fit in dry-run mode, then draw the filter's diagnostic plot
compressor.fit(X,dry_run=True)
compressor.plot()

In [None]:
# choose how many features to keep (the filter was built with act_on='feature')
# NOTE(review): 500 exceeds the Nselect=250 passed at construction; this relies on
# the dry-run fit having ranked at least 500 features — confirm against FPSFilter
compressor.Nselect = 500
# and compress
X_compressed = compressor.transform(X)

In [None]:
# display the first Nselect entries of the ids selected by the compressor
compressor.selected_ids[:compressor.Nselect]

# Get a cross-validation score

In [None]:
# read every structure stored in the dataset file
frames = read('data/dft-smiles_500.xyz', ':')
# distinct atomic numbers found across all structures
global_species = np.unique(
    [number for frame in frames for number in frame.get_atomic_numbers()]
)
# regression targets: one value per structure, read from its info dictionary
y = np.array(
    [frame.info['dft_formation_energy_per_atom_in_eV'] for frame in frames]
)

In [None]:
# SOAP parameters forwarded as keyword arguments to the descriptor
soap_params = {
    'rc': 3.5,
    'nmax': 6,
    'lmax': 6,
    'awidth': 0.4,
    'global_species': global_species,
    'nocenters': [],
}
representation = RawSoapInternal(**soap_params)

# kernel: a power kernel (zeta=2) wrapped in a KernelSum
kernel = KernelSum(KernelPower(zeta=2), chunk_shape=[100, 100])

# splitting rationale: 6 shuffled folds with a fixed random state
cv = KFold(n_splits=6, shuffle=True, random_state=10)

# regression model: cross-validated KRR built from the jitter, 1. and the splitter
jitter = 1e-8
krr = KRRFastCV(jitter, 1., cv)

In [None]:
# SOAP vectors of the whole dataset, bundled with the descriptor's strides
# (strides are read after the transform has run)
rawsoaps = representation.transform(frames)
X = {'feature_matrix': rawsoaps, 'strides': representation.strides}
# display the shape of the feature matrix
rawsoaps.shape

In [None]:
# compute the kernel matrix for the whole dataset
Kmat = kernel.transform(X)
# fit the cross-validated model on the full kernel matrix and targets
krr.fit(Kmat,y)
# get the predictions for each fold (predict is called without arguments here)
y_pred = krr.predict()
# compute the CV score for the dataset; being the last expression, it is displayed
get_score(y_pred,y)

# Learning curve (LC)

In [None]:
# read every structure stored in the dataset file
frames = read('data/dft-smiles_500.xyz', ':')
# distinct atomic numbers found across all structures
global_species = np.unique(
    [number for frame in frames for number in frame.get_atomic_numbers()]
)
# regression targets: one value per structure, read from its info dictionary
y = np.array(
    [frame.info['dft_formation_energy_per_atom_in_eV'] for frame in frames]
)

In [None]:
# SOAP parameters forwarded as keyword arguments to the descriptor
soap_params = {
    'rc': 3.5,
    'nmax': 6,
    'lmax': 6,
    'awidth': 0.4,
    'global_species': global_species,
    'nocenters': [],
}
representation = RawSoapInternal(**soap_params)

# kernel: a power kernel (zeta=2) wrapped in a KernelSum
kernel = KernelSum(KernelPower(zeta=2), chunk_shape=[100, 100])

# regression model: KRR built from the jitter, 1. and a Cholesky trainer
trainer = TrainerCholesky(memory_efficient=True)
jitter = 1e-8
krr = KRR(jitter, 1., trainer)

# splitting rationale for the learning curve: ShuffleSplit-based splits over the
# listed training sizes with a fixed test size and random state
# (presumably n_repeats holds one repeat count per training size — confirm)
train_sizes = [20, 50, 100]
lc = LCSplit(
    ShuffleSplit,
    n_repeats=[20, 20, 20],
    train_sizes=train_sizes,
    test_size=100,
    random_state=10,
)

In [None]:
# SOAP vectors of the whole dataset, bundled with the descriptor's strides
# (strides are read after the transform has run)
rawsoaps = representation.transform(frames)
X = {'feature_matrix': rawsoaps, 'strides': representation.strides}
# kernel matrix over the whole dataset; the learning-curve loop slices it per split
K = kernel.transform(X)

In [None]:
# one list of score dictionaries per training-set size
scores = {size: [] for size in train_sizes}
for train, test in tqdm_cs(lc.split(y), total=lc.n_splits):
    # train on the (train, train) sub-block of the precomputed kernel
    krr.fit(K[np.ix_(train, train)], y[train])
    # predict from the (test, train) sub-block
    y_pred = krr.predict(K[np.ix_(test, train)])
    # file the score under the size of this split's training set
    scores[len(train)].append(get_score(y_pred, y[test]))

In [None]:
# metric to average, used as the key into each score dictionary
sc_name = 'RMSE'
# training sizes in the order they were inserted into `scores`
Ntrains = list(scores.keys())
avg_scores = []
for size in Ntrains:
    # plain sequential accumulation so the float result matches a manual sum
    total = 0
    for fold_score in scores[size]:
        total += fold_score[sc_name]
    avg_scores.append(total / len(scores[size]))

In [None]:
# learning curve: averaged test score against training-set size, on log-log axes
fig, ax = plt.subplots()
ax.plot(Ntrains, avg_scores, '--o')
ax.set_xlabel('Number of training samples')
ax.set_ylabel('Test {}'.format(sc_name))
ax.set_xscale('log')
ax.set_yscale('log')