In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys,os
# Modify to your own path
sys.path.insert(0,'/local/git/ml_tools/')

In [None]:
from ml_tools.descriptors import RawSoapInternal
from ml_tools.models.KRR import KRR,TrainerCholesky,KRRFastCV
from ml_tools.kernels.kernels import KernelPower,KernelSum
from ml_tools.utils import get_mae,get_rmse,get_sup,get_spearman,get_score,load_pck

In [None]:
import numpy as np
import ase
from ase.io import read,write
from ase.visualize import view
from glob import glob
from copy import copy
from tqdm import tqdm_notebook
import cPickle as pck

# Build a kernel matrix

In [None]:
# Read every configuration stored in the xyz file.
frames = read('data/dft-smiles_500.xyz',':')

# Collect the atomic numbers of all the frames, then keep the distinct values.
atomic_numbers = []
for frame in frames:
    atomic_numbers += list(frame.get_atomic_numbers())
global_species = np.unique(atomic_numbers)

# First 300 frames for training, the rest for testing.
frames_train, frames_test = frames[:300], frames[300:]

In [None]:
# Parameters handed to the SOAP representation; no species is excluded
# from the centres (nocenters is empty).
soap_params = {
    'rc': 3.5, 'nmax': 6, 'lmax': 6, 'awidth': 0.4,
    'global_species': global_species,
    'nocenters': [],
}

representation = RawSoapInternal(**soap_params)
# Sum kernel built on a power kernel with zeta = 2.
kernel = KernelSum(KernelPower(zeta = 2),chunk_shape=[100,100])

In [None]:
# SOAP vectors of the training frames, stored together with the strides
# reported by the representation.
rawsoaps = representation.transform(frames_train)
X_train = {'feature_matrix': rawsoaps, 'strides': representation.strides}

# Same for the test frames (`rawsoaps` is rebound to the test features).
rawsoaps = representation.transform(frames_test)
X_test = {'feature_matrix': rawsoaps, 'strides': representation.strides}

In [None]:
# Compute the square (train x train) kernel matrix.
# The original passed `X`, which is never defined in this notebook; the
# training features built in the previous cell are stored in `X_train`.
Kmat = kernel.transform(X_train)

In [None]:
# Compute a rectangular kernel matrix between the test and training features.
# The original passed the undefined name `X` as second argument; the
# training features are stored in `X_train`.
Kmat_rect = kernel.transform(X_test,X_train)

# FPS selection of the samples

In [None]:
# Atomic number selected for the properties; all the other species are
# listed in `nocenters`.
sp = 1
global_species=[1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Read the first 10 frames of the dataset.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames_train = read(fn,index=':10')

# Gather the 'CS' entries of the atoms of species `sp`, frame after frame.
prop = []
for cc in frames_train:
    is_sp = cc.get_atomic_numbers() == sp
    prop += list(cc.get_array('CS')[is_sp])
y_train = np.array(prop)

In [None]:
# NOTE(review): RawSoapQUIP and FPSFilter are not imported in any cell
# visible in this notebook -- confirm which ml_tools module provides them.
zeta = 2
Nselect = 6
delta = np.std(y_train)

soap_params = {
    'rc': 5, 'nmax': 12, 'lmax': 9, 'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0, 'cutoff_scale': 1, 'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species, 'nocenters': nocenters,
    'fast_avg': False, 'is_sparse': False, 'disable_pbar': True,
}

representation = RawSoapQUIP(**soap_params)
kernel = KernelPower(zeta,delta)
# FPS filter acting on the samples, selecting Nselect of them.
compressor = FPSFilter(Nselect,kernel,act_on='sample',precompute_kernel=True,disable_pbar=True)

In [None]:
# Compute the SOAP features of the training frames and display their shape.
rawsoaps = representation.transform(frames_train)
rawsoaps.shape

In [None]:
# Fit the FPS filter with dry_run=True, then show its plot.
# NOTE(review): presumably the dry run is used to choose Nselect -- confirm.
compressor.fit(rawsoaps,dry_run=True)
compressor.plot()

In [None]:
# Rebuild the FPS filter with a larger number of selected samples and fit it
# for real (no dry run) on the same features.
Nselect = 250
compressor = FPSFilter(Nselect,kernel,act_on='sample',precompute_kernel=True,disable_pbar=True)
compressor.fit(rawsoaps)

# Filter equivalent atoms

In [None]:
# Species selected for the properties; every other species goes in `nocenters`.
sp = 1
global_species=[1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Read the first 50 frames of the dataset.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames_train = read(fn,index=':50')

# 'CS' entries of the atoms whose atomic number is `sp`, in frame order.
prop = []
for cc in frames_train:
    is_sp = cc.get_atomic_numbers() == sp
    prop += list(cc.get_array('CS')[is_sp])
y_train = np.array(prop)

In [None]:
# Kernel hyper-parameters (defined here even though this cell builds no kernel).
zeta = 2
delta = np.std(y_train)

# SOAP parameters for this section.
soap_params = {
    'rc': 5, 'nmax': 9, 'lmax': 6, 'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0, 'cutoff_scale': 1, 'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species, 'nocenters': nocenters,
    'fast_avg': False, 'is_sparse': False, 'disable_pbar': True,
}
representation = RawSoapQUIP(**soap_params)

In [None]:
# SOAP features of the 50 training frames read above.
rawsoaps = representation.transform(frames_train)

In [None]:
# Symmetry filter restricted to species `sp`, applied to the features and
# the targets together.
# NOTE(review): SymmetryFilter is not imported in any visible cell.
filt = SymmetryFilter(threshold=1e-4,species=[sp])
X_filt, y_filt = filt.fit_transform(dict(frames=frames_train,feature_matrix=rawsoaps),y=y_train)
print X_filt.shape,filt.strides[-1]
print len(filt.filter_ids_inv)
# Round-trip check: the inverse transform is compared with the unfiltered
# features and targets (absolute tolerance 1e-4).
a,b = filt.inverse_transform(X_filt,y_filt)
print np.allclose(rawsoaps,a,atol=1e-4),np.allclose(y_train,b,atol=1e-4)



In [None]:
# Difference between the original targets and the targets recovered by the
# inverse transform of the symmetry filter.
plt.plot(y_train-b,'o',ms=1)

# Cross-validation (CV)

In [None]:
# Species selected for the properties; every other species goes in `nocenters`.
sp = 1
global_species=[1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Read the first 100 frames of the dataset.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames = read(fn,index=':100')

# 'CS' entries of the atoms whose atomic number is `sp`, in frame order.
prop = []
for cc in frames:
    is_sp = cc.get_atomic_numbers() == sp
    prop += list(cc.get_array('CS')[is_sp])
y = np.array(prop)

# NOTE(review): get_sp_mapping is not imported in any visible cell.
mapping = get_sp_mapping(frames,sp)

In [None]:
zeta = 2
jitter = 1e-8
delta = 0.01*np.std(y)

soap_params = {
    'rc': 5, 'nmax': 12, 'lmax': 9, 'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0, 'cutoff_scale': 1, 'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species, 'nocenters': nocenters,
    'fast_avg': False, 'is_sparse': False, 'disable_pbar': True,
}

representation = RawSoapQUIP(**soap_params)
kernel = KernelPower(zeta,delta)
trainer = TrainerCholesky(memory_efficient=True)

# 6-fold split driven by `mapping`, shuffled with a fixed seed.
cv = EnvironmentalKFold(n_splits=6,random_state=10,shuffle=True,mapping=mapping)
# NOTE(review): another cell calls KRRFastCV(jitter,delta,cv) -- confirm
# which signature matches the installed ml_tools.
krr = KRRFastCV(jitter,cv)

In [None]:
# SOAP features of the 100 frames read above.
rawsoaps = representation.transform(frames)

In [None]:
# Square kernel matrix of the features, fitted with the cross-validated model.
Kmat = kernel.transform(rawsoaps)
krr.fit(Kmat,y)
# predict() is called without arguments on the KRRFastCV model.
y_pred = krr.predict()
# Score of the predictions against the reference values (cell output).
get_score(y_pred,y)

## Test with a sparse kernel

### ref

In [None]:
# Species selected for the properties; every other species goes in `nocenters`.
sp = 1
global_species=[1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Read the first 100 frames of the dataset.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames = read(fn,index=':100')

# 'CS' entries of the atoms whose atomic number is `sp`, in frame order.
prop = []
for cc in frames:
    is_sp = cc.get_atomic_numbers() == sp
    prop += list(cc.get_array('CS')[is_sp])
y = np.array(prop)

mapping = get_sp_mapping(frames,sp)

In [None]:
zeta = 2
jitter = 1e-8
Nselect = 10
delta = 0.01*np.std(y)

soap_params = {
    'rc': 5, 'nmax': 12, 'lmax': 9, 'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0, 'cutoff_scale': 1, 'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species, 'nocenters': nocenters,
    'fast_avg': False, 'is_sparse': False, 'disable_pbar': True,
}

representation = RawSoapQUIP(**soap_params)
# Here delta is given to the KRR model rather than to the kernel.
kern = KernelPower(zeta)
trainer = TrainerCholesky(memory_efficient=True)
compressor = FPSFilter(Nselect,kern,act_on='sample',precompute_kernel=True,disable_pbar=True)
#filt = SymmetryFilter(threshold=1e-4,species=[sp])

cv = EnvironmentalKFold(n_splits=6,random_state=10,shuffle=True,mapping=mapping)
# NOTE(review): another cell calls KRR(jitter,trainer) without delta --
# confirm which signature matches the installed ml_tools.
krr = KRR(jitter,delta,trainer)

In [None]:
# SOAP features of the 100 frames read above.
rawsoaps = representation.transform(frames)

In [None]:
#X_filt, y_filt = filt.fit_transform(dict(frames=frames,feature_matrix=rawsoaps),y=y)

In [None]:
# Fit the FPS filter with dry_run=True, then show its plot.
compressor.fit(rawsoaps,dry_run=True)
compressor.plot()

In [None]:
# Pseudo-inputs: features of (at most) the first 3000 samples selected by
# the FPS filter.
X_pseudo = rawsoaps[compressor.selected_ids[:3000]]
# NOTE(review): KernelSparseSoR is not imported in any visible cell.
kernel = KernelSparseSoR(kern,X_pseudo=X_pseudo,Lambda=0.1)
scores = []
error = []
# Placeholder array; the entries of each test fold are overwritten below.
y_pred = np.ones(y.shape)
for train,test in cv.split(rawsoaps):
    # With y given, the sparse kernel returns the kernel and the targets to fit.
    Ktrain,ytrain = kernel.transform(X=rawsoaps[train],y=y[train])
    Ktest = kernel.transform(X=rawsoaps[test])
    krr.fit(Ktrain,ytrain)
    y_pred[test] = krr.predict(Ktest)
    #error.extend(y_pred-y[test])
    #scores.append(get_score(y_pred,y[test]))
#score = (np.mean(np.abs(error)),np.sqrt(np.mean(np.square(error))))
# Score of the out-of-fold predictions against the reference values.
score = get_score(y_pred,y)
print score

# Learning curve (LC)

In [None]:
# Species selected for the properties; every other species goes in `nocenters`.
sp = 1
global_species=[1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Read the first 20 frames of the dataset.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames = read(fn,index=':20')

# 'CS' entries of the atoms whose atomic number is `sp`, in frame order.
prop = []
for cc in frames:
    is_sp = cc.get_atomic_numbers() == sp
    prop += list(cc.get_array('CS')[is_sp])
y = np.array(prop)

mapping = get_sp_mapping(frames,sp)

In [None]:
zeta = 2
jitter = 1e-8
delta = np.std(y)

soap_params = {
    'rc': 5, 'nmax': 12, 'lmax': 9, 'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0, 'cutoff_scale': 1, 'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species, 'nocenters': nocenters,
    'fast_avg': False, 'is_sparse': False, 'disable_pbar': True,
}

representation = RawSoapQUIP(**soap_params)
kernel = KernelPower(zeta,delta)
trainer = TrainerCholesky(memory_efficient=True)
# NOTE(review): other cells call KRR(jitter,delta,trainer) -- confirm which
# signature matches the installed ml_tools.
krr = KRR(jitter,trainer)


In [None]:
# SOAP features of the frames and the full square kernel matrix; the
# learning-curve loop slices train/test blocks out of K.
rawsoaps = representation.transform(frames)
K = kernel.transform(rawsoaps)

In [None]:
# Learning-curve splitter: train sizes 5 and 10, 5 repeats each, test size 10.
# NOTE(review): LCSplit and EnvironmentalShuffleSplit are not imported in
# any visible cell.
lc = LCSplit(EnvironmentalShuffleSplit, n_repeats=[5,5],train_sizes=[5,10],test_size=10, random_state=10,mapping=mapping)


In [None]:
# One score per split of the learning curve.
score = []
for train,test in tqdm_notebook(lc.split(rawsoaps),total=lc.n_splits):
    # train x train block used for fitting
    k_train = K[np.ix_(train,train)]
    # NOTE: this rebinds the notebook-level name `y_train`.
    y_train = y[train]
    # test x train block used for predicting
    k_test = K[np.ix_(test,train)]
    krr.fit(k_train,y_train)
    y_pred = krr.predict(k_test)
    score.append(get_score(y_pred,y[test]))

In [None]:
# Scratch estimate: 35000**2 entries times 8, divided by 1e9.
# NOTE(review): presumably the size in GB of a dense 35000 x 35000 float64
# matrix -- confirm.
35000**2*8/1e9

# Procedure of the paper (updated)

## Detect outliers in the training set

In [None]:
# Species selected for the properties; every other species goes in `nocenters`.
sp = 1
global_species=[1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Read the first 50 frames of the dataset.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames_train = read(fn,index=':50')

# 'CS' entries of the atoms whose atomic number is `sp`, in frame order.
prop = []
for cc in frames_train:
    is_sp = cc.get_atomic_numbers() == sp
    prop += list(cc.get_array('CS')[is_sp])
y_train = np.array(prop)


In [None]:
zeta = 2
jitter = 1e-8
delta = 0.1*np.std(y_train)

soap_params = {
    'rc': 3.5, 'nmax': 9, 'lmax': 6, 'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0, 'cutoff_scale': 1, 'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species, 'nocenters': nocenters,
    'fast_avg': False, 'is_sparse': False, 'disable_pbar': True,
}

representation = RawSoapQUIP(**soap_params)
kernel = KernelPower(zeta,delta)
# Symmetry filter restricted to species `sp`.
filt = SymmetryFilter(threshold=1e-4,species=[sp])


In [None]:
# SOAP features of the training frames.
rawsoaps = representation.transform(frames_train)

# Apply the symmetry filter to features and targets, then build the square
# kernel matrix on the filtered features.
X_filt, y_filt = filt.fit_transform(dict(frames=frames_train,feature_matrix=rawsoaps),y=y_train)
Kmat = kernel.transform(X_filt)

In [None]:
# Map each frame index to the range of filtered-sample indices delimited by
# two consecutive entries of filt.strides.
mapping = {}
n_frames = len(filt.strides) - 1
for iframe in range(n_frames):
    st, nd = filt.strides[iframe], filt.strides[iframe+1]
    mapping[iframe] = range(st,nd)

In [None]:
# Draw 100 reproducible seeds, one per repetition of the cross-validation.
np.random.seed(10)
seeds = np.random.randint(low=0,high=1000,size=(100,))
preds = []
for seed in tqdm_notebook(seeds):
    # New 3-fold split and cross-validated model for every seed.
    cv = EnvironmentalKFold(n_splits=3,random_state=seed,shuffle=True,mapping=mapping)
    krr = KRRFastCV(jitter,cv)
    
    krr.fit(Kmat,y_filt)
    
    yp_filt = krr.predict()
    # Undo the symmetry filter on the predictions; the first returned value
    # is discarded.
    _,y_pred = filt.inverse_transform(y=yp_filt)
    preds.append(y_pred)
# One row of predictions per seed.
preds = np.array(preds)

In [None]:
# Mean and standard deviation, over the seeds, of the prediction error of
# every sample.
residuals = preds - y_train
m = residuals.mean(axis=0)
s = residuals.std(axis=0)

In [None]:
# Per-sample error spread, with a horizontal line at three times its mean.
threshold = 3*s.mean()
plt.plot(s,'o',ms=2,)
plt.plot(threshold*np.ones(len(m)),'-',ms=2,)

In [None]:
# Reference values, with the samples whose error spread exceeds three times
# the mean spread highlighted in red.
is_outlier = s > 3*s.mean()
plt.plot(y_train,'o',ms=2)
plt.plot(np.where(is_outlier)[0],y_train[is_outlier],'ro',ms=2,)

In [None]:
# Mapping built for species `sp`; iterated below as (frame index, centers).
mapping_sp = get_sp_mapping(frames_train,sp)

# Indices of the samples whose error spread exceeds three times the mean spread.
iii = np.where(s>3*s.mean())[0]
suspicious_center = iii
# NOTE(review): `sp` is rebound here, after mapping_sp was built, and `name`
# is not used in this cell.
sp,name=1,'ccsd'
suspicious_frame = []
for icenter in iii:
    # Record the first frame whose centers contain this sample index.
    for iframe,centers in mapping_sp.iteritems():
        if icenter in centers:
            suspicious_frame.append(iframe)
            break

#print np.unique(suspicious_frame)
# Distinct frame indices, displayed as the cell output.
suspicious_frame = np.unique(suspicious_frame)
suspicious_frame

## FPS filtering

In [None]:
# Species selected for the properties; every other species goes in `nocenters`.
sp = 1
global_species=[1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Read the first 50 frames of the dataset.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames_train = read(fn,index=':50')

In [None]:
suspicious_frames = [18]
for idx in suspicious_frames:
    frames_train.pop(idx)
print len(frames_train)

prop = []
for cc in frames_train:
    numb = cc.get_atomic_numbers()
    prop.extend(cc.get_array('CS')[numb==sp])
y_train = np.array(prop)

In [None]:
zeta = 2
Nselect = 6
delta = np.std(y_train)

soap_params = {
    'rc': 5, 'nmax': 12, 'lmax': 9, 'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0, 'cutoff_scale': 1, 'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species, 'nocenters': nocenters,
    'fast_avg': False, 'is_sparse': False, 'disable_pbar': True,
}

representation = RawSoapQUIP(**soap_params)
kernel = KernelPower(zeta,delta)
# FPS filter acting on the samples, and symmetry filter restricted to `sp`.
compressor = FPSFilter(Nselect,kernel,act_on='sample',precompute_kernel=True,disable_pbar=True)
filt = SymmetryFilter(threshold=1e-4,species=[sp])

In [None]:
# SOAP features of the remaining training frames.
rawsoaps = representation.transform(frames_train)

# Apply the symmetry filter to features and targets.
X_filt, y_filt = filt.fit_transform(dict(frames=frames_train,feature_matrix=rawsoaps),y=y_train)


In [None]:
# Fit the FPS filter on the symmetry-filtered features with dry_run=True,
# then show its plot.
compressor.fit(X_filt,dry_run=True)
compressor.plot()

## CV score for a model

In [None]:
# Species selected for the properties; every other species goes in `nocenters`.
sp = 1
global_species=[1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Read the first 50 frames of the dataset.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames_train = read(fn,index=':50')

In [None]:
suspicious_frames = [18]
for idx in suspicious_frames:
    frames_train.pop(idx)
print len(frames_train)

In [None]:
# 'CS' entries of the atoms whose atomic number is `sp`, in frame order.
prop = []
for cc in frames_train:
    is_sp = cc.get_atomic_numbers() == sp
    prop += list(cc.get_array('CS')[is_sp])
y_train = np.array(prop)

In [None]:
zeta = 2
jitter = 1e-8
Nselect = 1000
delta = 0.1*np.std(y_train)

soap_params = {
    'rc': 3.5, 'nmax': 9, 'lmax': 6, 'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0, 'cutoff_scale': 1, 'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species, 'nocenters': nocenters,
    'fast_avg': False, 'is_sparse': False, 'disable_pbar': True,
}

representation = RawSoapQUIP(**soap_params)
# Here delta is not given to the kernel; it is passed to the models instead.
kernel = KernelPower(zeta)
filt = SymmetryFilter(threshold=1e-4,species=[sp])
compressor = FPSFilter(Nselect,kernel,act_on='sample',precompute_kernel=True,disable_pbar=True)


In [None]:
# SOAP features of the remaining training frames.
rawsoaps = representation.transform(frames_train)

# Apply the symmetry filter to features and targets, then build the square
# kernel matrix on the filtered features.
X_filt, y_filt = filt.fit_transform(dict(frames=frames_train,feature_matrix=rawsoaps),y=y_train)
Kmat = kernel.transform(X_filt)

In [None]:
# Map each frame index to the range of filtered-sample indices delimited by
# two consecutive entries of filt.strides.
starts, stops = filt.strides[:-1], filt.strides[1:]
mapping = {}
for iframe,(st,nd) in enumerate(zip(starts,stops)):
    mapping[iframe] = range(st,nd)

In [None]:
# 10-fold split driven by `mapping`, shuffled with an explicit seed.
# The original passed random_state=seed, where `seed` only exists as a loop
# variable leaked from a cell of the outlier-detection section; the fixed
# value matches the random_state=10 used by the other CV cells.
cv = EnvironmentalKFold(n_splits=10,random_state=10,shuffle=True,mapping=mapping)
# NOTE(review): other cells call KRRFastCV(jitter,cv) without delta --
# confirm which signature matches the installed ml_tools.
krr = KRRFastCV(jitter,delta,cv)

In [None]:

# Fit the cross-validated model on the filtered kernel and targets.
krr.fit(Kmat,y_filt)

# Cross-validated predictions, mapped back through the symmetry filter; the
# first returned value is discarded.
yp_filt = krr.predict()
_,y_pred = filt.inverse_transform(y=yp_filt)

# NOTE(review): the other cells call get_score(y_pred, y) with the
# prediction first -- confirm whether the argument order matters.
get_score(y_train,y_pred)

In [None]:
# NOTE(review): frames_test is only assigned in the first section, from
# 'data/dft-smiles_500.xyz', while this section trains on 'CSD500.xyz' --
# confirm which test frames are intended here.
rawsoaps = representation.transform(frames_test)

# Rectangular kernel with the test features first and the training features
# second, as in the other cells (kernel.transform(X_test, ...) and
# K[np.ix_(test,train)]); the original had the two arguments swapped.
Ktest = kernel.transform(rawsoaps,X_filt)

In [None]:
# Train the final model on the filtered training kernel and predict on the
# test kernel.
trainer = TrainerCholesky(memory_efficient=True)
krr = KRR(jitter,delta,trainer)
# Kmat was built from X_filt, so the matching targets are y_filt; the
# original passed `y`, a leftover from an earlier section.
krr.fit(Kmat,y_filt)
# The other cells call `predict` on KRR models; the original called
# `prediction` here.
y_pred = krr.predict(Ktest)
