In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import numpy as np
import ase
from ase.io import read,write
from ase.visualize import view
import sys,os
from glob import glob
from copy import copy
from tqdm import tqdm_notebook
import cPickle as pck

In [3]:
# Put a local directory first on the import path; presumably it holds the
# `ml_tools` package imported further down.
# NOTE(review): machine-specific absolute path — adjust for your checkout.
sys.path.insert(0,'/local/git/ml_tools/')

In [4]:
import quippy as qp

In [5]:
import sh

In [6]:
def get_sp_mapping(frames,sp):
    """Map each frame index to the global indices of its atoms of species `sp`.

    Global indices are assigned in frame order and count only the atoms whose
    atomic number equals `sp`; atoms of any other species do not consume an
    index.

    Parameters
    ----------
    frames : sequence of objects exposing ``get_atomic_numbers()``
    sp : atomic number to select

    Returns
    -------
    dict
        ``{frame_index: [global indices]}``; a frame without any matching atom
        maps to an empty list.
    """
    fid2gids = dict((fid, []) for fid in range(len(frames)))
    offset = 0
    for fid, frame in enumerate(frames):
        # Number of atoms of the requested species in this frame.
        n_match = sum(1 for number in frame.get_atomic_numbers() if number == sp)
        fid2gids[fid].extend(range(offset, offset + n_match))
        offset += n_match
    return fid2gids


In [7]:
from ml_tools.descriptors.quippy_interface import RawSoapQUIP
from ml_tools.models.KRR import KRR,TrainerCholesky
from ml_tools.models.pipelines import RegressorPipeline
from ml_tools.models.handlers import HashJsonHandler
from ml_tools.kernels.kernels import KernelPower
from ml_tools.io_utils import load_pck
from ml_tools.utils import get_mae,get_rmse,get_sup,get_spearman,get_score
from ml_tools.split.spliter import KFold,EnvironmentalKFold,LCSplit,ShuffleSplit,EnvironmentalShuffleSplit
from ml_tools.model_selection.scorer import CrossValidationScorer
from ml_tools.model_selection.gs import GridSearch
from ml_tools.base import KernelBase
from ml_tools.math_utils.optimized import power

# Train and predict

In [22]:
# Atomic number of the species whose 'CS' values are the regression targets.
sp = 1

# Location of the structure file; the first 10 frames form the training set.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames_train = read(fn, index=':10')

# `nocenters` is `global_species` with `sp` removed.
global_species = [1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Gather, frame by frame, the 'CS' entries of the atoms of species `sp`.
prop = [
    value
    for frame in frames_train
    for value in frame.get_array('CS')[frame.get_atomic_numbers() == sp]
]
y_train = np.array(prop)

In [23]:
# Kernel scaling taken from the spread of the training targets.
# (The bare name `y` is only defined by later cells, so using it here breaks
# a fresh Restart & Run All; `y_train` is what this section loaded.)
delta = np.std(y_train)
zeta = 2
jitter = 1e-8

# Keyword arguments forwarded unchanged to RawSoapQUIP.
soap_params = dict(rc=5, nmax=12, lmax=9, awidth=0.4,
                    cutoff_transition_width=0.5,
                    cutoff_dexp=0, cutoff_scale=1,cutoff_rate=1,
                    centerweight=1.,global_species=global_species,nocenters=nocenters,
                   fast_avg=False,is_sparse=False,disable_pbar=True)

# Building blocks of the model: kernel, trainer, representation and regressor.
kernel = KernelPower(zeta,delta)
trainer = TrainerCholesky(memory_efficient=True)
representation = RawSoapQUIP(**soap_params)
krr = KRR(jitter,trainer)

In [24]:
# Transform the training frames with the representation, build the training
# kernel from the result, and fit the KRR model on the training targets.
rawsoaps = representation.transform(frames_train)
kk = kernel.transform(rawsoaps)
krr.fit(kk,y_train)

In [25]:
# Frames 10..19 of the same file serve as the test set.
frames_test = read(fn, index='10:20')

# Reference 'CS' values of the atoms of species `sp`, in frame order.
prop = [
    value
    for frame in frames_test
    for value in frame.get_array('CS')[frame.get_atomic_numbers() == sp]
]
y_true = np.array(prop)

In [27]:
# Transform the test frames, build the kernel between the test and training
# representations, predict with the fitted model and display the scores.
rawsoaps_test = representation.transform(frames_test)
kk_test = kernel.transform(rawsoaps_test,X_train=rawsoaps)
y_pred = krr.predict(kk_test)
# The recorded output is a tuple of five score values.
get_score(y_pred,y_true)

(1.8079299206273776,
 2.4277177081353867,
 7.322334419945037,
 0.4631398849450846,
 0.7504786883833238)

# Cross-validation (CV)

In [35]:
# Atomic number of the species whose 'CS' values are the regression targets.
sp = 1

# Location of the structure file; the first 10 frames are used here.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames = read(fn, index=':10')

# `nocenters` is `global_species` with `sp` removed.
global_species = [1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Gather, frame by frame, the 'CS' entries of the atoms of species `sp`.
prop = [
    value
    for frame in frames
    for value in frame.get_array('CS')[frame.get_atomic_numbers() == sp]
]
y = np.array(prop)

# Frame index -> global indices of that frame's atoms of species `sp`.
mapping = get_sp_mapping(frames, sp)

In [36]:
# Kernel scaling taken from the spread of the targets.
delta = np.std(y)
zeta = 2
jitter = 1e-8

# Keyword arguments forwarded unchanged to RawSoapQUIP.
soap_params = {
    'rc': 5,
    'nmax': 12,
    'lmax': 9,
    'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0,
    'cutoff_scale': 1,
    'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species,
    'nocenters': nocenters,
    'fast_avg': False,
    'is_sparse': False,
    'disable_pbar': True,
}

# Building blocks of the model: kernel, trainer, representation and regressor.
kernel = KernelPower(zeta, delta)
trainer = TrainerCholesky(memory_efficient=True)
representation = RawSoapQUIP(**soap_params)
krr = KRR(jitter, trainer)

# Six-fold splitter that receives the frame -> environment mapping.
cv = EnvironmentalKFold(
    n_splits=6,
    random_state=10,
    shuffle=True,
    mapping=mapping,
)


In [37]:
# Transform the cross-validation frames once; the result is reused below.
rawsoaps = representation.transform(frames)

In [40]:

def compute_cross_validation_error(rawsoaps,y,kernel,jitter,cv):
    """Cross-validation residuals of kernel ridge regression without refitting.

    The regularised kernel ``Q = K + jitter * I`` is inverted once on the full
    data set. For every test fold ``T`` returned by ``cv.split`` the held-out
    residual ``y_true - y_pred`` is then obtained by solving
    ``Q_inv[T, T] * r = alpha[T]`` with ``alpha = Q_inv . y``.

    Parameters
    ----------
    rawsoaps : input handed to ``kernel.transform`` and ``cv.split``
    y : array-like of targets, one per row of the kernel matrix
    kernel : object whose ``transform(rawsoaps)`` returns the square kernel matrix
    jitter : value added to the diagonal of the kernel matrix
    cv : object whose ``split(rawsoaps)`` yields ``(train, test)`` index pairs;
        only the ``test`` indices are used

    Returns
    -------
    numpy.ndarray
        Residuals with the shape of ``y``; entries that never appear in a test
        fold stay at zero.
    """
    y = np.asarray(y)  # accept plain sequences as well as arrays
    K = kernel.transform(rawsoaps)

    # Add the jitter on the diagonal of a float copy of K instead of building
    # a second dense matrix with np.diag.
    Q = np.array(K, dtype=float)
    Q[np.diag_indices_from(Q)] += jitter

    Q_inv = np.linalg.inv(Q)
    alpha = np.dot(Q_inv, y)

    error = np.zeros(y.shape)
    for _, test in cv.split(rawsoaps):
        # Solution of the fold-sized system equals y_true - y_pred on the fold.
        error[test] = np.linalg.solve(Q_inv[np.ix_(test, test)], alpha[test])
    return error

In [43]:
# Cross-validation residuals, then their root-mean-square and the raw values.
# Single-argument print(...) prints the same under Python 2 and also parses
# under Python 3.
error = compute_cross_validation_error(rawsoaps,y,kernel,jitter,cv)
print(np.sqrt(np.mean(error**2)))
print(error)


1.975558350171262
[-5.38835793e-01 -5.44934236e-01 -5.38827481e-01 -5.44942394e-01
  1.19339224e+00  1.17967875e+00  1.19341209e+00  1.17966460e+00
 -6.77844718e-01 -6.85082210e-01 -6.77843657e-01 -6.85076229e-01
  7.32893975e-01  7.38643107e-01  7.32909128e-01  7.38648855e-01
  9.67222176e-01  9.63415471e-01  9.67225483e-01  9.63418982e-01
  1.44494831e+00  1.44281050e+00  1.44496489e+00  1.44281265e+00
 -4.01776629e-01 -4.31315976e-01 -4.01763944e-01 -4.31289025e-01
 -1.52790804e+00 -1.53104061e+00 -1.52791281e+00 -1.53105148e+00
 -2.47070846e+00 -2.47447121e+00 -2.47072362e+00 -2.47448512e+00
  1.26198601e+00  1.26653595e+00  1.26199495e+00  1.26653194e+00
 -1.44213284e+00 -1.44081971e+00 -1.44212790e+00 -1.44081305e+00
 -1.91712728e+00 -1.91781939e+00 -1.91712609e+00 -1.91782232e+00
  2.58334975e-01  2.45891594e-01  2.58329186e-01  2.45887107e-01
 -9.43765082e-01 -9.42280688e-01 -9.43761392e-01 -9.42282589e-01
 -1.67182781e+00 -1.68471648e+00 -1.67182975e+00 -1.68471589e+00
 -2.63

# Learning curve (LC)

In [8]:
# Atomic number of the species whose 'CS' values are the regression targets.
sp = 1

# Location of the structure file; the first 20 frames are used here.
path = './'
xyzPath = path + ''
fn = xyzPath + 'CSD500.xyz'
frames = read(fn, index=':20')

# `nocenters` is `global_species` with `sp` removed.
global_species = [1, 6, 7, 8]
nocenters = list(global_species)
nocenters.remove(sp)

# Gather, frame by frame, the 'CS' entries of the atoms of species `sp`.
prop = [
    value
    for frame in frames
    for value in frame.get_array('CS')[frame.get_atomic_numbers() == sp]
]
y = np.array(prop)

# Frame index -> global indices of that frame's atoms of species `sp`.
mapping = get_sp_mapping(frames, sp)

In [9]:
# Kernel scaling taken from the spread of the targets.
delta = np.std(y)
zeta = 2
jitter = 1e-8

# Keyword arguments forwarded unchanged to RawSoapQUIP.
soap_params = {
    'rc': 5,
    'nmax': 12,
    'lmax': 9,
    'awidth': 0.4,
    'cutoff_transition_width': 0.5,
    'cutoff_dexp': 0,
    'cutoff_scale': 1,
    'cutoff_rate': 1,
    'centerweight': 1.,
    'global_species': global_species,
    'nocenters': nocenters,
    'fast_avg': False,
    'is_sparse': False,
    'disable_pbar': True,
}

# Building blocks of the model: kernel, trainer, representation and regressor.
kernel = KernelPower(zeta, delta)
trainer = TrainerCholesky(memory_efficient=True)
representation = RawSoapQUIP(**soap_params)
krr = KRR(jitter, trainer)


In [10]:
# Transform all frames once and build the full kernel matrix; the loop over
# splits below only slices K instead of recomputing it.
rawsoaps = representation.transform(frames)
K = kernel.transform(rawsoaps)

In [11]:
# Splitter built on EnvironmentalShuffleSplit with a fixed random_state.
# NOTE(review): presumably n_repeats[i] splits are drawn for train_sizes[i]
# (5 repeats each for sizes 5 and 10) — confirm against LCSplit.
lc = LCSplit(EnvironmentalShuffleSplit, n_repeats=[5,5],train_sizes=[5,10],test_size=10, random_state=10,mapping=mapping)


In [12]:
# For every (train, test) index pair produced by `lc`, slice the precomputed
# kernel K, refit the KRR model and append the scores of its predictions.
# NOTE(review): `y_train` is reassigned here and therefore shadows any
# `y_train` defined by another section of the notebook.
score = []
for train,test in tqdm_notebook(lc.split(rawsoaps),total=lc.n_splits):
    # Kernel block between the training indices, and the matching targets.
    k_train = K[np.ix_(train,train)]
    y_train = y[train]
    # Rows are the test indices, columns the training indices.
    k_test = K[np.ix_(test,train)]
    krr.fit(k_train,y_train)
    y_pred = krr.predict(k_test)
    score.append(get_score(y_pred,y[test]))




In [13]:
# Display the list of per-split score tuples.
score

[(1.5040761150245494,
  2.0170012924798995,
  7.1407129556284055,
  0.6238729855304694,
  0.8351685950959058),
 (1.8873528946165328,
  2.6422783537895853,
  10.284922030440192,
  0.449863735634615,
  0.7045435866250319),
 (2.370721459963461,
  3.1508081966117762,
  10.25660212709925,
  0.21106256897226416,
  0.6581540077884626),
 (1.8132095878687884,
  2.4344083428252534,
  9.77793574686686,
  0.130078274393702,
  0.7058873155215595),
 (2.0395784125758576,
  2.6574837168083176,
  8.171710137547052,
  0.07577558422537345,
  0.732689786229516),
 (1.5609877835110493,
  2.153401763043191,
  6.506300750486552,
  0.6065280691052959,
  0.8280476400320381),
 (1.552557466697036,
  2.272520953277722,
  7.987756671943718,
  0.5083343135812903,
  0.7985794887599824),
 (1.4780148054543294,
  1.9712179054388586,
  8.414909720681607,
  0.6523318257472059,
  0.8384631430925863),
 (1.7242058401302203,
  2.234832691875089,
  7.342592677637004,
  0.2554672312337254,
  0.771131928716742),
 (1.418823593694