In [None]:
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import train_test_split

sys.path.append("../Libs")
from Repres_utils import find_path,find_all_paths,distmat,bm_to_graph,append_dict,build_i_idx,integrity\
        ,angle_cos,dihedral_cos

Notebook for the prediction of the Hessian elements corresponding to the second derivative of the energy with respect to a dihedral defined by atoms $i-j-k-l$ and either the bond between the inner atoms of the dihedral ($j-k$) or one of the outer bonds ($i-j$ or $k-l$).

<img src="Figures/DBinternal.png" width="200" height="200" />
<img src="Figures/DBexternal.png" width="200" height="200" />


In [None]:
# Load the "x" and "y" arrays of the internal-coordinate dataset.
# NOTE(review): allow_pickle=True unpickles objects stored in the archive, which
# can execute arbitrary code -- only load archives from a trusted source.
with np.load("../Read Data/DataSet_H_IC.npz", allow_pickle=True) as data:
    # Both arrays are read while the archive is open; leaving the block closes
    # the underlying file handle, which the bare np.load call never did.
    X_IC,Y_IC = data["x"], data["y"]
# Append Y_IC as an extra column after the columns of X_IC
# (assumes X_IC is 2-D with one row per entry of Y_IC -- TODO confirm).
Data=np.vstack((X_IC.T,Y_IC)).T
Data.shape

In [None]:
from Nondiag_representation import dihedral_bond_core,dihedral_bond_arms,build_DB_repr#(charges,xyzcoords,BOM,idx,q,b)

In [None]:
def add_repr2mols(calcs):
    """Turn each calculation into its list of dihedral-bond rows.

    Every element of ``calcs`` must unpack into
    ``(charges, xyzcoords, BOM, idxs, q, B, g_ic, h_ic)``.  Only the entries of
    ``idxs`` that hold four atom indices generate output; each of them yields
    three rows of the form ``[tag, charges_of_the_four_atoms, values]``:

    * ``"E"`` for the order ``(i, j, k, l)``, target ``h_ic[b, i_idxs[(i, j)]]``;
    * ``"E"`` for the reversed order ``(l, k, j, i)``, target
      ``h_ic[b, i_idxs[(k, l)]]``;
    * ``"I"``, target ``h_ic[b, i_idxs[(j, k)]]`` with ``j, k`` taken after the
      charge-based reordering applied to the key.

    ``values`` is the output of ``dihedral_bond_arms`` (``"E"`` rows) or
    ``dihedral_bond_core`` (``"I"`` row), followed by the output of
    ``build_DB_repr`` and, as last element, the target.

    Returns a list containing one list of rows per calculation.
    """
    per_mol_rows=[]
    for charges,xyzcoords,BOM,idxs,q,B,g_ic,h_ic in calcs:
        rows=[]
        graph=bm_to_graph(BOM)
        coord_index=build_i_idx(idxs)
        for b,idx in enumerate(idxs):
            # entries with 2 or 3 indices (or any other length) produce no rows
            if len(idx)!=4:
                continue
            i,j,k,l=idx

            # the two "E" rows: one per reading direction of the dihedral
            for quad,outer_bond in (((i,j,k,l),(i,j)),((l,k,j,i),(k,l))):
                values=list(dihedral_bond_arms(charges,xyzcoords,BOM,quad,q,b))
                values.extend(build_DB_repr(charges,xyzcoords,BOM,quad,coord_index,q,graph,b))
                values.append(h_ic[b,coord_index[outer_bond]])
                rows.append(["E",tuple(charges[x] for x in quad),values])

            # the "I" row: features are always built from the original order
            values=list(dihedral_bond_core(charges,xyzcoords,BOM,(i,j,k,l),q,b))
            values.extend(build_DB_repr(charges,xyzcoords,BOM,(i,j,k,l),coord_index,q,graph,b))
            # the key (and the target lookup) use the reversed order when the
            # far end carries the larger charge, ties broken on the inner atoms
            flip=charges[l]>charges[i] or (charges[i]==charges[l] and charges[k]>charges[j])
            if flip:
                i,j,k,l=l,k,j,i
            values.append(h_ic[b,coord_index[(j,k)]])
            rows.append(["I",tuple(charges[x] for x in (i,j,k,l)),values])
        per_mol_rows.append(rows)
    return per_mol_rows

In [None]:
from multiprocessing import Pool
from functools import partial
def multi_process_repr(arr,num_processes = 35):
    """Apply ``add_repr2mols`` to ``arr`` in parallel and merge the results.

    Parameters
    ----------
    arr : array-like
        Collection of calculations; it is cut into ``num_processes`` chunks
        with ``np.array_split``.
    num_processes : int, optional
        Number of chunks and of worker processes (default 35).

    Returns
    -------
    list
        The per-chunk result lists of ``add_repr2mols`` concatenated in chunk
        order.
    """
    chunks=np.array_split(arr,num_processes)
    # pool.map blocks until every chunk is done, so the workers can be shut
    # down as soon as the with-block exits; previously the pool was never
    # closed and its worker processes were left alive.
    with Pool(processes=num_processes) as pool:
        results = pool.map(add_repr2mols,chunks)
    return  [item for list_ in results for item in list_]
# Build the rows for the whole dataset in parallel, with the default number of worker processes.
Mols=multi_process_repr(Data)


In [None]:
# Split the per-molecule row lists into a training and a test part. A fixed
# random_state makes the split (and everything derived from it) reproducible
# across kernel restarts.
train,test=train_test_split(Mols,random_state=0)

# Group the rows of each part by tag ("E" / "I") and by key (db[1]).
train_external,train_internal={},{}
test_external,test_internal={},{}
for subset,external,internal in ((train,train_external,train_internal),
                                 (test,test_external,test_internal)):
    for mol in subset:
        for db in mol:
            if db[0]=="E": append_dict(external,db[1],db[2])
            elif db[0]=="I": append_dict(internal,db[1],db[2])

# Turn every group into an array so that it can be sliced column-wise later on.
for ds in [test_external,test_internal,train_external,train_internal]:
    for key in ds:
        ds[key]=np.array(ds[key])

In [None]:
# Hold-out check for the "I" rows: for every key fit a random forest on the
# training rows and compare its predictions with the test targets.
pred_int={}
for key in test_internal:
    print(key)
    if key not in train_internal:
        continue
    # the target sits in the last column, the features in all the others
    x_train=train_internal[key][:,:-1]
    y_train=train_internal[key][:,-1]
    x_test=test_internal[key][:,:-1]
    y_test=test_internal[key][:,-1]
    # excluding the pairs which do not occur at least 10 times in the dataset
    if (len(y_test)+len(y_train))<10:
        continue
    forest=rfr(n_estimators=100,n_jobs=32)
    forest.fit(x_train,y_train)
    y_pred=forest.predict(x_test)
    pred_int[key]=(y_test,y_pred)
    # parity plot; the dotted y=x line spans the joint range of both value sets
    fig,ax=plt.subplots(figsize=(8,8))
    ax.scatter(y_test,y_pred,s=3)
    ax.set_xlabel("TRUE")
    ax.set_ylabel("PREDICTED")
    lo=min(min(y_pred),min(y_test))
    hi=max(max(y_pred),max(y_test))
    ax.plot([lo,hi],[lo,hi],ls=":",c="k")
    plt.show()

In [None]:
# Hold-out check for the "E" rows: for every key fit a random forest on the
# training rows and compare its predictions with the test targets.
pred_ext={}
for key in test_external:
    print(key)
    if key not in train_external:
        continue
    # the target sits in the last column, the features in all the others
    x_train=train_external[key][:,:-1]
    y_train=train_external[key][:,-1]
    x_test=test_external[key][:,:-1]
    y_test=test_external[key][:,-1]
    # excluding the pairs which do not occur at least 10 times in the dataset
    if (len(y_test)+len(y_train))<10:
        continue
    model=rfr(n_estimators=100,n_jobs=32)
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    pred_ext[key]=(y_test,y_pred)
    # parity plot; the dotted y=x line spans the joint range of both value sets
    fig,ax=plt.subplots(figsize=(8,8))
    ax.scatter(y_test,y_pred,s=3)
    ax.set_xlabel("TRUE")
    ax.set_ylabel("PREDICTED")
    lo=min(min(y_pred),min(y_test))
    hi=max(max(y_pred),max(y_test))
    ax.plot([lo,hi],[lo,hi],ls=":",c="k")
    plt.show()

In [None]:
# Pooled parity plots over all keys: one figure for the "I" predictions
# (pred_int) followed by one for the "E" predictions (pred_ext).
for preds in (pred_int,pred_ext):
    plt.figure(figsize=(8,8))
    for key in preds:
        # Entries are stored as (y_test, y_pred); they used to be unpacked in
        # the opposite order, which left both names pointing at the wrong array.
        y_test,y_pred=preds[key]
        plt.scatter(y_test,y_pred,s=3,c='C0')
        ml,Ml=min(min(y_pred),min(y_test)),max(max(y_pred),max(y_test))
        plt.plot([ml,Ml],[ml,Ml],ls=":",c="k")
    # same axis convention as the per-key plots: true on x, predicted on y
    plt.xlabel("TRUE")
    plt.ylabel("PREDICTED")

In [None]:
# Final models: regroup ALL rows (no train/test split) by tag and key, then fit
# one random forest per key.
All_int={}
All_ext={}
for mol in Mols:
    # The loop variable used to be called D_b while the body read `db`, i.e. a
    # stale value left over from an earlier cell; iterate with the name used.
    for db in mol:
        if db[0]=="E": append_dict(All_ext,db[1],db[2])
        elif db[0]=="I": append_dict(All_int,db[1],db[2])
for bex in All_int:
    All_int[bex]=np.asarray(All_int[bex])
for bex in All_ext:
    All_ext[bex]=np.asarray(All_ext[bex])

Models_int={}
for key in All_int:
    # last column is the target, the remaining columns are the features
    x_train,y_train=All_int[key][:,:-1],All_int[key][:,-1]
    rf = rfr(n_estimators=100,n_jobs=32)
    rf.fit(x_train, y_train)
    # fit in parallel, but store the model with n_jobs=1
    rf.n_jobs=1
    Models_int[key]=rf

Models_ext={}
for key in All_ext:
    # The data must come from All_ext; the previous code indexed the (still
    # empty) Models_ext dict and failed with a KeyError on the first key.
    x_train,y_train=All_ext[key][:,:-1],All_ext[key][:,-1]
    rf = rfr(n_estimators=100,n_jobs=32)
    rf.fit(x_train, y_train)
    rf.n_jobs=1
    Models_ext[key]=rf

In [None]:
from joblib import dump as jl_dump
from joblib import load as jl_load
# Write one file per "I" model; the file name is the four key entries
# concatenated. Create the target directory first so that the dump does not
# fail on a fresh checkout.
# NOTE(review): concatenating the entries without a separator can be ambiguous
# (e.g. (1,16,...) and (11,6,...) give the same prefix) -- confirm that no two
# keys collide before changing or relying on this naming scheme.
os.makedirs("./Saved_Models/DB_core",exist_ok=True)
for i in Models_int:
    jl_dump(Models_int[i],"./Saved_Models/DB_core/{}{}{}{}.joblib".format(*i))

In [None]:
# Write one file per "E" model; the file name is the four key entries
# concatenated. Create the target directory first so that the dump does not
# fail on a fresh checkout.
# NOTE(review): concatenating the entries without a separator can be ambiguous
# (e.g. (1,16,...) and (11,6,...) give the same prefix) -- confirm that no two
# keys collide before changing or relying on this naming scheme.
os.makedirs("./Saved_Models/DB_arm",exist_ok=True)
for j in Models_ext:
    jl_dump(Models_ext[j],"./Saved_Models/DB_arm/{}{}{}{}.joblib".format(*j))