# Load data

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, LinearRegression, BayesianRidge
import warnings
warnings.filterwarnings("ignore")
from mmltoolkit.CV_tools import *
from mmltoolkit.featurizations import * 
from mmltoolkit.fingerprints import * 

# Read the spreadsheet; skipfooter=1 drops its final row.
data = pd.read_excel(
    '../datasets/Huang_Massa_data_with_all_SMILES.xlsx',
    skipfooter=1,
)

target_prop = 'Explosive energy (kj/cc)'

# Build one molecule object per SMILES string and immediately add explicit
# hydrogens (important: the featurizers below are fed the H-added molecules).
# `Chem` is not imported explicitly in this cell; presumably it arrives via
# the mmltoolkit star imports -- TODO confirm.
data['Mols'] = data['SMILES'].apply(Chem.MolFromSmiles).apply(Chem.AddHs)

# Estate-based feature matrix for the whole dataset.
X_Estate = truncated_Estate_featurizer(data['Mols'].tolist())

num_mols = data.shape[0]

# All target property columns available in the spreadsheet.
targets = [
    'Density (g/cm3)',
    'Delta Hf solid (kj/mol)',
    'Explosive energy (kj/cc)',
    'Shock velocity (km/s)',
    'Particle velocity (km/s)',
    'Speed of sound (km/s)',
    'Pressure (Gpa)',
    'T(K)',
    'TNT Equiv (per cc)',
]




In [2]:
# Display the column names of the loaded frame, including the derived 'Mols' column.
data.columns

Index(['num', 'Molecular Name', 'Formula', 'Density (g/cm3)',
       'Delta Hf solid (kj/mol)', 'Explosive energy (kj/cc)',
       'Shock velocity (km/s)', 'Particle velocity (km/s)',
       'Speed of sound (km/s)', 'Pressure (Gpa)', 'T(K)', 'TNT Equiv (per cc)',
       'group', 'SMILES', 'NOTE', 'Mols'],
      dtype='object')

# Generate featurizations

In [3]:
from mmltoolkit.descriptors import *
from mmltoolkit.featurizations import *


# Literal bag-of-bonds features computed from the molecule objects.
bond_types, X_LBoB = literal_bag_of_bonds(data['Mols'].tolist())

# Atom count of every molecule (AddHs is applied again here, exactly as before,
# even though the 'Mols' column already went through AddHs when it was built).
num_atoms = [Chem.AddHs(mol).GetNumAtoms() for mol in data['Mols']]
max_atoms = int(max(num_atoms))

# Pre-allocate the Coulomb-matrix feature arrays, one row per molecule.
# The vectorised matrix has max_atoms*(max_atoms+1)/2 entries, i.e. the
# upper triangle including the diagonal.
X_Cmat_as_vec = np.zeros((num_mols, max_atoms * (max_atoms + 1) // 2))
X_Cmat_eigs = np.zeros((num_mols, max_atoms))
X_Cmat_unsorted_eigs = np.zeros((num_mols, max_atoms))

# Per-molecule features that are computed from the .xyz geometry files.
X_summedBoB = []
filename_list = []

for i, refcode in enumerate(data['Molecular Name']):
    # One .xyz file per molecule, named after the 'Molecular Name' column.
    filename = '../HM_all_xyz_files/'+refcode+'.xyz'

    # Default call: eigenvalues as returned with the helper's default `sort` setting.
    this_Cmat_eigs, this_Cmat_as_vec = coulombmat_and_eigenvalues_as_vec(filename, max_atoms)
    # Second call with sort=False. Note that `this_Cmat_as_vec` is overwritten
    # here, so the stored matrix-as-vector is the one from the sort=False call
    # (unchanged from the previous behaviour of this cell).
    this_Cmat_unsorted_eigs, this_Cmat_as_vec = coulombmat_and_eigenvalues_as_vec(filename, max_atoms, sort=False)

    summed_BoB_feature_names, summedBoB = summed_bag_of_bonds(filename)
    X_summedBoB.append(summedBoB)

    filename_list.append(filename)

    X_Cmat_eigs[i,:] = this_Cmat_eigs
    # Fix: store the sort=False eigenvalues here. Previously the sorted
    # eigenvalues were written into this array as well, which made
    # X_Cmat_unsorted_eigs an exact copy of X_Cmat_eigs.
    X_Cmat_unsorted_eigs[i,:] = this_Cmat_unsorted_eigs
    X_Cmat_as_vec[i,:] = this_Cmat_as_vec

X_summedBoB = np.array(X_summedBoB)

# Bag-of-bonds features are built from the full list of files in one call.
BoB_feature_list, X_BoB = bag_of_bonds(filename_list, verbose=False)


# Descriptor columns derived from the molecule objects, one helper per column.
data['Oxygen Balance_100'] = data['Mols'].apply(oxygen_balance_100)
data['Oxygen Balance_1600'] = data['Mols'].apply(oxygen_balance_1600)
data['modified OB'] = data['Mols'].apply(modified_oxy_balance)
data['OB atom counts'] = data['Mols'].apply(return_atom_nums_modified_OB)
data['combined_nums'] = data['Mols'].apply(return_combined_nums)

# Scalar descriptors become single-column feature matrices.
X_OB100 = np.array(data['Oxygen Balance_100'].tolist()).reshape((-1, 1))
X_OB1600 = np.array(data['Oxygen Balance_1600'].tolist()).reshape((-1, 1))
X_OBmod = np.array(data['modified OB'].tolist()).reshape((-1, 1))

# These two columns are converted without reshaping.
X_OB_atom_counts = np.array(data['OB atom counts'].tolist())
X_combined = np.array(data['combined_nums'].tolist())

# Combined featurizations: feature blocks joined column-wise.
X_Estate_combined = np.concatenate([X_Estate, X_combined], axis=1)
X_Estate_combined_Cmat_eigs = np.concatenate([X_Estate_combined, X_Cmat_eigs], axis=1)
X_Estate_combined_lit_BoB = Estate_CDS_LBoB_featurizer(data['Mols'].tolist())
X_CustDesrip_lit_BoB = np.concatenate([X_combined, X_LBoB], axis=1)

                             


In [4]:
# Display label -> feature matrix. The labels (some of them LaTeX) are passed
# on to test_everything together with the matrices.
featurization_dict = {
    "Estate": X_Estate,
    "Oxygen balance$_{100}$": X_OB100,
    "Oxygen balance$_{1600}$": X_OB1600,
    "Oxygen balance atom counts": X_OB_atom_counts,
    "CDS": X_combined,
    "SoB": X_LBoB,
    'Estate+CDS': X_Estate_combined,
    "Coulomb matrices as vec": X_Cmat_as_vec,
    "CM eigs": X_Cmat_eigs,
    "Bag of Bonds": X_BoB,
    "Summed Bag of Bonds (sBoB)": X_summedBoB,
    "\\footnotesize{Estate+CDS+SoB}": X_Estate_combined_lit_BoB,
    "C.D.S + LBoB": X_CustDesrip_lit_BoB,
    "LBoB + OB100": np.concatenate([X_LBoB, X_OB100], axis=1),
}

# Targets to run; uncomment an entry to include that property.
targets = [
    #'Density (g/cm3)',
    #'Delta Hf solid (kj/mol)',
    'Explosive energy (kj/cc)',
    #'Shock velocity (km/s)',
    #'Particle velocity (km/s)',
    #'Speed of sound (km/s)',
    #'Pressure (Gpa)',
    #'T(K)',
    #'TNT Equiv (per cc)'
]


In [None]:
# Target vector: the explosive-energy column as a NumPy array.
y = np.array(data['Explosive energy (kj/cc)'].tolist())


# Run test_everything within notebook 

In [None]:
from mmltoolkit.test_everything import * 
from sklearn.model_selection import ShuffleSplit 

# Run the full benchmark over every entry of featurization_dict for each target.
# With verbose=True it logs progress per target, featurization and outer fold
# (see the output below). Later cells index the returned objects as
# results[target][featurization][model] (a dict of scores) and compare
# best[target] against [featurization, model].
(results, best) = test_everything(data, featurization_dict, targets, verbose=True, normalize=True )



running target Explosive energy (kj/cc)
    testing featurization C.D.S + LBoB
doing outer fold 1 of 20
best params:  {'gamma': 0.00031992671377973844, 'kernel_params': None, 'degree': 3, 'alpha': 0.0053535666774107192, 'coef0': 1, 'kernel': 'rbf'}
doing outer fold 2 of 20
best params:  {'gamma': 0.0014508287784959432, 'kernel_params': None, 'degree': 3, 'alpha': 0.021214517849106277, 'coef0': 1, 'kernel': 'rbf'}
doing outer fold 3 of 20


# Save to pickle file

In [None]:
#import pickle
#pickle.dump( results, open( "test_all_results3.pkl", "wb" ) )
#pickle.dump( best, open( "test_all_best3.pkl", "wb" ) )

# Load test_everything results from .pkl 

In [38]:
import pickle

# Load previously saved test_everything output from disk.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load .pkl files that were produced by a trusted run of this notebook.
# The files are opened in `with` blocks so the handles are closed right after
# loading instead of being left open for the lifetime of the kernel.
with open("all_nested_results_KRR_SoB.pkl", "rb") as results_file:
    results = pickle.load(results_file)
with open("all_nested_best_KRR_SoB.pkl", "rb") as best_file:
    best = pickle.load(best_file)


In [39]:
from pprint import PrettyPrinter

# Pretty-print the nested results dictionary
# (target -> featurization -> model -> dict of scores such as 'MAE', 'R2').
pp = PrettyPrinter()

pp.pprint(results)

{'Delta Hf solid (kj/mol)': {'SoB': {'KRR': {'MAE': 68.72948215026273,
                                             'MAE_std': 16.250305057909,
                                             'MAE_std_train': 10.962421710864264,
                                             'MAE_train': 24.165554185197937,
                                             'MAPE': 64.23294590510596,
                                             'R2': 0.9301394795866109,
                                             'R2_train': 0.9932014794627397,
                                             'RMSE': 100.33310335228752,
                                             'rP': 0.9388849369750949,
                                             'rP_train': 0.9932330058506867}}},
 'Density (g/cm3)': {'SoB': {'KRR': {'MAE': 0.0653346975387568,
                                     'MAE_std': 0.011681024736317469,
                                     'MAE_std_train': 0.00910102184354365,
                                     'MAE_t

# Print table from results dictionary

In [40]:
%load_ext autoreload
%autoreload 2
from mmltoolkit.test_everything import print_everything
# Full list of target columns, in the order the table columns should appear.
targets = [
    'Density (g/cm3)',
    'Delta Hf solid (kj/mol)',
    'Explosive energy (kj/cc)',
    'Shock velocity (km/s)',
    'Particle velocity (km/s)',
    'Speed of sound (km/s)',
    'Pressure (Gpa)',
    'T(K)',
    'TNT Equiv (per cc)',
]

# Emit the LaTeX table for the loaded results (see the output below).
print_everything(results, best, targets)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
\begin{table*}[ht]
\begin{tabular}{ccccccccccc}
 &  & \footnotesize{$\rho ,\frac{\hbox{g}}{\hbox{cc}}$ } & \footnotesize{$\Delta H_f^{\ff{s}} ,\frac{\hbox{kJ}}{\hbox{mol}}$ } & \footnotesize{$E_{\ff{e}} ,\frac{\hbox{kJ}}{\hbox{cc}}$ } & \footnotesize{$V_{\ff{s}} ,\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$V_{\ff{p}},\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$V_{\ff{snd}},\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$P$, GPa} & \footnotesize{$T$, K} & \footnotesize{$\frac{\hbox{TNT}_{\ff{equiv}}}{\hbox{cc}}$ } \\
\hline
KRR & SoB & \bf{ 0.07} & \bf{68.73} & \bf{ 0.40} & \bf{ 0.31} & \bf{ 0.09} & \bf{ 0.25} & \bf{ 2.90} & \bf{331.36} & \bf{ 0.11}\\
\end{tabular}
\end{table*}


# Print table with standard deviation values 

In [34]:
 

# LaTeX column header for every target column.
# Every backslash is written as '\\' so that no entry depends on an invalid
# escape sequence ('\D' in '\Delta' used to trigger a DeprecationWarning /
# SyntaxWarning); the resulting strings are unchanged.
target_short_names = {
 'Density (g/cm3)':'\\footnotesize{$\\rho ,\\frac{\\hbox{g}}{\\hbox{cc}}$ }',
 'Delta Hf solid (kj/mol)': '\\footnotesize{$\\Delta H_f^{\\ff{s}} ,\\frac{\\hbox{kJ}}{\\hbox{mol}}$ }',
 'Explosive energy (kj/cc)': '\\footnotesize{$E_{\\ff{e}} ,\\frac{\\hbox{kJ}}{\\hbox{cc}}$ }',
 'Shock velocity (km/s)': '\\footnotesize{$V_{\\ff{s}} ,\\frac{\\hbox{km}}{\\hbox{s}}$ }',
 'Particle velocity (km/s)': '\\footnotesize{$V_{\\ff{p}},\\frac{\\hbox{km}}{\\hbox{s}}$ }',
 'Speed of sound (km/s)': '\\footnotesize{$V_{\\ff{snd}},\\frac{\\hbox{km}}{\\hbox{s}}$ }',
 'Pressure (Gpa)': '\\footnotesize{$P$, GPa}',
 'T(K)': '\\footnotesize{$T$, K}',
 'TNT Equiv (per cc)': '\\footnotesize{$\\frac{\\hbox{TNT}_{\\ff{equiv}}}{\\hbox{cc}}$ }'
}

# Table preamble: two label columns (model, featurization) plus one centred
# column per target.
print("\\begin{table*}")
print("\\begin{tabular}{cc" + "c"*len(targets) + "}")

# Header row: two empty label cells followed by one header per target.
print(" & " + "".join(" & "+target_short_names[target] for target in targets) + " \\\\")
print("\\hline")

# The featurization and model names are taken from the first target's entry;
# every other target is indexed with the same names below.
featurizations = list(results[targets[0]].keys())
models = list(results[targets[0]][featurizations[0]].keys())

for model in models:
    for (i, featurization) in enumerate(featurizations):
        # The model name is printed only on the first row of its group.
        row_label = (model if i == 0 else "") + " & " + featurization + " & "

        cells = []
        for target in targets:
            scores_dict = results[target][featurization][model]
            # Each cell shows the MAE with its standard deviation as a superscript.
            cell = "%5.2f$^{%4.2f}$" % (scores_dict['MAE'], scores_dict['MAE_std'])
            # The best (featurization, model) pair for a target is set in bold.
            if ([featurization, model] == best[target]):
                cell = "\\bf{%5.2f}$^{%4.2f}$" % (scores_dict['MAE'], scores_dict['MAE_std'])
            cells.append(cell)

        print(row_label + " & ".join(cells) + "\\\\")

print("\\end{tabular}")
print("\\end{table*}")

\begin{table*}
\begin{tabular}{ccccccccccc}
 &  & \footnotesize{$\rho ,\frac{\hbox{g}}{\hbox{cc}}$ } & \footnotesize{$\Delta H_f^{\ff{s}} ,\frac{\hbox{kJ}}{\hbox{mol}}$ } & \footnotesize{$E_{\ff{e}} ,\frac{\hbox{kJ}}{\hbox{cc}}$ } & \footnotesize{$V_{\ff{s}} ,\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$V_{\ff{p}},\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$V_{\ff{snd}},\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$P$, GPa} & \footnotesize{$T$, K} & \footnotesize{$\frac{\hbox{TNT}_{\ff{equiv}}}{\hbox{cc}}$ } \\
\hline
KRR & Estate &  0.10$^{0.02}$ & 261.02$^{65.62}$ &  0.63$^{0.08}$ &  0.48$^{0.11}$ &  0.13$^{0.03}$ &  0.41$^{0.08}$ &  4.95$^{0.76}$ & 500.19$^{88.25}$ &  0.18$^{0.03}$\\
 & CDS &  0.08$^{0.02}$ & 198.81$^{27.18}$ &  0.50$^{0.10}$ &  0.44$^{0.10}$ &  0.11$^{0.02}$ &  0.37$^{0.08}$ &  3.07$^{0.65}$ & 462.63$^{106.87}$ &  0.17$^{0.04}$\\
 & CM eigs &  0.09$^{0.03}$ & 288.41$^{55.58}$ &  0.67$^{0.17}$ &  0.67$^{0.14}$ &  0.18$^{0.04}$ &  0.61$^{0.19}$ &  5.67$^{0.90}$

# Print table with confidence intervals from t-values

In [32]:
from scipy.stats import t
from numpy import average, std
from math import sqrt

 

# LaTeX column header for every target column.
# Every backslash is written as '\\' so that no entry depends on an invalid
# escape sequence ('\D' in '\Delta' used to trigger a DeprecationWarning /
# SyntaxWarning); the resulting strings are unchanged.
target_short_names = {
 'Density (g/cm3)':'\\footnotesize{$\\rho ,\\frac{\\hbox{g}}{\\hbox{cc}}$ }',
 'Delta Hf solid (kj/mol)': '\\footnotesize{$\\Delta H_f^{\\ff{s}} ,\\frac{\\hbox{kJ}}{\\hbox{mol}}$ }',
 'Explosive energy (kj/cc)': '\\footnotesize{$E_{\\ff{e}} ,\\frac{\\hbox{kJ}}{\\hbox{cc}}$ }',
 'Shock velocity (km/s)': '\\footnotesize{$V_{\\ff{s}} ,\\frac{\\hbox{km}}{\\hbox{s}}$ }',
 'Particle velocity (km/s)': '\\footnotesize{$V_{\\ff{p}},\\frac{\\hbox{km}}{\\hbox{s}}$ }',
 'Speed of sound (km/s)': '\\footnotesize{$V_{\\ff{snd}},\\frac{\\hbox{km}}{\\hbox{s}}$ }',
 'Pressure (Gpa)': '\\footnotesize{$P$, GPa}',
 'T(K)': '\\footnotesize{$T$, K}',
 'TNT Equiv (per cc)': '\\footnotesize{$\\frac{\\hbox{TNT}_{\\ff{equiv}}}{\\hbox{cc}}$ }'
}

# Degrees of freedom for the t-distribution.
# NOTE(review): 19 presumably corresponds to 20 outer folds minus one (the
# test_everything log in this notebook reports 20 outer folds) -- confirm that
# the loaded results were produced with the same number of folds.
num_DOF = 19

# Two-sided 95% interval of the t-distribution; the upper bound is the
# critical t-value used for the confidence intervals below.
t_bounds = t.interval(0.95, num_DOF)
t_value = t_bounds[1]

# Table preamble: two label columns (model, featurization) plus one centred
# column per target.
print("\\begin{table*}")
print("\\begin{tabular}{cc" + "c"*len(targets) + "}")

# Header row: two empty label cells followed by one header per target.
print(" & " + "".join(" & "+target_short_names[target] for target in targets) + " \\\\")
print("\\hline")

# The featurization and model names are taken from the first target's entry;
# every other target is indexed with the same names below.
featurizations = list(results[targets[0]].keys())
models = list(results[targets[0]][featurizations[0]].keys())

for model in models:
    for (i, featurization) in enumerate(featurizations):
        # The model name is printed only on the first row of its group.
        row_label = (model if i == 0 else "") + " & " + featurization + " & "

        cells = []
        for target in targets:
            scores_dict = results[target][featurization][model]
            mean = scores_dict['MAE']
            stddev = scores_dict['MAE_std']
            # Confidence bounds: mean +/- t * stddev / sqrt(num_DOF).
            # NOTE(review): dividing by sqrt(num_DOF) matches the standard error
            # only if 'MAE_std' is a population (ddof=0) standard deviation --
            # confirm against how test_everything computes it.
            upper = mean + t_value*stddev/sqrt(num_DOF)
            lower = mean - t_value*stddev/sqrt(num_DOF)

            # Each cell shows the MAE with the upper/lower bound as super-/subscript.
            cell = "%5.2f$^{%4.2f}_{%4.2f}$" % (mean, upper, lower)
            # The best (featurization, model) pair for a target is set in bold.
            if ([featurization, model] == best[target]):
                cell = "\\bf{%5.2f}$^{%4.2f}_{%4.2f}$" % (mean, upper, lower)
            cells.append(cell)

        print(row_label + " & ".join(cells) + "\\\\")

print("\\end{tabular}")
print("\\end{table*}")

\begin{table*}
\begin{tabular}{ccccccccccc}
 &  & \footnotesize{$\rho ,\frac{\hbox{g}}{\hbox{cc}}$ } & \footnotesize{$\Delta H_f^{\ff{s}} ,\frac{\hbox{kJ}}{\hbox{mol}}$ } & \footnotesize{$E_{\ff{e}} ,\frac{\hbox{kJ}}{\hbox{cc}}$ } & \footnotesize{$V_{\ff{s}} ,\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$V_{\ff{p}},\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$V_{\ff{snd}},\frac{\hbox{km}}{\hbox{s}}$ } & \footnotesize{$P$, GPa} & \footnotesize{$T$, K} & \footnotesize{$\frac{\hbox{TNT}_{\ff{equiv}}}{\hbox{cc}}$ } \\
\hline
SVR & Estate &  0.09$^{0.10}_{0.09}$ & 207.78$^{234.27}_{181.30}$ &  0.60$^{0.64}_{0.57}$ &  0.45$^{0.50}_{0.41}$ &  0.13$^{0.14}_{0.11}$ &  0.35$^{0.37}_{0.33}$ &  4.41$^{4.81}_{4.00}$ & 476.06$^{518.97}_{433.15}$ &  0.17$^{0.18}_{0.16}$\\
 & CDS &  0.07$^{0.08}_{0.07}$ & 223.24$^{257.71}_{188.77}$ &  0.52$^{0.56}_{0.49}$ &  0.34$^{0.37}_{0.31}$ &  0.12$^{0.13}_{0.11}$ &  0.32$^{0.35}_{0.29}$ &  3.21$^{3.42}_{3.01}$ & 436.81$^{475.64}_{397.99}$ &  0.18$^{0.19}_{0.1