# Data Analysis for OH<sup>-</sup>(H<sub>2</sub>O)<sub>4</sub> DMC Data

### Install necessary libraries and define constants

In [3]:
import numpy as np
import pyvibdmc as pv
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from scipy.spatial.distance import pdist
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import joblib
import pandas as pd
from nn_dmc import *

import h5py

au_to_cm = 219474.63136320

au_to_ang = 0.529177249

h9o5_atoms = np.array([8,1,1,8,1,1,8,1,1,8,1,1,8,1])
h9o5_groups = [[0,1,2],[3,4,5],[6,7,8],[9,10,11],[12,13]]
h9o5_atom_names = ['O','H','H','O','H','H','O','H','H','O','H','H','O','H']

ml_path = 'C:/Users/gjaco/OneDrive/Documents/UW/McCoy Group/Machine Learning'
data_path = 'C:/Users/gjaco/OneDrive/Documents/UW/McCoy Group/H9O5-/Data'

In [4]:
energy_shift = -83718631.70213751

### Load in training data generated from MOB-ML DMC simulations

In [10]:
isomer = 'c4'
sim_type = 'wide'
sim_num = 1

train_energies_list = []
train_cds_list = []

if sim_type == 'min':
    sim = '500w_1000ts_1dt'
elif sim_type == 'wide':
    sim = '1000w_1000ts_10dt'
elif sim_type == 'reg':
    if sim_num in [1, 2]:
        sim = '200w_5000ts_10dt'
    else:
        sim = '200w_1000ts_10dt'
elif sim_type == 'test':
    sim = '1000w_1000ts_10dt_min_start'

else:
    print("Not a valid sim type")

print(f'starting {isomer} {sim_type} sim number {sim_num}')

path = f'{data_path}/mobml_pyscf/{isomer}_min/sim_{sim_num}/{sim}'

training_sim = pv.SimInfo(f'{path}/h9o5_0_sim_info.hdf5')

for b in range(1000):
    cds,energies = training_sim.get_training(f'{path}/h9o5_0_training_{b}ts.hdf5',ret_ang=False, ret_cm=True)
    
    train_cds_list.append(cds)
    train_energies_list.append(energies-energy_shift) #record energy relative to 4+1 minimum


starting c4 wide sim number 1


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'C:/Users/gjaco/OneDrive/Documents/UW/McCoy Group/H9O5-/Data/mobml_pyscf/c4_min/sim_1/1000w_1000ts_10dt/h9o5_0_training_995ts.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [11]:
train_energies = np.concatenate(train_energies_list)
train_cds = np.concatenate(train_cds_list)

In [12]:
#check energy range of simulation
print(np.min(train_energies), np.max(train_energies))

559.9969237595797 168873.8371130824


In [13]:
np.save(f'{ml_path}/h9o5/training_data/h9o5_{isomer}_mobml_{sim_type}_sim_{sim_num}_training_cds.npy',train_cds)
np.save(f'{ml_path}/h9o5/training_data/h9o5_{isomer}_mobml_{sim_type}_sim_{sim_num}_training_energies.npy',train_energies)