# Worflow to train the ML model

This workflow performs the following tasks:
- download the substrate structures from materials project
- generate the adsorbate structures (pymatgen)
- generate the adsorption structures
- write the CRYSTAL inputs
- run the adsorbate+substrate CRYSTAL calculations
- run the adsorbate and substrate CRYSTAL calculations (GHOST atoms)
- calculate the E_ads (BSSE corrected)

In [11]:
import sys
sys.path.insert(1, '../../crystal-code-tools/crystal_functions/crystal_functions')

# crystal_functions imports
from file_readwrite import write_cry_input, write_cry_gui
from file_readwrite import Crystal_input, Crystal_output
from adsorb import sub_ads_indices
from calculate import cry_ads_energy
from execute import runcry

# pymatgen imports
from pymatgen.core.structure import Molecule, Structure, Lattice
from pymatgen.core.surface import SlabGenerator, generate_all_slabs
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from pymatgen.analysis.adsorption import AdsorbateSiteFinder
from pymatgen.ext.matproj import MPRester

# other imports
import shutil
import pandas as pd
import numpy as np

### Make adsorbates
Make the adsorbate structures using the Molecule class in pymatgen

In [12]:
O = Molecule('O',[[0.0, 0.0, 0.0]])
CO = Molecule('CO',[[0.0, 0.0, 0.0],[ 0.0, 0.0, 1.128]])
H2O = Molecule('HHO',[[0.76,0.00,0.50],[0.76, 0.00,-0.50],[0.0, 0.0, 0.0]])
adsorbates = [O,CO,H2O]

### Make substrates

#### Bulk
Download the bulk structures from the Materials Project

In [13]:
with MPRester("p5vAQV3F1QuxFcxVT") as m:    
    cu = m.get_structure_by_material_id("mp-30")
    mgo = m.get_structure_by_material_id("mp-1265")
    rutile = m.get_structure_by_material_id("mp-2657")
materials = [cu,mgo,rutile]

Ensure the conventional cell is used

In [14]:
bulks = []
substrates = []
for material in materials:
    bulks.append(SpacegroupAnalyzer(material).get_conventional_standard_structure())

#### Slab
Generate the slabs using the pymatgen SlabGenerator function.

In [15]:
substrates =[]
for bulk in bulks:
    slabs = generate_all_slabs(bulk, max_index=1, min_slab_size=6., min_vacuum_size=10.0, 
                                   center_slab=False, symmetrize=True, in_unit_planes=False) 
    #substrates.append(SlabGenerator(bulk, (1,0,0), 2., 10., center_slab=True).get_slab())
    substrates.extend(slabs)

### Adsorb
Place the adsorbate on both surfaces of the slab (symmetric sites)

In [16]:
sub_composition = []
ads_composition = []
miller_indices = []    
n_layers = []
sub_ads_structures = []
for substrate in substrates:
    for adsorbate in adsorbates:
        adsorption_structures = AdsorbateSiteFinder(substrate).adsorb_both_surfaces(adsorbate,repeat=[1,1,1])
        sites = AdsorbateSiteFinder(substrate).find_adsorption_sites()    
        for i,adsorption_structure in enumerate(adsorption_structures):
            miller_indices.append(adsorption_structure.miller_index)
            n_layers.append(len(np.unique(substrate.cart_coords[:,2])))  
            sub_composition.append(substrate.composition.reduced_formula)
            ads_composition.append(str(adsorbate.composition.hill_formula).replace(" ", ""))
            sub_ads_structures.append(adsorption_structure)

### Prepare inputs (geometry optimisation)
Define CRYSTAL input parameters.

In [17]:
#geom_block = ['Adsorption tests\n','EXTERNAL\n','EXTPRT\n','OPTGEOM\n','END\n']
geom_block = ['Adsorption tests\n','EXTERNAL\n','EXTPRT\n']
#bs_block = ['BASISSET\n', 'POB-DZVP\n']
bs_block = ['BASISSET\n', 'STO-3G\n']
func_block = ['DFT\n', 'B3LYP\n', 'XXLGRID\n', 'ENDDFT\n']
scf_block = [['TOLINTEG\n', '5 5 5 5 10\n'],
             ['SHRINK\n', '6 12\n'],
             ['MAXCYCLE\n', '200\n'],
             ['FMIXING\n', '70\n'],
             'DIIS\n',
             'ENDSCF\n']

#### Write inputs
Write the inputs to file.

In [18]:
file_names = []
for i,structure in enumerate(sub_ads_structures):
    input_name = 'data/'+str(sub_composition[i]).replace(" ", "")+'_'+str(ads_composition[i]).replace(" ", "")+'_' \
               +''.join(str(x) for x in substrate.miller_index)+'_'+str(i+1)+'.d12'
    file_names.append(input_name[:-4])
    write_cry_input(input_name,crystal_blocks=[geom_block,bs_block,func_block,scf_block],external_obj=structure)

### Run the calculations
Use the crystal_functions runcry function to execute CRYSTAL (please ensure the path to your runcry17 is defined in execute.runcry()

In [19]:
for cry_input in file_names:
    #runcry(cry_input)
    pass

### Read the optimised geometry
Use the crystal_functions Crystal_output class to extract the final energy.

In [20]:
E_full_system = []
for cry_input in file_names:
    cry_output = Crystal_output(cry_input+'.out')
    if cry_output.converged == True:
        E_full_system.append(cry_output.final_energy()) #Do I need this?

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



EXITING: a .out file needs to be specified
Traceback (most recent call last):
  File "/Users/brunocamino/Desktop/Imperial/cmsg_icl/ml-surface-adsorption/../../crystal-code-tools/crystal_functions/crystal_functions/file_readwrite.py", line 128, in __init__
    file = open(output_name, 'r')
FileNotFoundError: [Errno 2] No such file or directory: 'data/Cu_O_001_1.out'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/brunocamino/miniconda3/envs/cc/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/f2/8kc7y9697m59bwltxjd42y300000gn/T/ipykernel_17960/41481736.py", line 3, in <module>
    cry_output = Crystal_output(cry_input+'.out')
  File "/Users/brunocamino/Desktop/Imperial/cmsg_icl/ml-surface-adsorption/../../crystal-code-tools/crystal_functions/crystal_functions/file_readwrite.py", line 133, in __init__
    sy

TypeError: object of type 'NoneType' has no len()

### Prepare inputs for the BSSE calculations
Generate the inputs for the BSSE calculation. The indices of the GHOST atoms are obtained from the pymatgen.core.structure.Slab object ('adsorbate' and 'substrate').

In [None]:
for i,cry_input in enumerate(file_names):
    if opt_structure[i] != None:
        indices = sub_ads_indices(sub_ads_structures[i])
        #Substrate        
        bsse_sub_inp = Crystal_input(cry_input+'.d12')
        bsse_sub_inp.add_ghost(indices['adsorbate'])
        bsse_sub_inp.opt_to_sp()
        bsse_sub_inp_name = cry_input+'_BSSE_sub.d12'
        write_cry_input(bsse_sub_inp_name,bsse_sub_inp)
        
        shutil.copy(cry_input+'.gui',cry_input+'_BSSE_sub.gui')
        
        #Adsorbate
        bsse_ads_inp = Crystal_input(cry_input+'.d12')
        bsse_ads_inp.add_ghost(indices['substrate'])
        bsse_sub_inp.opt_to_sp()
        bsse_ads_inp_name = cry_input+'_BSSE_ads.d12'
        write_cry_input(bsse_ads_inp_name,bsse_ads_inp)
        
        shutil.copy(cry_input+'.gui',cry_input+'_BSSE_ads.gui')

### Run the BSSE calculations
Use the crystal_functions runcry function to execute CRYSTAL (please ensure the path to your runcry17 is defined in execute.runcry()

In [None]:
E_sub_BSSE = []
E_ads_BSSE = []
for i,cry_input in enumerate(file_names):
    runcry(cry_input+'_BSSE_sub')
    cry_BSSE_sub_output = Crystal_output(cry_input+'_BSSE_sub.out')
    if cry_BSSE_sub_output.converged == True:
        E_sub_BSSE.append(cry_BSSE_sub_output.final_energy())
    
    
    runcry(cry_input+'_BSSE_ads')
    cry_BSSE_ads_output = Crystal_output(cry_input+'_BSSE_ads.out')
    if cry_BSSE_ads_output.converged == True:
        E_ads_BSSE.append(cry_BSSE_ads_output.final_energy())

### Calculate the adsorption energy
Use the crystal_functions cry_ads_energy function to get the adsorption energy (BSSE corrected).

In [None]:
E_adsorption = []
for i in range(len(file_names)):
    E_adsorption.append(cry_ads_energy(E_full_system[i],E_sub_BSSE[i],E_ads_BSSE[i]))

### Create Dataframe

In [None]:
df = pd.DataFrame(list(zip(file_names, sub_composition, n_layers, ads_composition,  miller_indices, E_adsorption,E_sub_BSSE,E_ads_BSSE,E_full_system)),
               columns =['File name', 'Substrate','N layers','Adsorbate','Miller Indices','E adsorption (BSSE)','E sub (ads ghost)','E ads (ads ghost)','E full system'])
df