# In [None]:
# ------  CASTEP INPUT FILE GENERATOR ------
from pathlib import Path
import re
from ase.io import read
from ase.calculators.castep import Castep
from ase.io.castep import write_castep_cell, write_param
import numpy as np

# Generate CASTEP file name using unique_key + dump file timestep 
# e.g. C_GAP17_NVT_64_1.5_1_01000
# Note: assumes file path structure ".../C_GAP17_NVT_64_1.5_1/NVT/dump_custom.C.01000.dat"

# Calculator object whose parameter set is configured below.
calc = Castep()

# CASTEP parameters, applied to calc.param in the order listed.
_castep_param_settings = {
    'task': 'energy',
    'cut_off_energy': 520,
    'xc_functional': 'PBE',
    'SPIN_POLARIZED': 'TRUE',
    'MAX_SCF_CYCLES': 100,
    'ELEC_ENERGY_TOL': '5e-5',
    'FIX_OCCUPANCY': 'TRUE',
    'POPN_CALCULATE': 'FALSE',
    'OPT_STRATEGY': 'SPEED',
    # 'ELECTRONIC_MINIMIZER': 'RMM/DIIS',
}
for _keyword, _value in _castep_param_settings.items():
    setattr(calc.param, _keyword, _value)

# k-point spacing value appended to every .cell file as
# "KPOINT_MP_SPACING <value> 1/ang".
kpoint_mp_spacing = 0.2

# Matches run-directory names such as "C_GAP17_NVT_64_1.5_1".
_UNIQUE_KEY_PATTERN = re.compile(
    r'^(?P<element_symbol>[A-Za-z]{1,6})_'          # e.g. C
    r'(?P<potential_name>[^_]+)_'                   # e.g. GAP17
    r'(?P<simulation_type>[^_]+)_'                  # e.g. NVT
    r'(?P<num_atoms>\d+)_'                          # e.g. 64
    r'(?P<density>[\d.eE+-]+)_'                     # e.g. 1.5 or 1.85e+00
    r'(?P<run>\d+)'                                 # e.g. 1 (run number) 
)


def Castep_input_file_generator(lammps_input_file_path, calc):
    """Write CASTEP ``.cell`` and ``.param`` input files for one LAMMPS dump file.

    The output base name is ``<unique_key>_<timestep>``, where ``unique_key``
    is the name of the directory two levels above the dump file and
    ``timestep`` is the first run of digits in the dump file name
    (e.g. ``C_GAP17_NVT_64_1.5_1_01000``). Both files are written to
    ``CASTEP/Input_files`` relative to the current working directory, which
    is created if it does not exist. A ``KPOINT_MP_SPACING`` line using the
    module-level ``kpoint_mp_spacing`` value is appended to the cell file.

    Args:
        lammps_input_file_path: Path to the dump file, expected to look like
            ``.../C_GAP17_NVT_64_1.5_1/NVT/dump_custom.C.01000.dat``.
        calc: Object whose ``param`` attribute is handed to ``write_param``.

    Raises:
        ValueError: If the path has fewer than three components, the
            unique-key directory name does not match the expected format,
            or the dump file name contains no digits.
    """
    path_components = Path(lammps_input_file_path).parts
    if len(path_components) < 3:
        raise ValueError(
            "Expected a path like '<unique_key>/<type>/<dump file>', "
            f"got: {lammps_input_file_path}"
        )

    # The unique key is the directory two levels above the dump file.
    unique_key = path_components[-3]

    m = _UNIQUE_KEY_PATTERN.match(unique_key)
    if not m:
        raise ValueError(f"Invalid unique_key name format: {unique_key}")

    element_symbol = m.group('element_symbol')

    # The timestep is the first run of digits in the dump file name.
    dump_file_name = path_components[-1]
    timestep_match = re.search(r'(\d+)', dump_file_name)
    if timestep_match is None:
        raise ValueError(f"No timestep found in dump file name: {dump_file_name}")
    timestep = timestep_match.group(1)
    castep_unique_key = f"{unique_key}_{timestep}"

    # Read lammps dump file
    lammps_dump_file = read(lammps_input_file_path, format='lammps-dump-text', specorder=[element_symbol])

    CASTEP_input_dir = Path("CASTEP/Input_files")
    CASTEP_input_dir.mkdir(parents=True, exist_ok=True)

    cell_file_name = f"{castep_unique_key}.cell"
    param_file_name = f"{castep_unique_key}.param"

    cell_file_path = CASTEP_input_dir / cell_file_name
    param_file_path = CASTEP_input_dir / param_file_name

    write_castep_cell(cell_file_path, lammps_dump_file, positions_frac=False, force_write=True, precision=6, magnetic_moments=None)

    # Append the k-point spacing line after the cell content written above.
    with open(cell_file_path, 'a') as f:
        f.write(f"KPOINT_MP_SPACING {kpoint_mp_spacing} 1/ang\n")

    write_param(param_file_path, calc.param, force_write=True)

    print(f"CASTEP cell file created: {cell_file_name}")
    print(f"CASTEP param file created: {param_file_name}")


# Create CASTEP input files from the dump files for the specified densities, runs and timesteps
# (ASE version check: import ase; print(ase.__version__))

# Generate Fine Tuning Set
# 3 snapshots for each of 9000K, 5000K, quench and 300K
# 10 repeats of 10 densities (1.25 - 3.5)
densities = list(np.arange(1.25, 3.75, 0.25))
runs = list(range(1, 11))

timesteps = [1000,2000,3000, 4000,5000,6000, 6100,6300,6500, 7500,8500,9500]
num_atoms = 64
potential = "GAP17"

for density in densities:
    # Directory names carry the density with two decimals.
    density = f"{density:.2f}"
    density_dir = (f"LAMMPS_simulations/Element: Carbon/Potential: {potential}/Type: NVT/"
                   f"Atoms: {num_atoms}/Density: {density}/")
    for run in runs:
        run_dir = f"C_{potential}_NVT_{num_atoms}_{density}_{run}/"
        for timestep in timesteps:
            # Dump file names carry the timestep zero-padded to five digits.
            padded_timestep_str = f"{timestep:05d}"
            lammps_input_file_path = (
                density_dir
                + run_dir
                + f"NVT/dump_custom.C.{padded_timestep_str}.dat"
            )

            Castep_input_file_generator(lammps_input_file_path, calc)

# Generate Energy and Force Validation Set
# Amorphous Structures (300 K) and Liquid Structures (5000 K)
# 10 repeats of 21 densities (1.5 - 3.5)
# np.linspace includes the 3.5 endpoint; np.arange(1.5, 3.5, 0.1) stops at
# 3.4 and yields only 20 densities.
densities = list(np.linspace(1.5, 3.5, 21))
runs = list(range(1, 11))

timesteps = [6000, 9500]
num_atoms = 216
potential = "GAP17"


for density in densities:
    # Directory names carry the density with two decimals.
    density = f"{density:.2f}"
    for run in runs:
        for timestep in timesteps:
            # Dump file names carry the timestep zero-padded to five digits.
            padded_timestep_str = f"{timestep:05d}"

            lammps_input_file_path = (
                f"LAMMPS_simulations/Element: Carbon/Potential: {potential}/Type: NVT/"
                f"Atoms: {num_atoms}/Density: {density}/"
                f"C_{potential}_NVT_{num_atoms}_{density}_{run}/"
                f"NVT/dump_custom.C.{padded_timestep_str}.dat"
            )

            Castep_input_file_generator(lammps_input_file_path, calc)

