Skip to content

Commit

Permalink
Refactor cat_dset_3D.py to provide an interface
Browse files Browse the repository at this point in the history
The cat_dset_3D.py script required manual changes to use it. This made
it impossible to use in an automated way and tricky to use manually. To
address this it needed both a CLI interface and a python interface for
calling from within another python script. Both have been added by
moving all the concatenation code into the `concat_3d` function which
handles the python interface and the new `main` function handles the CLI
interface then calls `concat_3d`.

Also adds __pycache__ to gitignore.
  • Loading branch information
bcaddy committed Jun 20, 2023
1 parent 4e56791 commit 72d94ce
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 82 deletions.
6 changes: 1 addition & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ bin/*
*.a
a.out
*.dSYM
__pycache__

# Makefiles #
#############
Expand All @@ -39,11 +40,8 @@ data
out.*
o.*
run


disk.*


# Logs and databases #
######################
*.err
Expand All @@ -67,7 +65,6 @@ disk.*
# OS generated files #
######################
.DS_Store

.remote-sync.json
.remote-sync_macos.json
._*
Expand All @@ -85,4 +82,3 @@ Thumbs.db
#############################
docs/doxygen/build
docs/sphinx/build

209 changes: 132 additions & 77 deletions python_scripts/cat_dset_3D.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,141 @@
#!/usr/bin/env python3
# Example file for concatenating 3D hdf5 datasets
"""
Python script for concatenating 3D hdf5 datasets. Includes a CLI for concatenating Cholla HDF5 datasets and can be
imported into other scripts where the `concat_3d` function can be used to concatenate the datasets.
Generally the easiest way to import this script is to add the `python_scripts` directory to your python path in your
script like this:
```
import sys
sys.path.append('/PATH/TO/CHOLLA/python_scripts')
import cat_dset_3D
```
"""

import h5py
import numpy as np
import argparse
import pathlib

def main():
"""This function handles the CLI argument parsing and is only intended to be used when this script is invoked from the
command line. If you're importing this file then use the `concat_3d` function directly.
"""
# Argument handling
cli = argparse.ArgumentParser()
# Required Arguments
cli.add_argument('-s', '--start_num', type=int, required=True, help='The first output step to concatenate')
cli.add_argument('-e', '--end_num', type=int, required=True, help='The last output step to concatenate')
cli.add_argument('-n', '--num_processes', type=int, required=True, help='The number of processes that were used')
# Optional Arguments
cli.add_argument('-i', '--input_dir', type=pathlib.Path, default=pathlib.Path.cwd(), help='The input directory.')
cli.add_argument('-o', '--output_dir', type=pathlib.Path, default=pathlib.Path.cwd(), help='The output directory.')
args = cli.parse_args()

# Perform the concatenation
concat_3d(start_num=args.start_num,
end_num=args.end_num,
num_processes=args.num_processes,
input_dir=args.input_dir,
output_dir=args.output_dir)


# ======================================================================================================================
def concat_3d(start_num: int,
end_num: int,
num_processes: int,
input_dir: pathlib.Path = pathlib.Path.cwd(),
output_dir: pathlib.Path = pathlib.Path.cwd()):
"""Concatenate 3D HDF5 Cholla datasets. i.e. take the single files generated per process and concatenate them into a
single, large file. All outputs from start_num to end_num will be concatenated.
Args:
start_num (int): The first output step to concatenate
end_num (int): The last output step to concatenate
num_processes (int): The number of processes that were used
input_dir (pathlib.Path, optional): The input directory. Defaults to pathlib.Path.cwd().
output_dir (pathlib.Path, optional): The output directory. Defaults to pathlib.Path.cwd().
"""

# Error checking
assert start_num >= 0, 'start_num must be greater than or equal to 0'
assert end_num >= 0, 'end_num must be greater than or equal to 0'
assert start_num <= end_num, 'end_num should be greater than or equal to start_num'
assert num_processes > 1, 'num_processes must be greater than 1'

# loop over outputs
for n in range(start_num, end_num+1):

ns = 0
ne = 0
n_proc = 16 # number of processors that did the calculations
istart = 0*n_proc
iend = 1*n_proc
dnamein = './hdf5/raw/'
dnameout = './hdf5/'

# loop over outputs
for n in range(ns, ne+1):

# loop over files for a given output
for i in range(istart, iend):

# open the output file for writing (don't overwrite if exists)
fileout = h5py.File(dnameout+str(n)+'.h5', 'a')
# open the input file for reading
filein = h5py.File(dnamein+str(n)+'.h5.'+str(i), 'r')
# read in the header data from the input file
head = filein.attrs

# if it's the first input file, write the header attributes
# and create the datasets in the output file
if (i == 0):
nx = head['dims'][0]
ny = head['dims'][1]
nz = head['dims'][2]
fileout.attrs['dims'] = [nx, ny, nz]
fileout.attrs['gamma'] = [head['gamma'][0]]
fileout.attrs['t'] = [head['t'][0]]
fileout.attrs['dt'] = [head['dt'][0]]
fileout.attrs['n_step'] = [head['n_step'][0]]

units = ['time_unit', 'mass_unit', 'length_unit', 'energy_unit', 'velocity_unit', 'density_unit']
for unit in units:
fileout.attrs[unit] = [head[unit][0]]

d = fileout.create_dataset("density", (nx, ny, nz), chunks=True, dtype=filein['density'].dtype)
mx = fileout.create_dataset("momentum_x", (nx, ny, nz), chunks=True, dtype=filein['momentum_x'].dtype)
my = fileout.create_dataset("momentum_y", (nx, ny, nz), chunks=True, dtype=filein['momentum_y'].dtype)
mz = fileout.create_dataset("momentum_z", (nx, ny, nz), chunks=True, dtype=filein['momentum_z'].dtype)
E = fileout.create_dataset("Energy", (nx, ny, nz), chunks=True, dtype=filein['Energy'].dtype)
# loop over files for a given output
for i in range(0, num_processes):

# open the output file for writing (don't overwrite if exists)
fileout = h5py.File(output_dir / f'{n}.h5', 'a')
# open the input file for reading
filein = h5py.File(input_dir / f'{n}.h5.{i}', 'r')
# read in the header data from the input file
head = filein.attrs

# if it's the first input file, write the header attributes
# and create the datasets in the output file
if (i == 0):
nx = head['dims'][0]
ny = head['dims'][1]
nz = head['dims'][2]
fileout.attrs['dims'] = [nx, ny, nz]
fileout.attrs['gamma'] = [head['gamma'][0]]
fileout.attrs['t'] = [head['t'][0]]
fileout.attrs['dt'] = [head['dt'][0]]
fileout.attrs['n_step'] = [head['n_step'][0]]

units = ['time_unit', 'mass_unit', 'length_unit', 'energy_unit', 'velocity_unit', 'density_unit']
for unit in units:
fileout.attrs[unit] = [head[unit][0]]

d = fileout.create_dataset("density", (nx, ny, nz), chunks=True, dtype=filein['density'].dtype)
mx = fileout.create_dataset("momentum_x", (nx, ny, nz), chunks=True, dtype=filein['momentum_x'].dtype)
my = fileout.create_dataset("momentum_y", (nx, ny, nz), chunks=True, dtype=filein['momentum_y'].dtype)
mz = fileout.create_dataset("momentum_z", (nx, ny, nz), chunks=True, dtype=filein['momentum_z'].dtype)
E = fileout.create_dataset("Energy", (nx, ny, nz), chunks=True, dtype=filein['Energy'].dtype)
try:
GE = fileout.create_dataset("GasEnergy", (nx, ny, nz), chunks=True, dtype=filein['GasEnergy'].dtype)
except KeyError:
print('No Dual energy data present');
try:
bx = fileout.create_dataset("magnetic_x", (nx+1, ny, nz), chunks=True, dtype=filein['magnetic_x'].dtype)
by = fileout.create_dataset("magnetic_y", (nx, ny+1, nz), chunks=True, dtype=filein['magnetic_y'].dtype)
bz = fileout.create_dataset("magnetic_z", (nx, ny, nz+1), chunks=True, dtype=filein['magnetic_z'].dtype)
except KeyError:
print('No magnetic field data present');

# write data from individual processor file to
# correct location in concatenated file
nxl = head['dims_local'][0]
nyl = head['dims_local'][1]
nzl = head['dims_local'][2]
xs = head['offset'][0]
ys = head['offset'][1]
zs = head['offset'][2]
fileout['density'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['density']
fileout['momentum_x'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_x']
fileout['momentum_y'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_y']
fileout['momentum_z'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_z']
fileout['Energy'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['Energy']
try:
GE = fileout.create_dataset("GasEnergy", (nx, ny, nz), chunks=True, dtype=filein['GasEnergy'].dtype)
fileout['GasEnergy'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['GasEnergy']
except KeyError:
print('No Dual energy data present');
print('No Dual energy data present');
try:
bx = fileout.create_dataset("magnetic_x", (nx+1, ny, nz), chunks=True, dtype=filein['magnetic_x'].dtype)
by = fileout.create_dataset("magnetic_y", (nx, ny+1, nz), chunks=True, dtype=filein['magnetic_y'].dtype)
bz = fileout.create_dataset("magnetic_z", (nx, ny, nz+1), chunks=True, dtype=filein['magnetic_z'].dtype)
fileout['magnetic_x'][xs:xs+nxl+1, ys:ys+nyl, zs:zs+nzl] = filein['magnetic_x']
fileout['magnetic_y'][xs:xs+nxl, ys:ys+nyl+1, zs:zs+nzl] = filein['magnetic_y']
fileout['magnetic_z'][xs:xs+nxl, ys:ys+nyl, zs:zs+nzl+1] = filein['magnetic_z']
except KeyError:
print('No magnetic field data present');

# write data from individual processor file to
# correct location in concatenated file
nxl = head['dims_local'][0]
nyl = head['dims_local'][1]
nzl = head['dims_local'][2]
xs = head['offset'][0]
ys = head['offset'][1]
zs = head['offset'][2]
fileout['density'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['density']
fileout['momentum_x'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_x']
fileout['momentum_y'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_y']
fileout['momentum_z'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_z']
fileout['Energy'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['Energy']
try:
fileout['GasEnergy'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['GasEnergy']
except KeyError:
print('No Dual energy data present');
try:
fileout['magnetic_x'][xs:xs+nxl+1, ys:ys+nyl, zs:zs+nzl] = filein['magnetic_x']
fileout['magnetic_y'][xs:xs+nxl, ys:ys+nyl+1, zs:zs+nzl] = filein['magnetic_y']
fileout['magnetic_z'][xs:xs+nxl, ys:ys+nyl, zs:zs+nzl+1] = filein['magnetic_z']
except KeyError:
print('No magnetic field data present');

filein.close()

fileout.close()
print('No magnetic field data present');

filein.close()

fileout.close()
# ======================================================================================================================

if __name__ == '__main__':
main()

0 comments on commit 72d94ce

Please sign in to comment.