Skip to content

Commit

Permalink
add to_hdf_kw named arg to dataset.to_pytables, some formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
douglasdavis committed May 22, 2019
1 parent 1c1c13e commit 3ad4f2a
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 100 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,3 +1,3 @@
[tool.black]
line-length = 92
line-length = 94
target-version = ['py36', 'py37']
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -6,3 +6,4 @@ scikit-learn>=0.20
tables>=3.4
numexpr>=2.6.5
h5py>=2.8.0
lz4>=2.1.5
103 changes: 33 additions & 70 deletions twaml/_apps.py
Expand Up @@ -23,80 +23,43 @@ def root2pytables():
"twaml.data.dataset.to_pytables"
)
)
parser.add_argument(
"-i", "--input-files", type=str, nargs="+", required=True, help="input ROOT files"
)
parser.add_argument(
"-n",
"--name",
type=str,
required=True,
help="dataset name (required when reading back into twaml.data.dataset)",
)
parser.add_argument(
"-o",
"--out-file",
type=str,
required=True,
help="Output h5 file (existing file will be overwritten)",
)
parser.add_argument(
"-b",
"--branches",
type=str,
nargs="+",
required=False,
help="branches to save (defaults to all)",
)
parser.add_argument(
"--tree-name", type=str, required=False, default="WtLoop_nominal", help="tree name"
)
parser.add_argument(
"--weight-name",
type=str,
required=False,
default="weight_nominal",
help="weight branch name",
)
parser.add_argument(
"--auxweights",
type=str,
nargs="+",
required=False,
help="extra auxiliary weights to save",
)
parser.add_argument(
"--selection",
type=str,
required=False,
help=(
"A selection string or YAML file containing a map of selections "
"(see `selection` argument docs in `twaml.data.from_root`)"
),
)
parser.add_argument(
"--detect-weights",
action="store_true",
help="detect weights in the dataset, --auxweights overrides this",
)
parser.add_argument(
"--nthreads",
type=int,
default=1,
required=False,
help="number of threads to use via ThreadPoolExecutor",
)
parser.add_argument(
"--aggro-strip",
action="store_true",
help="call the `aggressively_strip()` function on the dataset before saving",
)

# fmt: off
parser.add_argument("-i", "--input-files", type=str, nargs="+", required=True, help="input ROOT files")
parser.add_argument("-n", "--name", type=str, required=True,
help="dataset name (required when reading back into twaml.data.dataset)")
parser.add_argument("-o", "--out-file", type=str, required=True,
help="Output h5 file (existing file will be overwritten)")
parser.add_argument("-b", "--branches", type=str, nargs="+", required=False,
help="branches to save (defaults to all)")
parser.add_argument("--tree-name", type=str, required=False, default="WtLoop_nominal", help="tree name")
parser.add_argument("--weight-name", type=str, required=False, default="weight_nominal", help="weight branch name")
parser.add_argument("--auxweights", type=str, nargs="+", required=False, help="extra auxiliary weights to save")
parser.add_argument("--selection", type=str, required=False,
help=("A selection string or YAML file containing a map of selections "
"(see `selection` argument docs in `twaml.data.from_root`)"))
parser.add_argument("--detect-weights", action="store_true",
help="detect weights in the dataset, --auxweights overrides this")
parser.add_argument("--nthreads", type=int, default=1, required=False,
help="number of threads to use via ThreadPoolExecutor")
parser.add_argument("--aggro-strip", action="store_true",
help="call the `aggressively_strip()` function on the dataset before saving")
parser.add_argument("--table-format", action="store_true",
help="Use the 'table' format keyword when calling DataFrame's to_hdf function")
parser.add_argument("--use-lz4", action="store_true", help="Use lz4 compression")
# fmt: on

args = parser.parse_args()

if not args.out_file.endswith(".h5"):
raise ValueError("--out-file argument must end in .h5")

to_hdf_kw = {}
if args.table_format:
to_hdf_kw["format"] = "table"
if args.use_lz4:
to_hdf_kw["complib"] = "blosc:lz4"

## if selection is not none and is a file ending in .yml or .yaml
## we do the yaml based selections. also a shortcut is implemented
## as a special case
Expand Down Expand Up @@ -132,7 +95,7 @@ def root2pytables():
if args.aggro_strip:
with pd.option_context("mode.chained_assignment", None):
sdv.aggressively_strip()
sdv.to_pytables(f"{anchor}_{sdk}.h5")
            sdv.to_pytables(f"{anchor}_{sdk}.h5", to_hdf_kw=to_hdf_kw)
return 0

## otherwise just take the string or None
Expand All @@ -149,6 +112,6 @@ def root2pytables():
aggressively_strip=args.aggro_strip,
wtloop_meta=True,
)
ds.to_pytables(args.out_file)
ds.to_pytables(args.out_file, to_hdf_kw=to_hdf_kw)

return 0
55 changes: 26 additions & 29 deletions twaml/data.py
@@ -1,22 +1,20 @@
# -*- coding: utf-8 -*-

"""This module contains a class (and functions to load it) which
"""
This module contains a class (and functions to load it) which
abstracts datasets using ``pandas.DataFrames`` as the payload for
feeding to machine learning frameworks and other general data
investigating.
"""

import re
from pathlib import PosixPath
from typing import List, Tuple, Optional, Union, Dict, Any
from concurrent.futures import ThreadPoolExecutor
import logging
import uproot
import pandas as pd
import h5py
import numpy as np
import re
import yaml
from pathlib import PosixPath
from typing import List, Tuple, Optional, Union, Dict
from concurrent.futures import ThreadPoolExecutor
import logging

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -113,12 +111,11 @@ def _init_skeleton(
def _combine_wtloop_metas(meta1, meta2) -> Optional[dict]:
if meta1 is not None and meta2 is not None:
return {**meta1, **meta2}
elif meta1 is None and meta2 is not None:
if meta1 is None and meta2 is not None:
return {**meta2}
elif meta1 is not None and meta2 is None:
if meta1 is not None and meta2 is None:
return {**meta1}
else:
return None
return None

def has_payload(self) -> bool:
"""check if dataframe and weights are non empty"""
Expand Down Expand Up @@ -196,7 +193,7 @@ def initial_state(self) -> str:
return "unknown"

init_states = set()
for k, v in self.wtloop_metas.items():
for _, v in self.wtloop_metas.items():
init_states.add(v["initial_state"])
if len(init_states) == 1:
for elem in init_states:
Expand All @@ -215,7 +212,7 @@ def dsid(self) -> int:
return 999999

dsids = set()
for k, v in self.wtloop_metas.items():
for _, v in self.wtloop_metas.items():
dsids.add(v["dsid"])
if len(dsids) == 1:
for elem in dsids:
Expand Down Expand Up @@ -258,8 +255,8 @@ def _set_df_and_weights(
self._auxweights = auxw

def keep_columns(self, cols: List[str]) -> None:

"""Drop all columns not included in ``cols``
"""
Drop all columns not included in ``cols``
Parameters
----------
Expand Down Expand Up @@ -375,7 +372,6 @@ def change_weights(self, wname: str) -> None:
self._auxweights.drop(columns=[wname], inplace=True)

def append(self, other: "dataset") -> None:

        """Append a dataset to an existing one
We perform concatenations of the dataframes and weights to
Expand Down Expand Up @@ -412,7 +408,7 @@ def append(self, other: "dataset") -> None:
else:
self._auxweights = None

def to_pytables(self, file_name: str) -> None:
def to_pytables(self, file_name: str, to_hdf_kw: Optional[Dict[str, Any]] = None) -> None:
"""Write dataset to disk as a pytables h5 file
This method saves a file using a strict twaml-compatible
Expand All @@ -438,6 +434,8 @@ def to_pytables(self, file_name: str) -> None:
----------
file_name:
output file name,
        to_hdf_kw:
            dict of keyword arguments fed to :meth:`pd.DataFrame.to_hdf`
Examples
--------
Expand All @@ -450,17 +448,17 @@ def to_pytables(self, file_name: str) -> None:
'myds'
"""

if to_hdf_kw is None:
to_hdf_kw = {}
log.info(f"Creating pytables dataset with name '{self.name}' in {file_name}")
log.info(f" selection used: '{self.selection_formula}'")
log.info(f" according to the dataset class the original source was:")
for f in self.files:
log.info(f" - {f}")

for fname in self.files:
log.info(f" - {fname}")
if PosixPath(file_name).exists():
log.warning(f"{file_name} exists, overwriting")
weights_frame = pd.DataFrame(dict(weights=self._weights))
self._df.to_hdf(file_name, f"{self.name}_payload", mode="w")
self._df.to_hdf(file_name, f"{self.name}_payload", mode="w", **to_hdf_kw)
weights_frame.to_hdf(file_name, f"{self.name}_{self.weight_name}", mode="a")
if self._auxweights is not None:
self._auxweights.to_hdf(file_name, f"{self.name}_auxweights", mode="a")
Expand Down Expand Up @@ -585,7 +583,8 @@ def from_root(
wtloop_meta: bool = False,
TeXlabel: Optional[str] = None,
) -> "dataset":
"""Initialize a dataset from ROOT files
"""
Initialize a dataset from ROOT files
Parameters
----------
Expand All @@ -608,7 +607,7 @@ def from_root(
auxlabel:
Give the dataset an integer auxiliary label
allow_weights_in_df:
Allow "^weight_\w+" branches in the payload dataframe
Allow "^weight_\\w+" branches in the payload dataframe
aggressively_strip:
Call :meth:`twaml.data.dataset.aggressively_strip` during construction
auxweights:
Expand Down Expand Up @@ -727,9 +726,7 @@ def from_root(
frame_list.append(raw_f)
if w_branches is not None:
aux_frame_list.append(raw_aw)
assert len(raw_w) == len(
raw_aw
), "aux weight length and weight length different"
assert len(raw_w) == len(raw_aw), "aux weight length and weight length different"

weights_array = np.concatenate(weight_list)
df = pd.concat(frame_list)
Expand Down

0 comments on commit 3ad4f2a

Please sign in to comment.