Skip to content

Commit

Permalink
add to_hdf_kw named arg to dataset.to_pytables, some formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
douglasdavis committed May 22, 2019
1 parent 1c1c13e commit 3ad4f2a
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 100 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,3 +1,3 @@
[tool.black]
line-length = 92
line-length = 94
target-version = ['py36', 'py37']
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -6,3 +6,4 @@ scikit-learn>=0.20
tables>=3.4
numexpr>=2.6.5
h5py>=2.8.0
lz4>=2.1.5
103 changes: 33 additions & 70 deletions twaml/_apps.py
Expand Up @@ -23,80 +23,43 @@ def root2pytables():
"twaml.data.dataset.to_pytables"
)
)
parser.add_argument(
"-i", "--input-files", type=str, nargs="+", required=True, help="input ROOT files"
)
parser.add_argument(
"-n",
"--name",
type=str,
required=True,
help="dataset name (required when reading back into twaml.data.dataset)",
)
parser.add_argument(
"-o",
"--out-file",
type=str,
required=True,
help="Output h5 file (existing file will be overwritten)",
)
parser.add_argument(
"-b",
"--branches",
type=str,
nargs="+",
required=False,
help="branches to save (defaults to all)",
)
parser.add_argument(
"--tree-name", type=str, required=False, default="WtLoop_nominal", help="tree name"
)
parser.add_argument(
"--weight-name",
type=str,
required=False,
default="weight_nominal",
help="weight branch name",
)
parser.add_argument(
"--auxweights",
type=str,
nargs="+",
required=False,
help="extra auxiliary weights to save",
)
parser.add_argument(
"--selection",
type=str,
required=False,
help=(
"A selection string or YAML file containing a map of selections "
"(see `selection` argument docs in `twaml.data.from_root`)"
),
)
parser.add_argument(
"--detect-weights",
action="store_true",
help="detect weights in the dataset, --auxweights overrides this",
)
parser.add_argument(
"--nthreads",
type=int,
default=1,
required=False,
help="number of threads to use via ThreadPoolExecutor",
)
parser.add_argument(
"--aggro-strip",
action="store_true",
help="call the `aggressively_strip()` function on the dataset before saving",
)

# fmt: off
parser.add_argument("-i", "--input-files", type=str, nargs="+", required=True, help="input ROOT files")
parser.add_argument("-n", "--name", type=str, required=True,
help="dataset name (required when reading back into twaml.data.dataset)")
parser.add_argument("-o", "--out-file", type=str, required=True,
help="Output h5 file (existing file will be overwritten)")
parser.add_argument("-b", "--branches", type=str, nargs="+", required=False,
help="branches to save (defaults to all)")
parser.add_argument("--tree-name", type=str, required=False, default="WtLoop_nominal", help="tree name")
parser.add_argument("--weight-name", type=str, required=False, default="weight_nominal", help="weight branch name")
parser.add_argument("--auxweights", type=str, nargs="+", required=False, help="extra auxiliary weights to save")
parser.add_argument("--selection", type=str, required=False,
help=("A selection string or YAML file containing a map of selections "
"(see `selection` argument docs in `twaml.data.from_root`)"))
parser.add_argument("--detect-weights", action="store_true",
help="detect weights in the dataset, --auxweights overrides this")
parser.add_argument("--nthreads", type=int, default=1, required=False,
help="number of threads to use via ThreadPoolExecutor")
parser.add_argument("--aggro-strip", action="store_true",
help="call the `aggressively_strip()` function on the dataset before saving")
parser.add_argument("--table-format", action="store_true",
help="Use the 'table' format keyword when calling DataFrame's to_hdf function")
parser.add_argument("--use-lz4", action="store_true", help="Use lz4 compression")
# fmt: on

args = parser.parse_args()

if not args.out_file.endswith(".h5"):
raise ValueError("--out-file argument must end in .h5")

to_hdf_kw = {}
if args.table_format:
to_hdf_kw["format"] = "table"
if args.use_lz4:
to_hdf_kw["complib"] = "blosc:lz4"

## if selection is not none and is a file ending in .yml or .yaml
## we do the yaml based selections. also a shortcut is implemented
## as a special case
Expand Down Expand Up @@ -132,7 +95,7 @@ def root2pytables():
if args.aggro_strip:
with pd.option_context("mode.chained_assignment", None):
sdv.aggressively_strip()
sdv.to_pytables(f"{anchor}_{sdk}.h5")
            sdv.to_pytables(f"{anchor}_{sdk}.h5", to_hdf_kw=to_hdf_kw)
return 0

## otherwise just take the string or None
Expand All @@ -149,6 +112,6 @@ def root2pytables():
aggressively_strip=args.aggro_strip,
wtloop_meta=True,
)
ds.to_pytables(args.out_file)
ds.to_pytables(args.out_file, to_hdf_kw=to_hdf_kw)

return 0
55 changes: 26 additions & 29 deletions twaml/data.py
@@ -1,22 +1,20 @@
# -*- coding: utf-8 -*-

"""This module contains a class (and functions to load it) which
"""
This module contains a class (and functions to load it) which
abstracts datasets using ``pandas.DataFrames`` as the payload for
feeding to machine learning frameworks and other general data
investigating.
"""

import re
from pathlib import PosixPath
from typing import List, Tuple, Optional, Union, Dict, Any
from concurrent.futures import ThreadPoolExecutor
import logging
import uproot
import pandas as pd
import h5py
import numpy as np
import re
import yaml
from pathlib import PosixPath
from typing import List, Tuple, Optional, Union, Dict
from concurrent.futures import ThreadPoolExecutor
import logging

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -113,12 +111,11 @@ def _init_skeleton(
def _combine_wtloop_metas(meta1, meta2) -> Optional[dict]:
if meta1 is not None and meta2 is not None:
return {**meta1, **meta2}
elif meta1 is None and meta2 is not None:
if meta1 is None and meta2 is not None:
return {**meta2}
elif meta1 is not None and meta2 is None:
if meta1 is not None and meta2 is None:
return {**meta1}
else:
return None
return None

def has_payload(self) -> bool:
"""check if dataframe and weights are non empty"""
Expand Down Expand Up @@ -196,7 +193,7 @@ def initial_state(self) -> str:
return "unknown"

init_states = set()
for k, v in self.wtloop_metas.items():
for _, v in self.wtloop_metas.items():
init_states.add(v["initial_state"])
if len(init_states) == 1:
for elem in init_states:
Expand All @@ -215,7 +212,7 @@ def dsid(self) -> int:
return 999999

dsids = set()
for k, v in self.wtloop_metas.items():
for _, v in self.wtloop_metas.items():
dsids.add(v["dsid"])
if len(dsids) == 1:
for elem in dsids:
Expand Down Expand Up @@ -258,8 +255,8 @@ def _set_df_and_weights(
self._auxweights = auxw

def keep_columns(self, cols: List[str]) -> None:

"""Drop all columns not included in ``cols``
"""
Drop all columns not included in ``cols``
Parameters
----------
Expand Down Expand Up @@ -375,7 +372,6 @@ def change_weights(self, wname: str) -> None:
self._auxweights.drop(columns=[wname], inplace=True)

def append(self, other: "dataset") -> None:

        """Append a dataset to an existing one
We perform concatenations of the dataframes and weights to
Expand Down Expand Up @@ -412,7 +408,7 @@ def append(self, other: "dataset") -> None:
else:
self._auxweights = None

def to_pytables(self, file_name: str) -> None:
def to_pytables(self, file_name: str, to_hdf_kw: Optional[Dict[str, Any]] = None) -> None:
"""Write dataset to disk as a pytables h5 file
This method saves a file using a strict twaml-compatible
Expand All @@ -438,6 +434,8 @@ def to_pytables(self, file_name: str) -> None:
----------
file_name:
output file name,
        to_hdf_kw:
            dict of keyword arguments fed to :meth:`pd.DataFrame.to_hdf`
Examples
--------
Expand All @@ -450,17 +448,17 @@ def to_pytables(self, file_name: str) -> None:
'myds'
"""

if to_hdf_kw is None:
to_hdf_kw = {}
log.info(f"Creating pytables dataset with name '{self.name}' in {file_name}")
log.info(f" selection used: '{self.selection_formula}'")
log.info(f" according to the dataset class the original source was:")
for f in self.files:
log.info(f" - {f}")

for fname in self.files:
log.info(f" - {fname}")
if PosixPath(file_name).exists():
log.warning(f"{file_name} exists, overwriting")
weights_frame = pd.DataFrame(dict(weights=self._weights))
self._df.to_hdf(file_name, f"{self.name}_payload", mode="w")
self._df.to_hdf(file_name, f"{self.name}_payload", mode="w", **to_hdf_kw)
weights_frame.to_hdf(file_name, f"{self.name}_{self.weight_name}", mode="a")
if self._auxweights is not None:
self._auxweights.to_hdf(file_name, f"{self.name}_auxweights", mode="a")
Expand Down Expand Up @@ -585,7 +583,8 @@ def from_root(
wtloop_meta: bool = False,
TeXlabel: Optional[str] = None,
) -> "dataset":
"""Initialize a dataset from ROOT files
"""
Initialize a dataset from ROOT files
Parameters
----------
Expand All @@ -608,7 +607,7 @@ def from_root(
auxlabel:
Give the dataset an integer auxiliary label
allow_weights_in_df:
Allow "^weight_\w+" branches in the payload dataframe
Allow "^weight_\\w+" branches in the payload dataframe
aggressively_strip:
Call :meth:`twaml.data.dataset.aggressively_strip` during construction
auxweights:
Expand Down Expand Up @@ -727,9 +726,7 @@ def from_root(
frame_list.append(raw_f)
if w_branches is not None:
aux_frame_list.append(raw_aw)
assert len(raw_w) == len(
raw_aw
), "aux weight length and weight length different"
assert len(raw_w) == len(raw_aw), "aux weight length and weight length different"

weights_array = np.concatenate(weight_list)
df = pd.concat(frame_list)
Expand Down

0 comments on commit 3ad4f2a

Please sign in to comment.