Skip to content

Commit

Permalink
add dataset __getitem__ for masking/indexing
Browse files Browse the repository at this point in the history
have twaml-root2pytables use it (decrease memory usage)
  • Loading branch information
douglasdavis committed May 28, 2019
1 parent c3b6c64 commit db65100
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 15 deletions.
11 changes: 6 additions & 5 deletions twaml/_apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,14 @@ def root2pytables():
wtloop_meta=True,
)

selected_dses = full_ds.apply_selections(selection_yaml)
selected_masks = full_ds.selection_masks(selection_yaml)
anchor = args.out_file.split(".h5")[0]
for sdk, sdv in selected_dses.items():
for sdk, sdv in selected_masks.items():
temp_ds = full_ds[sdv]
if args.aggro_strip:
with pd.option_context("mode.chained_assignment", None):
sdv.aggressively_strip()
sdv.to_pytables(f"{anchor}_{sdk}.h5", to_hdf_kw=to_hdf_hw)
temp_ds.aggressively_strip()
temp_ds.to_pytables(f"{anchor}_{sdk}.h5", to_hdf_kw=to_hdf_kw)
del temp_ds
return 0

## otherwise just take the string or None
Expand Down
62 changes: 52 additions & 10 deletions twaml/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,16 +107,6 @@ def _init_skeleton(
self.auxlabel = auxlabel
self.TeXlabel = TeXlabel

@staticmethod
def _combine_wtloop_metas(meta1, meta2) -> Optional[dict]:
    """Merge two optional wtloop metadata dictionaries.

    Parameters
    ----------
    meta1:
        First metadata dictionary, or ``None``.
    meta2:
        Second metadata dictionary, or ``None``. On key collisions its
        values take precedence over those of ``meta1``.

    Returns
    -------
    Optional[dict]
        A new dictionary with the entries of whichever inputs are not
        ``None``; ``None`` if both inputs are ``None``.
    """
    if meta1 is None:
        # Nothing on the left: result is a copy of the right side, if any.
        return None if meta2 is None else {**meta2}
    if meta2 is None:
        return {**meta1}
    return {**meta1, **meta2}

def has_payload(self) -> bool:
"""check if dataframe and weights are non empty"""
has_df = not self._df.empty
Expand Down Expand Up @@ -244,6 +234,28 @@ def __str__(self) -> str:
"""standard str"""
return f"dataset(name={self.name})"

def __getitem__(self, idx) -> "dataset":
    """Return a new dataset holding the rows selected by ``idx``.

    Parameters
    ----------
    idx:
        Boolean mask, array-like of integer row positions, or a slice.
        The selection is positional and is applied in the same way to
        the dataframe, the weights and (if present) the auxiliary
        weights.

    Returns
    -------
    dataset
        A new dataset initialised with this dataset's files, name,
        weight/tree names and labels, containing only the selected rows.
    """
    # ``DataFrame[...]`` interprets an integer array as *column* labels,
    # whereas the NumPy weights array interprets it as row positions.
    # Selecting rows through ``.iloc`` keeps the frames and the weights
    # consistent for boolean masks, integer arrays and slices alike.
    rows = idx if isinstance(idx, slice) else np.asarray(idx)
    new_df = self._df.iloc[rows]
    new_w = self._weights[rows]
    if self._auxweights is None:
        new_aw = None
    else:
        new_aw = self._auxweights.iloc[rows]
    new_ds = dataset()
    new_ds._init_skeleton(
        self.files,
        self.name,
        weight_name=self.weight_name,
        tree_name=self.tree_name,
        label=self.label,
        auxlabel=self.auxlabel,
        TeXlabel=self.TeXlabel,
    )
    # The metadata object is shared with the parent dataset, not copied.
    new_ds.wtloop_metas = self.wtloop_metas
    new_ds._set_df_and_weights(new_df, new_w, auxw=new_aw)
    return new_ds

def _set_df_and_weights(
self, df: pd.DataFrame, w: np.ndarray, auxw: Optional[pd.DataFrame] = None
) -> None:
Expand All @@ -254,6 +266,16 @@ def _set_df_and_weights(
assert len(df) == len(auxw), "unequal length df and auxw weights"
self._auxweights = auxw

@staticmethod
def _combine_wtloop_metas(meta1, meta2) -> Optional[dict]:
    """Combine two optional wtloop metadata dictionaries into one.

    Parameters
    ----------
    meta1:
        First metadata dictionary, or ``None``.
    meta2:
        Second metadata dictionary, or ``None``. Where both inputs
        define the same key, the value from ``meta2`` is kept.

    Returns
    -------
    Optional[dict]
        A freshly built dictionary containing the entries of every
        input that is not ``None``, or ``None`` when both are ``None``.
    """
    if meta1 is None:
        # Only the right-hand side (if present) contributes.
        return None if meta2 is None else {**meta2}
    if meta2 is None:
        return {**meta1}
    return {**meta1, **meta2}

def keep_columns(self, cols: List[str]) -> None:
"""
Drop all columns not included in ``cols``
Expand Down Expand Up @@ -512,6 +534,26 @@ def __add__(self, other: "dataset") -> "dataset":
new_ds._set_df_and_weights(new_df, new_weights, auxw=new_aw)
return new_ds

def selection_masks(self, selections: Dict[str, str]) -> Dict[str, np.ndarray]:
    """Calculate one mask array per selection.

    Each selection string is evaluated against the dataset's dataframe
    with ``DataFrame.eval`` and the result is converted to a NumPy
    array. The arrays are intended to be used as boolean masks.

    Parameters
    ----------
    selections:
        Dictionary of selections in the form ``{ name : selection }``.

    Returns
    -------
    Dict[str, np.ndarray]
        A dictionary of ``{ selection name : bool array }`` with one
        entry per selection, in the order the selections were given.
    """
    return {
        sel_key: np.asarray(self.df.eval(sel_val))
        for sel_key, sel_val in selections.items()
    }

def apply_selections(self, selections: Dict[str, str]) -> Dict[str, "dataset"]:
"""Based on a dictionary of selections, break the dataset into a set
of multiple (finer grained) datasets.
Expand Down

0 comments on commit db65100

Please sign in to comment.