Commit 9d01fbb

doc: formatting

cwoac committed Nov 25, 2023
1 parent fda2e78 commit 9d01fbb
Showing 15 changed files with 731 additions and 241 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -1,4 +1,4 @@
# changelog
# Changelog



4 changes: 1 addition & 3 deletions src/doenut/data/data_set.py
@@ -2,9 +2,7 @@


class DataSet:
"""
A dataset that has had all it's modifiers applied.
"""
"""A dataset that has had all it's modifiers applied."""

def __init__(self, inputs, responses):
self.inputs = inputs
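As context for the change above: DataSet is simply the frozen pair of processed frames that ModifiableDataSet.get() (see the next file) hands back. A minimal sketch of building one directly; the column names are invented and the import path is assumed to follow the file layout shown here.

    import pandas as pd
    from doenut.data.data_set import DataSet

    # hypothetical design factors and measured responses
    inputs = pd.DataFrame({"temp": [20, 40], "conc": [0.1, 0.2]})
    responses = pd.DataFrame({"yield": [0.55, 0.71]})

    ds = DataSet(inputs, responses)
    print(ds.inputs.shape, ds.responses.shape)  # the two frames are stored as given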
92 changes: 71 additions & 21 deletions src/doenut/data/modifiable_data_set.py
@@ -19,8 +19,7 @@


class ModifiableDataSet:
"""
Typically when doing DoE you will want to apply various modifiers such as
"""Typically when doing DoE you will want to apply various modifiers such as
scaling or filtering of columns to your dataset. ModifiableDataSet is
DoENUT's mechanism to provide this.
@@ -40,6 +39,13 @@ class ModifiableDataSet:
per the builder pattern - i.e. so you can write code like:
C{dataset = ModifiableDataset(inputs,responses).filter(list).scale()}
Parameters
----------
Returns
-------
"""

def __init__(self, inputs: pd.DataFrame, responses: pd.DataFrame) -> None:
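To make the builder-pattern usage in the docstring above concrete, here is a minimal sketch of the chained calls it describes (note the class is ModifiableDataSet, despite the ModifiableDataset spelling in the C{...} example). The DataFrames and column names are invented for illustration, and the import path is assumed to follow the file layout shown above.

    import pandas as pd
    from doenut.data.modifiable_data_set import ModifiableDataSet

    # hypothetical DoE table: two factors and one measured response
    inputs = pd.DataFrame(
        {"temp": [20, 20, 40, 60], "conc": [0.1, 0.1, 0.2, 0.3]}
    )
    responses = pd.DataFrame({"yield": [0.50, 0.52, 0.66, 0.71]})

    # chain modifiers builder-style, then resolve to a plain DataSet
    processed = (
        ModifiableDataSet(inputs, responses)
        .filter(input_selector=["temp", "conc"])  # keep both factor columns
        .scale()                                  # orthographic scaling to -1...1
        .get()                                    # apply the stack, return a DataSet
    )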
@@ -65,11 +71,22 @@ def get(self) -> DataSet:
def add_modifier(
self, modifier: Type["DataSetModifier"], **kwargs
) -> "ModifiableDataSet":
"""
Adds a new modifier to the stack.
"""Adds a new modifier to the stack.
Parameters
----------
modifier :
The new modifier to add
kwargs :
Any additional arguments the modifier is expecting.
modifier: Type["DataSetModifier"] :
**kwargs :
Returns
-------
@param modifier: The new modifier to add
@param kwargs: Any additional arguments the modifier is expecting.
"""
logger.info(f"Applying {modifier} to dataset")
modifier = modifier(self._proc_inputs, self._proc_responses, **kwargs)
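The convenience methods further down (filter, scale, drop_duplicates, ...) all delegate to add_modifier. A sketch of calling it directly with one of the bundled modifier classes, reusing the hypothetical frames from the previous sketch; the keyword argument mirrors what filter() appears to pass through and is an assumption rather than something taken verbatim from the source.

    from doenut.data.modifiers.column_selector import ColumnSelector

    dataset = ModifiableDataSet(inputs, responses)
    # roughly equivalent to dataset.filter(input_selector=["temp"])
    dataset = dataset.add_modifier(ColumnSelector, input_selector=["temp"])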
@@ -85,15 +102,27 @@ def filter(
input_selector: List["str | int"] = None,
response_selector: List["str | int"] = None,
) -> "ModifiableDataSet":
"""
Select a subset of the columns in this dataset.
"""Select a subset of the columns in this dataset.
You must specify at least one selector.
Each selector can be either a list of column names or indices
that you wish to keep.
@param input_selector: Filter for the input data
@param response_selector: Filter for the response data
@return: this dataset
Parameters
----------
input_selector :
Filter for the input data
response_selector :
Filter for the response data
input_selector: List["str | int"] :
(Default value = None)
response_selector: List["str | int"] :
(Default value = None)
Returns
-------
type
this dataset
"""
return self.add_modifier(
ColumnSelector,
@@ -102,35 +131,56 @@ def filter(
)
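A short sketch of the two selector forms the docstring allows - column names or positional indices - using the same hypothetical frames as in the earlier sketches.

    # keep input columns by name
    by_name = ModifiableDataSet(inputs, responses).filter(input_selector=["temp", "conc"])

    # or keep them by positional index; either form is accepted
    by_index = ModifiableDataSet(inputs, responses).filter(input_selector=[0, 1])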

def scale(self, scale_responses: bool = False) -> "ModifiableDataSet":
"""
Apply an orthographic scaling to the dataset
"""Apply an orthographic scaling to the dataset
i.e. apply a linear scaling so each column is in the range -1...1
@param scale_responses: Whether to scale the response data as well
@return: this dataset
Parameters
----------
scale_responses :
Whether to scale the response data as well
scale_responses: bool :
(Default value = False)
Returns
-------
type
this dataset
"""
return self.add_modifier(OrthoScaler, scale_responses=scale_responses)
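A sketch contrasting the default behaviour (scale the inputs only) with scaling the responses as well, again with the hypothetical frames from above.

    # scale only the inputs (default behaviour)
    scaled_inputs = ModifiableDataSet(inputs, responses).scale().get()

    # scale the responses too
    scaled_both = ModifiableDataSet(inputs, responses).scale(scale_responses=True).get()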

def drop_duplicates(self) -> "ModifiableDataSet":
"""
Removes all duplicate rows from the dataset. The first instance of
"""Removes all duplicate rows from the dataset. The first instance of
each duplicate will be kept.
NOTE: while only the inputs are considered for whether a row is a
duplicate or not, duplicates will be removed from both inputs and
responses.
@return: self
Parameters
----------
Returns
-------
type
self
"""
return self.add_modifier(DuplicateRemover)
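A sketch with a deliberately duplicated input row; per the docstring, only the first occurrence should survive, in both the inputs and the responses. The data are made up.

    import pandas as pd

    # rows 0 and 1 share identical inputs but have different responses
    inputs = pd.DataFrame({"temp": [20, 20, 40], "conc": [0.1, 0.1, 0.2]})
    responses = pd.DataFrame({"yield": [0.50, 0.54, 0.66]})

    deduped = ModifiableDataSet(inputs, responses).drop_duplicates().get()
    # expected: two rows remain, and the kept response for the duplicated pair is 0.50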

def average_duplicates(self) -> "ModifiableDataSet":
"""
Removes all duplicate rows from the dataset. The first instance of
"""Removes all duplicate rows from the dataset. The first instance of
each duplicate will be kept, and its responses set to the average of
all the rows that matched it.
NOTE: only inputs values are considered for whether a row is a
duplicate or not
@return: self
Parameters
----------
Returns
-------
type
self
"""
return self.add_modifier(DuplicateAverager)
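Using the same duplicated rows as in the previous sketch, average_duplicates should keep the first row but replace its response with the duplicates' mean.

    averaged = ModifiableDataSet(inputs, responses).average_duplicates().get()
    # expected: two rows remain; the surviving response is (0.50 + 0.54) / 2 = 0.52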
48 changes: 32 additions & 16 deletions src/doenut/data/modifiers/column_selector.py
@@ -6,19 +6,46 @@
class ColumnSelector(DataSetModifier):
"""
DataSet Modifier to remove columns from the dataset
Parameters
----------
inputs : pd.DataFrame
The dataset's inputs
responses : pd.DataFrame
The dataset's responses
input_selector : List["str | int"], optional
A list to filter the inputs by
response_selector : List["str | int"], optional
A list to filter the responses by
Warnings
--------
At least one of ``input_selector`` and ``response_selector`` must be specified.
"""


@classmethod
def _parse_selector(
cls, data: pd.DataFrame, selector: List["str | int"]
) -> Tuple[List[str], List[int]]:
"""
Internal helper function to take either a list of column names or
"""Internal helper function to take either a list of column names or
column indices and convert it to the other.
@param data: The data set the list applies to
@param selector: The known selector list
@return: Tuple of the column names and indices as lists
Parameters
----------
data : pd.DataFrame
The data set the list applies to
selector : List["str | int"]
The known selector list
Returns
-------
List[str]:
The list of column names selected
List[int]:
The list of column indices selected
"""
if isinstance(selector[0], str): # columns provided
# First validate it
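Only the name-based branch of _parse_selector is visible here; the following sketch shows the behaviour implied by the signature and docstring - whichever form is supplied, both the column names and the column indices come back. This is an internal classmethod, so the exact return values are an assumption for illustration.

    import pandas as pd

    data = pd.DataFrame({"temp": [20], "conc": [0.1], "time": [5]})

    # given names, expect the matching indices to be derived
    names, indices = ColumnSelector._parse_selector(data, ["temp", "time"])
    # expected: names == ["temp", "time"], indices == [0, 2]

    # given indices, expect the matching names to be derived
    names, indices = ColumnSelector._parse_selector(data, [0, 2])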
@@ -43,17 +70,6 @@ def __init__(
input_selector: List["str | int"] = None,
response_selector: List["str | int"] = None,
):
"""
Used to filter specific columns from a dataset. Note that at least one
of input_selector and response_selector must be specified.
The selector should be a list of either column names or column indices
@param inputs: The dataset's inputs
@param responses: The dataset's responses
@param input_selector: A list to filter the inputs by
@param response_selector: A list to filter the responses by
"""
super().__init__(inputs, responses)
# Validate inputs
if input_selector is None and response_selector is None:
52 changes: 34 additions & 18 deletions src/doenut/data/modifiers/data_set_modifier.py
@@ -3,41 +3,57 @@


class DataSetModifier(ABC):
"""
Parent class for all types of modifier.
"""Parent class for all types of modifier.
They take a dataset in, perform some form of operation on it and then
pass it along
Parameters
----------
inputs : pd.DataFrame
The dataset's inputs
responses : pd.DataFrame
The dataset's responses
\*\*kwargs : dict, optional
Any extra arguments needed by individual modifiers.
Note
----
This is an abstract class and should not be used directly.
"""

def __init__(
self, inputs: pd.DataFrame, responses: pd.DataFrame, **kwargs
):
"""
Does nothing, but defines the constructor for all other DataSets
@param inputs: the processed inputs up till this point
@param responses: the processed responses up till this point
Use this to do things like check the size and ranges of the dataset.
@param kwargs: any other arguments the modifier needs
"""
pass

@abstractmethod
def apply_to_inputs(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Applies the modifier to the inputs of the dataset.
"""Applies the modifier to the inputs of the dataset.
Parameters
----------
data : pd.DataFrame
The input data
@param data: The input data
@return: The data post modification.
Returns
-------
pd.DataFrame:
The modified input data
"""
pass

@abstractmethod
def apply_to_responses(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Applies the modifier to the responses of the dataset.
"""Applies the modifier to the responses of the dataset.
Parameters
----------
data : pd.DataFrame
The response data
@param data: The response data
@return: The data post modification.
Returns
-------
pd.DataFrame:
The modified response data
"""
pass
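Since the two abstract methods above are the whole contract, a hypothetical subclass (not part of DoENUT) illustrates what a custom modifier has to provide - here a mean-centring step that leaves the responses alone:

    import pandas as pd
    from doenut.data.modifiers.data_set_modifier import DataSetModifier


    class MeanCentrer(DataSetModifier):
        """Hypothetical modifier: subtract each input column's mean."""

        def __init__(self, inputs: pd.DataFrame, responses: pd.DataFrame, **kwargs):
            super().__init__(inputs, responses, **kwargs)
            self.means = inputs.mean()  # statistics gathered at construction time

        def apply_to_inputs(self, data: pd.DataFrame) -> pd.DataFrame:
            return data - self.means

        def apply_to_responses(self, data: pd.DataFrame) -> pd.DataFrame:
            return data  # responses pass through untouched

It could then be attached with ModifiableDataSet.add_modifier(MeanCentrer), just like the bundled modifiers.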
21 changes: 9 additions & 12 deletions src/doenut/data/modifiers/duplicate_averager.py
@@ -4,11 +4,17 @@


class DuplicateAverager(DuplicateRemover):
"""
Parses a dataset and removes all but the _first_ instance of any row that
has duplicate values for the _inputs_. Will also remove the corresponding
"""Parses a dataset and removes all but the *first* instance of any row that
has duplicate values for the *inputs*. Will also remove the corresponding
row in the responses, replacing the remaining response with the averages
of the duplicates' values.
Parameters
----------
inputs : pd.DataFrame
The dataset's inputs
responses : pd.DataFrame
The dataset's responses
"""

@classmethod
@@ -28,15 +34,6 @@ def _apply(
return results

def __init__(self, inputs: pd.DataFrame, responses: pd.DataFrame) -> None:
"""
This modifier will remove all rows from the dataset which have
identical values for the I{inputs}, and set the response value to be
the average of all the duplicates. The first instance in the dataset
of a given set of values will be the one retained.
@param inputs: The inputs of the dataset
@param responses: The responses of the dataset
"""
super().__init__(inputs, responses)

def apply_to_inputs(self, data: pd.DataFrame) -> pd.DataFrame:
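Direct use of this modifier (the route average_duplicates() takes internally), reusing the duplicated-row frames from the modifiable_data_set.py sketches; a sketch only, since the class is normally driven through ModifiableDataSet.

    from doenut.data.modifiers.duplicate_averager import DuplicateAverager

    averager = DuplicateAverager(inputs, responses)
    new_inputs = averager.apply_to_inputs(inputs)            # duplicate input rows dropped
    new_responses = averager.apply_to_responses(responses)   # survivor holds the duplicates' mean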
20 changes: 9 additions & 11 deletions src/doenut/data/modifiers/duplicate_remover.py
@@ -4,10 +4,16 @@


class DuplicateRemover(DataSetModifier):
"""
Parses a dataset and removes all but the _first_ instance of any row that
has duplicate values for the _inputs_. Will also remove the corresponding
"""Parses a dataset and removes all but the *first* instance of any row that
has duplicate values for the *inputs*. Will also remove the corresponding
row in the responses.
Parameters
----------
inputs : pd.DataFrame
The dataset's inputs
responses : pd.DataFrame
The dataset's responses
"""

@classmethod
@@ -50,14 +56,6 @@ def _get_non_duplicate_rows(
return non_duplicates

def __init__(self, inputs: pd.DataFrame, responses: pd.DataFrame) -> None:
"""
This modifier will remove all rows from the dataset which have
identical values for the _inputs_. The first instance in the dataset
of a given set of values will be the one retained.
@param inputs: The inputs of the dataset
@param responses: The responses of the dataset
"""
super().__init__(inputs, responses)
# use input data to determine which rows are duplicates

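And the plain remover, for comparison - same sketch data as above, but the surviving response keeps its original value rather than being replaced by an average.

    from doenut.data.modifiers.duplicate_remover import DuplicateRemover

    remover = DuplicateRemover(inputs, responses)
    kept_inputs = remover.apply_to_inputs(inputs)            # expected: 2 of the 3 rows survive
    kept_responses = remover.apply_to_responses(responses)   # kept in step with the inputs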