In [1]:
import pandas as pd

# Raise pandas' display limits (row count, column count, line width) for printed frames.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

<!-- #region -->
# Custom Functions
You can supply your own custom functions to `row_apply` and `column_apply`.
The examples below show how.


## Row apply

With `row_apply`, the `row_func` is called once per row and receives that row's
values as its input arguments.

### Lambda Functions
In most cases, you can rely on a lambda function if you wish to do this on the fly:
<!-- #endregion -->

In [2]:
from data_fabricator.v1.core.mock_generator import (
    MockDataGenerator,
    BaseTable,
    UniqueId,
    RowApply,
)


class SampleTable(BaseTable):
    # Example table for the row-apply walkthrough: an id column plus two columns
    # derived from it row by row.

    # Number of rows to generate (the printed frame below has 10 rows).
    num_rows = 10
    # Source column. Its values reach the row functions as strings, hence the
    # int() conversions in the lambdas below.
    id = UniqueId()
    # Single input column: `row_func` is a lambda written as a string and is
    # given one `id` value per row.
    id_plus_one = RowApply(
        list_of_values="SampleTable.id", row_func="lambda x: int(x)+1"
    )
    # Several input columns: the row function receives one positional argument
    # per listed column, in list order (args[0] is `id`, args[1] is `id_plus_one`).
    id_plus_id = RowApply(
        list_of_values=["SampleTable.id", "SampleTable.id_plus_one"],
        row_func="lambda *args: int(args[0]) + args[1]",
    )


# Build the generator from the table class with a fixed seed, then generate
# every registered table.
mock_generator = MockDataGenerator(tables=[SampleTable], seed=1)
mock_generator.generate_all()

# Generated tables are looked up by class name; `.dataframe` holds the result.
print(mock_generator.tables["SampleTable"].dataframe.head(10))

   id  id_plus_one  id_plus_id
0   1            2           3
1   2            3           5
2   3            4           7
3   4            5           9
4   5            6          11
5   6            7          13
6   7            8          15
7   8            9          17
8   9           10          19
9  10           11          21


Notice that we can also pass the row values of multiple columns into our custom
function by giving `list_of_values` a list. Also note that `generate_unique_id`
returns a string, so we need to convert it to an integer before doing any arithmetic.

### Custom Functions
If you wish to write more involved functions, you can define them as regular Python
functions and reference them from the configuration, as follows:

In [3]:
import yaml


def my_custom_row_func1(x):
    """Trace the call, then return ``x`` converted to ``int`` and incremented by one.

    Args:
        x: The value of a single row; anything ``int()`` accepts.

    Returns:
        ``int(x) + 1``.
    """
    # Arbitrary logic may live here; the print only shows when the call happens.
    print(f"In my_custom_row_func1, x: {x}")
    as_integer = int(x)
    return as_integer + 1


def my_custom_row_func2(x, y):
    """Trace both arguments, then return ``int(x) + y``.

    Demonstrates a row function that takes more than one argument.

    Args:
        x: First row value; converted with ``int()`` before the addition.
        y: Second row value; added as-is.

    Returns:
        The sum ``int(x) + y``.
    """
    print(f"In my_custom_row_func2, x: {x}")
    print(f"In my_custom_row_func2, y: {y}")
    first_as_integer = int(x)
    return first_as_integer + y


# YAML description of a table named `sample_table` with the same three columns as
# the class-based example (id, id_plus_one, id_plus_id). Each `_target_` is a dotted
# path to the object to use; the `row_func` targets under `__main__` are the
# functions defined earlier in this cell.
# NOTE(review): `_partial_: True` presumably makes the loader hand over the function
# itself rather than call it (Hydra convention) — confirm against
# hydra_instantiate_dictionary.
config_str = """
tables:
- _target_: data_fabricator.v1.core.mock_generator.create_table
  name: sample_table
  num_rows: 10
  columns:
    id:
      _target_: data_fabricator.v1.core.mock_generator.UniqueId
    id_plus_one:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: sample_table.id
      row_func:
        _target_: __main__.my_custom_row_func1
        _partial_: True
    id_plus_id:
      _target_: data_fabricator.v1.core.mock_generator.RowApply
      list_of_values: [sample_table.id, sample_table.id_plus_one]
      row_func:
        _target_: __main__.my_custom_row_func2
        _partial_: True
"""
# Parse the YAML text into plain Python containers; nothing is instantiated yet.
example_config = yaml.safe_load(config_str)

In [4]:
from data_fabricator.v1.nodes.hydra import hydra_instantiate_dictionary

# Turn the parsed YAML into the objects it describes (presumably by resolving each
# `_target_` entry — confirm against hydra_instantiate_dictionary).
# NOTE(review): `example_config` is rebound to the instantiated result, so re-running
# this cell without first re-running the YAML-parsing cell passes the
# already-instantiated objects back in.

example_config = hydra_instantiate_dictionary(example_config)

In [5]:
from data_fabricator.v1.core.mock_generator import MockDataGenerator

# Build the generator from the instantiated `tables` entry of the config (fixed
# seed), then generate every table. The custom row functions are invoked during
# generation, which produces the trace lines in the output.
mock_generator = MockDataGenerator(tables=example_config["tables"], seed=1)
mock_generator.generate_all()

# Prints the mapping of table name to table object.
print(mock_generator.tables)

In my_custom_row_func1, x: 1
In my_custom_row_func1, x: 2
In my_custom_row_func1, x: 3
In my_custom_row_func1, x: 4
In my_custom_row_func1, x: 5
In my_custom_row_func1, x: 6
In my_custom_row_func1, x: 7
In my_custom_row_func1, x: 8
In my_custom_row_func1, x: 9
In my_custom_row_func1, x: 10
In my_custom_row_func2, x: 1
In my_custom_row_func2, y: 2
In my_custom_row_func2, x: 2
In my_custom_row_func2, y: 3
In my_custom_row_func2, x: 3
In my_custom_row_func2, y: 4
In my_custom_row_func2, x: 4
In my_custom_row_func2, y: 5
In my_custom_row_func2, x: 5
In my_custom_row_func2, y: 6
In my_custom_row_func2, x: 6
In my_custom_row_func2, y: 7
In my_custom_row_func2, x: 7
In my_custom_row_func2, y: 8
In my_custom_row_func2, x: 8
In my_custom_row_func2, y: 9
In my_custom_row_func2, x: 9
In my_custom_row_func2, y: 10
In my_custom_row_func2, x: 10
In my_custom_row_func2, y: 11
{'sample_table': sample_table()}


In this particular case the function is defined under `__main__`, but you can
equally give the full import path of a function in your project, for example
`row_func: my_project.module.function_file.function`.

This pattern allows you to unit test specific functions if the logic is more involved.

## Column Apply
In `column_apply`, the entire column is passed into the function. This gives you
access to values across rows, which is useful for operations such as grouping or
sorting.

In [6]:
from data_fabricator.v1.core.mock_generator import MockDataGenerator, ColumnApply
import yaml


def my_custom_col_func1(x):
    """Trace the incoming column and hand it back unchanged.

    Args:
        x: The whole column in one call (a list of string ids in the recorded run).

    Returns:
        The same object ``x`` that was passed in.
    """
    print(f"In my_custom_col_func1, x: {x}")
    return x


def my_custom_col_func2(x, y):
    """Trace two incoming columns and return the first one unchanged.

    Demonstrates a column function that takes more than one column.

    Args:
        x: First column; this is what gets returned.
        y: Second column; only printed.

    Returns:
        The same object ``x`` that was passed in.
    """
    print(f"In my_custom_col_func2, x: {x}")
    print(f"In my_custom_col_func2, y: {y}")
    return x


def reverse_values(x):
    """Return the column's values as integers, sorted from largest to smallest.

    Despite the name, this sorts in descending order rather than reversing the
    input order; the two coincide only when the input is already ascending.

    Args:
        x: Iterable of values that ``int()`` accepts. It is not modified.

    Returns:
        A new list of ``int`` in descending order.
    """
    return sorted((int(value) for value in x), reverse=True)


class SampleTable(BaseTable):
    # Example table for the column-apply walkthrough.
    # NOTE: this re-uses the name `SampleTable` and replaces the class defined in the
    # row-apply example. `BaseTable` and `UniqueId` come from the earlier import cell;
    # this cell imports only `MockDataGenerator` and `ColumnApply`.

    # Number of rows to generate (the printed frame below has 10 rows).
    num_rows = 10
    id = UniqueId()
    # `column_func` is passed as a function object here. It is called once and
    # receives the whole `id` column (a list of strings in the recorded output).
    example_column_apply1 = ColumnApply(
        list_of_values=["SampleTable.id"], column_func=my_custom_col_func1
    )
    # Two input columns: the function receives one argument per listed column, in
    # list order (x is `id`, y is `example_column_apply1`).
    example_column_apply2 = ColumnApply(
        list_of_values=["SampleTable.id", "SampleTable.example_column_apply1"],
        column_func=my_custom_col_func2,
    )
    # Cross-row operation: the ids are converted to int and sorted descending.
    example_column_apply3 = ColumnApply(
        list_of_values=["SampleTable.id"], column_func=reverse_values
    )


# Rebuild the generator with the column-apply version of SampleTable (fixed seed)
# and generate it; the column functions print their inputs during this call.
mock_generator = MockDataGenerator(tables=[SampleTable], seed=1)
mock_generator.generate_all()

# Generated tables are looked up by class name; `.dataframe` holds the result.
print(mock_generator.tables["SampleTable"].dataframe.head(10))

In my_custom_col_func1, x: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
In my_custom_col_func2, x: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
In my_custom_col_func2, y: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
   id example_column_apply1 example_column_apply2  example_column_apply3
0   1                     1                     1                     10
1   2                     2                     2                      9
2   3                     3                     3                      8
3   4                     4                     4                      7
4   5                     5                     5                      6
5   6                     6                     6                      5
6   7                     7                     7                      4
7   8                     8                     8                      3
8   9                     9                     9                      2
9  10                    10      

Notice the difference between `column_apply` and `row_apply`: the `row_apply` function is
called 10 times (once per row), whereas the `column_apply` function is called only once,
with the whole column.