In [1]:
import random

import pandas as pd

# Seed Python's global RNG up front so anything in this kernel that draws from
# `random` starts from the same state on every Restart & Run All.
random.seed(1)

# Widen pandas' display limits so the example tables printed below are shown
# in full instead of being truncated or wrapped.
pd.set_option(
    "display.max_rows", 500,
    "display.max_columns", 500,
    "display.width", 1000,
)

# Additional Functionality

This document will highlight the extended functionality and some new functions of fabricator.

## Setting Seed
Sometimes we want the generated data to be exactly the same on every run. In such cases, we can set the `seed`
when instantiating the `MockDataGenerator` class. However, for end-to-end testing we recommend
not setting this seed, in case your pipeline logic unintentionally relies on the exact values or
ordering of rows; see this [page](https://hypothesis.readthedocs.io/en/latest/)
for reasons not to generate exactly the same data each time.

In [2]:
import yaml

# Spec for a single 10-row `accounts` table. `period_end_date` is derived row
# by row from `period_start_date` through a `row_apply` lambda that adds a
# random offset of 100-365 days drawn with `random.randint`.
yaml_string = """
     accounts:
       num_rows: 10
       columns:
         id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         random:
           type: numpy_random
           distribution: binomial
           n: 1
           p: 0.5
         member_id:
           type: generate_unique_id
           prefix: mem_
           id_start_range: 0
           id_end_range: 5000
           id_length: 10
         period_start_date:
           type: generate_dates
           start_dt: 2019-01-01
           end_dt: 2021-01-01
           freq: D
         period_end_date:
           type: row_apply
           list_of_values: accounts.period_start_date
           row_func: "lambda x: x + datetime.timedelta(days=random.randint(100, 365))"
    """
# Parse the YAML text into the Python object later handed to the generator as
# `instructions`.
config = yaml.safe_load(yaml_string)

Given the following config:

In [3]:
print(yaml_string)


     accounts:
       num_rows: 10
       columns:
         id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         random:
           type: numpy_random
           distribution: binomial
           n: 1
           p: 0.5
         member_id:
           type: generate_unique_id
           prefix: mem_
           id_start_range: 0
           id_end_range: 5000
           id_length: 10
         period_start_date:
           type: generate_dates
           start_dt: 2019-01-01
           end_dt: 2021-01-01
           freq: D
         period_end_date:
           type: row_apply
           list_of_values: accounts.period_start_date
           row_func: "lambda x: x + datetime.timedelta(days=random.randint(100, 365))"
    


In [4]:
from data_fabricator.v0.core.fabricator import MockDataGenerator

# The point of this section: `seed` is supplied when the generator is
# constructed, alongside the parsed YAML instructions.
mock_generator = MockDataGenerator(instructions=config, seed=1)
mock_generator.generate_all()

  from data_fabricator.v0.core.fabricator import (


## Generate Array columns
You can generate random array-type columns using the `generate_random_arrays` function. It lets you
generate the arrays from the `sample_values` provided. You can allow duplicates within an array using the `allow_duplicates` flag,
and set the `length` param to generate fixed-length arrays for the column.

Let's see how you can use this function in a real setting:

In [5]:
import yaml

# Spec for a 10-row `reps` table: a unique `rep_id` plus a `territory_ids`
# array column sampled from [1, 2, 3, 4], with `length: 2` and
# `allow_duplicates: False`.
yaml_string = """
    reps:
        num_rows: 10
        columns:
            rep_id:
                type: generate_unique_id
                prefix: hcp
                id_start_range: 1
                id_end_range: 201
            territory_ids:
                type: generate_random_arrays
                sample_values: [1, 2, 3, 4]
                allow_duplicates: False
                length: 2
"""
# Parse the YAML text into the Python object later handed to the generator as
# `instructions`.
config = yaml.safe_load(yaml_string)

Given the following config:

In [6]:
print(yaml_string)


    reps:
        num_rows: 10
        columns:
            rep_id:
                type: generate_unique_id
                prefix: hcp
                id_start_range: 1
                id_end_range: 201
            territory_ids:
                type: generate_random_arrays
                sample_values: [1, 2, 3, 4]
                allow_duplicates: False
                length: 2



Let's generate a column named `territory_ids` which keeps track of all the territories that are associated
with a particular Rep. Let's say each Rep can only have two territories assigned to them. Also, there can't
be duplicate territory ids per Rep, so we set `allow_duplicates` to False.


In [7]:
from data_fabricator.v0.core.fabricator import MockDataGenerator

# Build every table described by the parsed config.
mock_generator = MockDataGenerator(instructions=config)
mock_generator.generate_all()

# Pull out the generated `reps` table and show only its array column.
rep_df = mock_generator.all_dataframes["reps"]
print(rep_df.loc[:, "territory_ids"])

0    [2, 3]
1    [2, 4]
2    [3, 1]
3    [4, 3]
4    [1, 4]
5    [3, 1]
6    [3, 4]
7    [4, 3]
8    [2, 4]
9    [3, 4]
Name: territory_ids, dtype: object


## Generate Single date function
 `generate_single_date` is a callable to be used with `row_apply` that lets us derive another date column,
 such as a start or end date. It does not generate an entire array of values to be down- or upsampled later;
 it generates a single date based on the provided conditions.

 In the following example, we create an interaction date for every customer.
 Generally, an interaction happens after the customer account has been enrolled. So the condition we provide is that the
 interaction date must fall between the account enroll date and 14 days after it.


In [8]:
import yaml

# Two tables: `customers` (id + enrol date) and `interactions`, whose `acct_id`
# and `interaction_dt` are derived row by row from the customer columns.
# `interaction_dt` calls `generate_single_date` with the enrol date as
# `start_dt`, enrol date + 14 days as `end_dt`, and a fixed `random_seed`.
yaml_string = """
    customers:
       num_rows: 10
       columns:
         acct_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         acct_enroll_dt:
           type: generate_dates
           start_dt: 2019-01-01
           end_dt: 2020-12-31
           freq: B
           seed: 1

    interactions:
       num_rows: 10
       columns:
         interaction_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 2
         acct_id:
           type: row_apply
           list_of_values:
             - customers.acct_id
             - customers.acct_enroll_dt
           row_func: "lambda x, y: f'{x}'"
           seed: 1
         interaction_dt:
           type: row_apply
           list_of_values:
             - customers.acct_id
             - customers.acct_enroll_dt
           # Setting seed is not recommended for general use, please consider when to use seed
           row_func: "lambda x, y: generate_single_date(start_dt= y, end_dt= datetime.datetime.strptime(str(y.date()), '%Y-%m-%d') + datetime.timedelta(days=14), random_seed= 1)[0]"
           seed: 1
"""
# Parse the YAML text into the Python object later handed to the generator as
# `instructions`.
config = yaml.safe_load(yaml_string)

Given the following config:

In [9]:
print(yaml_string)


    customers:
       num_rows: 10
       columns:
         acct_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         acct_enroll_dt:
           type: generate_dates
           start_dt: 2019-01-01
           end_dt: 2020-12-31
           freq: B
           seed: 1

    interactions:
       num_rows: 10
       columns:
         interaction_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 2
         acct_id:
           type: row_apply
           list_of_values:
             - customers.acct_id
             - customers.acct_enroll_dt
           row_func: "lambda x, y: f'{x}'"
           seed: 1
         interaction_dt:
           type: row_apply
           list_of_values:
             - customers.acct_id
             - customers.acct_enroll_dt
           # Setting seed is not recommended for general use, please consider when to use seed
     

Let's generate a single interaction date for each customer:

In [10]:
from data_fabricator.v0.core.fabricator import MockDataGenerator

# No seed is passed to the generator itself; the fixed values in this example
# come from the `seed` / `random_seed` entries inside the YAML config. Seeding
# is not recommended for general use, so consider carefully when to use it.
mock_generator = MockDataGenerator(instructions=config)
mock_generator.generate_all()

# Only the last expression of a cell is auto-displayed, so both tables are
# simply bound to names here and the one of interest is printed explicitly.
# `customer_df` is reused by the enrolment-window join in a later cell.
customer_df = mock_generator.all_dataframes["customers"]
interactions_df = mock_generator.all_dataframes["interactions"]
print(interactions_df)

  interaction_id acct_id interaction_dt
0             00    0000     2019-04-04
1             01    0001     2019-05-18
2             02    0002     2019-06-21
3             03    0003     2019-07-14
4             04    0004     2019-10-31
5             05    0005     2020-01-04
6             06    0006     2020-06-29
7             07    0007     2020-10-09
8             08    0008     2020-11-09
9             09    0009     2020-12-13


Let's check that each interaction happens after the account was enrolled:
`acct_enroll_dt <= interaction_dt <= acct_enroll_dt + 14 days`

In [11]:
import pandas as pd

# Left join keeps every customer row and pairs it with its interaction.
joined_df = pd.merge(customer_df, interactions_df, on="acct_id", how="left")
print(joined_df)

# Enforce the stated window instead of relying on eyeballing the printout:
#   acct_enroll_dt <= interaction_dt <= acct_enroll_dt + 14 days
# Both columns are normalised with `pd.to_datetime` so the comparison works
# whether they hold `date` objects or timestamps. A customer with no matching
# interaction yields NaT, which fails the check.
enroll_dt = pd.to_datetime(joined_df["acct_enroll_dt"])
interaction_dt = pd.to_datetime(joined_df["interaction_dt"])
assert interaction_dt.between(enroll_dt, enroll_dt + pd.Timedelta(days=14)).all(), (
    "interaction_dt falls outside [acct_enroll_dt, acct_enroll_dt + 14 days]"
)

  acct_id acct_enroll_dt interaction_id interaction_dt
0    0000     2019-04-01             00     2019-04-04
1    0001     2019-05-15             01     2019-05-18
2    0002     2019-06-18             02     2019-06-21
3    0003     2019-07-11             03     2019-07-14
4    0004     2019-10-28             04     2019-10-31
5    0005     2020-01-01             05     2020-01-04
6    0006     2020-06-26             06     2020-06-29
7    0007     2020-10-06             07     2020-10-09
8    0008     2020-11-06             08     2020-11-09
9    0009     2020-12-10             09     2020-12-13


## Capture desirable rows from dataset

 `drop_filtered_condition_rows` is a callable to be used with `column_apply` that lets us keep only the desired subset of a dataset.
 The mechanism is to create a temp table with a flag column that identifies which rows to keep and which to remove,
 and then drop the rows that are not to be kept (flag column = false).

 In the following example, we keep only the savings accounts.


In [12]:
import yaml

# Three tables:
#   * `account`               - source table with id, type and start date.
#   * `_temp_savings_account` - copies id/start date from `account`, adds a
#                               `principal_amount` and a boolean `keep_row`
#                               flag that is True when acct_type == 'savings'.
#   * `savings_account`       - each column runs `drop_filtered_condition_rows`
#                               over the same four temp columns.
# NOTE(review): `position` appears to select which entry of `list_of_values`
# is returned (0 = acct_id, 1 = principal_amount, 3 = acct_start_dt) - confirm
# against the fabricator documentation.
yaml_string = """
    account:
       num_rows: 10
       columns:
         acct_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         acct_type:
           type: generate_values
           sample_values:
            - savings
            - current
         acct_start_dt:
           type: generate_dates
           start_dt: 2020-01-01
           end_dt: 2020-12-31
           freq: M
           seed: 1

    _temp_savings_account:
       num_rows: 10
       columns:
         acct_id:
           type: row_apply
           list_of_values: [account.acct_id, account.acct_start_dt]
           row_func: "lambda x,y: x"
         acct_start_dt:
           type: row_apply
           list_of_values: [account.acct_id, account.acct_start_dt]
           row_func: "lambda x,y: y"
         principal_amount:
           type: generate_random_numbers
           start_range: 30000
           end_range: 200000
         keep_row:
           type: row_apply
           list_of_values: account.acct_type
           row_func: "lambda x: x=='savings'"

    savings_account:
      columns:
         acct_id:
           type: column_apply
           list_of_values:
            - _temp_savings_account.acct_id
            - _temp_savings_account.principal_amount
            - _temp_savings_account.keep_row
            - _temp_savings_account.acct_start_dt
           column_func: drop_filtered_condition_rows
           column_func_kwargs:
              position: 0
         principal_amount:
           type: column_apply
           list_of_values:
            - _temp_savings_account.acct_id
            - _temp_savings_account.principal_amount
            - _temp_savings_account.keep_row
            - _temp_savings_account.acct_start_dt
           column_func: drop_filtered_condition_rows
           column_func_kwargs:
              position: 1
         acct_start_dt:
           type: column_apply
           list_of_values:
            - _temp_savings_account.acct_id
            - _temp_savings_account.principal_amount
            - _temp_savings_account.keep_row
            - _temp_savings_account.acct_start_dt
           column_func: drop_filtered_condition_rows
           column_func_kwargs:
              position: 3
"""
# Parse the YAML text into the Python object later handed to the generator as
# `instructions`.
config = yaml.safe_load(yaml_string)

Given the following config:

In [13]:
print(yaml_string)


    account:
       num_rows: 10
       columns:
         acct_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         acct_type:
           type: generate_values
           sample_values:
            - savings
            - current
         acct_start_dt:
           type: generate_dates
           start_dt: 2020-01-01
           end_dt: 2020-12-31
           freq: M
           seed: 1

    _temp_savings_account:
       num_rows: 10
       columns:
         acct_id:
           type: row_apply
           list_of_values: [account.acct_id, account.acct_start_dt]
           row_func: "lambda x,y: x"
         acct_start_dt:
           type: row_apply
           list_of_values: [account.acct_id, account.acct_start_dt]
           row_func: "lambda x,y: y"
         principal_amount:
           type: generate_random_numbers
           start_range: 30000
           end_range: 200000
         keep_row:
           type: row

Let's generate the savings account table:

In [14]:
from data_fabricator.v0.core.fabricator import MockDataGenerator

# The generator is built without a seed of its own. Seeding is not recommended
# for general use, so consider carefully when to use it.
mock_generator = MockDataGenerator(instructions=config)
mock_generator.generate_all()

# Fetch the source table, the intermediate flag table and the filtered result.
account_df = mock_generator.all_dataframes["account"]
temp_savings_account_df = mock_generator.all_dataframes["_temp_savings_account"]
savings_account_df = mock_generator.all_dataframes["savings_account"]

# One table per line block, in pipeline order: source -> temp -> filtered.
print(account_df, temp_savings_account_df, savings_account_df, sep="\n")

  acct_id acct_type acct_start_dt
0    0000   current    2020-01-31
1    0001   current    2020-02-29
2    0002   savings    2020-03-31
3    0003   savings    2020-04-30
4    0004   current    2020-05-31
5    0005   savings    2020-06-30
6    0006   savings    2020-07-31
7    0007   savings    2020-09-30
8    0008   savings    2020-10-31
9    0009   current    2020-11-30
  acct_id acct_start_dt  principal_amount  keep_row
0    0000    2020-01-31      45956.129752     False
1    0001    2020-02-29      34819.071009     False
2    0002    2020-03-31     172080.067666      True
3    0003    2020-04-30     103570.401544      True
4    0004    2020-05-31     159587.614018     False
5    0005    2020-06-30       30358.02907      True
6    0006    2020-07-31     105715.822989      True
7    0007    2020-09-30     152661.805498      True
8    0008    2020-10-31      68889.577616      True
9    0009    2020-11-30     190696.018244     False
  acct_id  principal_amount acct_start_dt
0    0002   

## Generate and capture conditional weight values.
 This functionality lets us generate a column whose values are drawn using weights that depend on the value of another column.

 In the following example, we generate a channel code for each interaction, with weights that depend on the product code.


In [15]:
import yaml

# Two tables: `customers` (id + faker-generated name with a fixed
# `faker_seed`) and `interactions`. `channel_cd` is produced by
# `conditional_generate_from_weights`, with `dependent_weights` keyed by the
# `product_cd` values (R03AC02 -> a/b, R03AC03 -> c/d).
yaml_string = """
     customers:
        num_rows: 10
        columns:
            hcp_id:
                type: generate_unique_id
                prefix: hcp
                id_start_range: 0
                id_end_range: 10
            hcp_name:
                type: faker
                provider: name
                # Setting seed is not recommended for general use, please consider when to use seed
                faker_seed: 1

     interactions:
        columns:
            hcp_id:
                type: row_apply
                list_of_values: customers.hcp_id
                row_func: "lambda x : x"
            product_cd:
                type: generate_values
                sample_values: ["R03AC02","R03AC03"]
            channel_cd:
                type: row_apply
                list_of_values: interactions.product_cd
                row_func: conditional_generate_from_weights
                row_func_kwargs:
                    dependent_weights:
                        R03AC02:
                            a: 10
                            b: 1
                        R03AC03:
                            c: 6
                            d: 5
"""
# Parse the YAML text into the Python object later handed to the generator as
# `instructions`.
config = yaml.safe_load(yaml_string)

Given the following config:

In [16]:
print(yaml_string)


     customers:
        num_rows: 10
        columns:
            hcp_id:
                type: generate_unique_id
                prefix: hcp
                id_start_range: 0
                id_end_range: 10
            hcp_name:
                type: faker
                provider: name
                # Setting seed is not recommended for general use, please consider when to use seed
                faker_seed: 1

     interactions:
        columns:
            hcp_id:
                type: row_apply
                list_of_values: customers.hcp_id
                row_func: "lambda x : x"
            product_cd:
                type: generate_values
                sample_values: ["R03AC02","R03AC03"]
            channel_cd:
                type: row_apply
                list_of_values: interactions.product_cd
                row_func: conditional_generate_from_weights
                row_func_kwargs:
                    dependent_weights:
                        R03AC02:
       

Let's generate channel codes for each interaction based on the weights provided:

In [17]:
from data_fabricator.v0.core.fabricator import MockDataGenerator

# The generator is built without a seed of its own. Seeding is not recommended
# for general use, so consider carefully when to use it.
mock_generator = MockDataGenerator(instructions=config)
mock_generator.generate_all()

# Grab both generated tables, then show customers followed by interactions.
customers_df = mock_generator.all_dataframes["customers"]
interactions_df = mock_generator.all_dataframes["interactions"]

print(customers_df, interactions_df, sep="\n")

  hcp_id          hcp_name
0   hcp0    Ryan Gallagher
1   hcp1          Jon Cole
2   hcp2      Rachel Davis
3   hcp3  Russell Reynolds
4   hcp4     April Griffin
5   hcp5    Crystal Landry
6   hcp6    Amanda Johnson
7   hcp7      Teresa James
8   hcp8    Javier Johnson
9   hcp9   Jeffrey Simpson
  hcp_id product_cd channel_cd
0   hcp0    R03AC03          c
1   hcp1    R03AC02          a
2   hcp2    R03AC02          a
3   hcp3    R03AC03          c
4   hcp4    R03AC03          c
5   hcp5    R03AC02          a
6   hcp6    R03AC02          a
7   hcp7    R03AC02          a
8   hcp8    R03AC02          a
9   hcp9    R03AC02          a


## Pass relative parameters in explode
 This functionality lets us pass a column as the start or end date parameter of the `explode` function.
 For each input row, it generates an array of values within that row's start and end date range.

 For example, in retail banking, a customer is expected to have a monthly transaction statement for each of their accounts,
 but these statements should only exist from the start date of the account onwards, not before.

 In the following example, we create transaction dates for every account.
 The condition we provide is that transactions only happen on or after the date the account was created/started.


In [18]:
import yaml

# `accounts` holds an id and a start date; `transaction_statements` explodes
# each account into 2 rows using `generate_dates`. Both exploded columns pass
# `list_of_values[1]` (the account's `acct_start_dt`) as `start_dt`, so the
# generated dates are bounded below by that account's own start date.
# NOTE(review): `position` presumably picks which column of the exploded
# result is kept (0 = acct_id, 2 = the generated date) - confirm against the
# fabricator documentation.
yaml_string = """
     accounts:
       num_rows: 10
       columns:
         acct_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         acct_start_dt:
           type: generate_dates
           start_dt: 2020-01-01
           end_dt: 2020-12-31
           freq: M
           seed: 1

     transaction_statements:
       num_rows: 20
       columns:
         transaction_id:
           type: generate_unique_id
           id_start_range: 00
           id_end_range: 20
           id_length: 6
         acct_id:
           type: explode
           list_of_values:
             - accounts.acct_id
             - accounts.acct_start_dt
           explode_func: generate_dates
           explode_func_kwargs:
            start_dt: list_of_values[1]
            end_dt: 2020-12-31
            freq: M
            num_rows: 2
           position: 0
         transaction_dt:
           type: explode
           list_of_values:
             - accounts.acct_id
             - accounts.acct_start_dt
           explode_func: generate_dates
           explode_func_kwargs:
            start_dt: list_of_values[1]
            end_dt: 2020-12-31
            freq: M
            num_rows: 2
           position: 2
"""
# Parse the YAML text into the Python object later handed to the generator as
# `instructions`.
config = yaml.safe_load(yaml_string)

Given the following config:

In [19]:
print(yaml_string)


     accounts:
       num_rows: 10
       columns:
         acct_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         acct_start_dt:
           type: generate_dates
           start_dt: 2020-01-01
           end_dt: 2020-12-31
           freq: M
           seed: 1

     transaction_statements:
       num_rows: 20
       columns:
         transaction_id:
           type: generate_unique_id
           id_start_range: 00
           id_end_range: 20
           id_length: 6
         acct_id:
           type: explode
           list_of_values:
             - accounts.acct_id
             - accounts.acct_start_dt
           explode_func: generate_dates
           explode_func_kwargs:
            start_dt: list_of_values[1]
            end_dt: 2020-12-31
            freq: M
            num_rows: 2
           position: 0
         transaction_dt:
           type: explode
           list_of_values:
             - accoun

Let's generate transaction dates for each account:

In [20]:
from data_fabricator.v0.core.fabricator import MockDataGenerator

# The generator is built without a seed of its own. Seeding is not recommended
# for general use, so consider carefully when to use it.
mock_generator = MockDataGenerator(instructions=config)
mock_generator.generate_all()

# Grab the parent table and its exploded child, then show them in that order.
accounts_df = mock_generator.all_dataframes["accounts"]
transactions_df = mock_generator.all_dataframes["transaction_statements"]

print(accounts_df, transactions_df, sep="\n")

  acct_id acct_start_dt
0    0000    2020-01-31
1    0001    2020-02-29
2    0002    2020-03-31
3    0003    2020-04-30
4    0004    2020-05-31
5    0005    2020-06-30
6    0006    2020-07-31
7    0007    2020-09-30
8    0008    2020-10-31
9    0009    2020-11-30
   transaction_id acct_id transaction_dt
0          000000    0000     2020-07-31
1          000001    0000     2020-11-30
2          000002    0001     2020-05-31
3          000003    0001     2020-08-31
4          000004    0002     2020-03-31
5          000005    0002     2020-11-30
6          000006    0003     2020-07-31
7          000007    0003     2020-11-30
8          000008    0004     2020-09-30
9          000009    0004     2020-12-31
10         000010    0005     2020-07-31
11         000011    0005     2020-08-31
12         000012    0006     2020-08-31
13         000013    0006     2020-12-31
14         000014    0007     2020-10-31
15         000015    0007     2020-12-31
16         000016    0008     2020-10-3

Let's check that every transaction happens on or after the date the account was created:
`acct_start_dt <= transaction_dt`

In [21]:
import pandas as pd

# Left join keeps every account row and pairs it with each of its statements.
joined_df = pd.merge(accounts_df, transactions_df, on="acct_id", how="left")
print(joined_df)

# Enforce the stated invariant instead of relying on eyeballing the printout:
#   acct_start_dt <= transaction_dt
# Both columns are normalised with `pd.to_datetime` so the comparison works
# whether they hold `date` objects or timestamps. An account with no matching
# statement yields NaT, which fails the check.
start_dt = pd.to_datetime(joined_df["acct_start_dt"])
transaction_dt = pd.to_datetime(joined_df["transaction_dt"])
assert (transaction_dt >= start_dt).all(), (
    "found a transaction_dt earlier than its account's acct_start_dt"
)

   acct_id acct_start_dt transaction_id transaction_dt
0     0000    2020-01-31         000000     2020-07-31
1     0000    2020-01-31         000001     2020-11-30
2     0001    2020-02-29         000002     2020-05-31
3     0001    2020-02-29         000003     2020-08-31
4     0002    2020-03-31         000004     2020-03-31
5     0002    2020-03-31         000005     2020-11-30
6     0003    2020-04-30         000006     2020-07-31
7     0003    2020-04-30         000007     2020-11-30
8     0004    2020-05-31         000008     2020-09-30
9     0004    2020-05-31         000009     2020-12-31
10    0005    2020-06-30         000010     2020-07-31
11    0005    2020-06-30         000011     2020-08-31
12    0006    2020-07-31         000012     2020-08-31
13    0006    2020-07-31         000013     2020-12-31
14    0007    2020-09-30         000014     2020-10-31
15    0007    2020-09-30         000015     2020-12-31
16    0008    2020-10-31         000016     2020-10-31
17    0008

## Sorting data functionality in `generate_dates/generate_values`
  The sorting flag (`sort_dates` in the example below) controls whether the generated values are sorted.
  It is useful because not all dates/values should be sequential in pseudo-real data.

  For example: the join dates of clients can be sequential, since their IDs would be
  sequential as well. But the close dates of a list of accounts can be entirely random.

 In the following example, we create an account start date and end date for each account; the end date sets `sort_dates: False`.


In [22]:
import yaml

# Single 10-row `accounts` table with two `generate_dates` columns that share
# the same frequency and seed. Only `acct_end_dt` sets `sort_dates: False`;
# `acct_start_dt` leaves the option at its default.
yaml_string = """
     accounts:
       num_rows: 10
       columns:
         acct_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         acct_start_dt:
           type: generate_dates
           start_dt: 2019-01-01
           end_dt: 2019-12-31
           freq: M
           seed: 1
         acct_end_dt:
           type: generate_dates
           start_dt: 2020-01-01
           end_dt: 2020-12-31
           freq: M
           seed: 1
           sort_dates: False
"""
# Parse the YAML text into the Python object later handed to the generator as
# `instructions`.
config = yaml.safe_load(yaml_string)

Given the following config:

In [23]:
print(yaml_string)


     accounts:
       num_rows: 10
       columns:
         acct_id:
           type: generate_unique_id
           id_start_range: 0
           id_end_range: 10
           id_length: 4
         acct_start_dt:
           type: generate_dates
           start_dt: 2019-01-01
           end_dt: 2019-12-31
           freq: M
           seed: 1
         acct_end_dt:
           type: generate_dates
           start_dt: 2020-01-01
           end_dt: 2020-12-31
           freq: M
           seed: 1
           sort_dates: False



Let's generate account start and end dates for each account:

In [24]:
from data_fabricator.v0.core.fabricator import MockDataGenerator

# The generator is built without a seed of its own. Seeding is not recommended
# for general use, so consider carefully when to use it.
mock_generator = MockDataGenerator(instructions=config)
# Build every table described by the parsed config.
mock_generator.generate_all()

# Show the generated table so the two date columns can be compared.
accounts_df = mock_generator.all_dataframes["accounts"]
print(accounts_df)

  acct_id acct_start_dt acct_end_dt
0    0000    2019-01-31  2020-03-31
1    0001    2019-02-28  2020-10-31
2    0002    2019-03-31  2020-02-29
3    0003    2019-04-30  2020-05-31
4    0004    2019-05-31  2020-11-30
5    0005    2019-06-30  2020-04-30
6    0006    2019-07-31  2020-07-31
7    0007    2019-09-30  2020-06-30
8    0008    2019-10-31  2020-09-30
9    0009    2019-11-30  2020-01-31


## Generate Cross Product

 `cross_product` is a callable to be used with `column_apply` that lets us generate a dataset
 that is the cross product of all the values in the columns passed to it from tables created in the configuration.

 In the following example, we generate all possible combinations of customer ID,
 product ID and date:

In [25]:
import yaml

# Three small source tables (`customers`: 3 rows, `products`: 2 rows,
# `dates`: 2 rows) and a `cross_product` table. Each of its columns runs the
# `cross_product` column function over the same three input columns, with
# `check_all_inputs_same_length: False` because the inputs differ in length.
# NOTE(review): `position` appears to select which input column is returned
# (0 = customer_id, 1 = product_id, 2 = date) - confirm against the fabricator
# documentation.
# NOTE(review): only `product_id` sets `resize: True`; confirm whether that is
# intentional or whether the other two columns should match.
yaml_string = """
            customers:
              num_rows: 3
              columns:
                customer_id:
                  type: generate_unique_id
                  prefix: customer_

            products:
              num_rows: 2
              columns:
                product_id:
                  type: generate_unique_id
                  prefix: product_

            dates:
              num_rows: 2
              columns:
                date:
                  type: generate_dates
                  start_dt: 2020-01-01
                  end_dt: 2020-01-02
                  freq: D

            cross_product:
              columns:
                customer_id:
                  type: column_apply
                  check_all_inputs_same_length: False
                  list_of_values:
                    - customers.customer_id
                    - products.product_id
                    - dates.date
                  column_func: cross_product
                  column_func_kwargs:
                    position: 0
                product_id:
                  type: column_apply
                  resize: True
                  check_all_inputs_same_length: False
                  list_of_values:
                    - customers.customer_id
                    - products.product_id
                    - dates.date
                  column_func: cross_product
                  column_func_kwargs:
                    position: 1
                date:
                  type: column_apply
                  check_all_inputs_same_length: False
                  list_of_values:
                    - customers.customer_id
                    - products.product_id
                    - dates.date
                  column_func: cross_product
                  column_func_kwargs:
                    position: 2
"""
# Parse the YAML text into the Python object later handed to the generator as
# `instructions`.
config = yaml.safe_load(yaml_string)

Given the following config:

In [26]:
print(yaml_string)


            customers:
              num_rows: 3
              columns:
                customer_id:
                  type: generate_unique_id
                  prefix: customer_

            products:
              num_rows: 2
              columns:
                product_id:
                  type: generate_unique_id
                  prefix: product_

            dates:
              num_rows: 2
              columns:
                date:
                  type: generate_dates
                  start_dt: 2020-01-01
                  end_dt: 2020-01-02
                  freq: D

            cross_product:
              columns:
                customer_id:
                  type: column_apply
                  check_all_inputs_same_length: False
                  list_of_values:
                    - customers.customer_id
                    - products.product_id
                    - dates.date
                  column_func: cross_product
                  column_func_kwargs:
  

Let's generate the cross product for the above config:

In [27]:
from data_fabricator.v0.core.fabricator import MockDataGenerator

# Build every table described by the parsed config.
mock_generator = MockDataGenerator(instructions=config)
mock_generator.generate_all()

# Bind each generated table exactly once: the three inputs first, then the
# cross product built from them. The following cells print them one by one.
customer_df = mock_generator.all_dataframes["customers"]
products_df = mock_generator.all_dataframes["products"]
dates_df = mock_generator.all_dataframes["dates"]
cross_prod = mock_generator.all_dataframes["cross_product"]

Here are the customer IDs:

In [28]:
print(customer_df)

  customer_id
0  customer_1
1  customer_2
2  customer_3


Here are the product IDs:

In [29]:
print(products_df)

  product_id
0  product_1
1  product_2


Here are the dates:

In [30]:
print(dates_df)

        date
0 2020-01-01
1 2020-01-02


Here is the cross product:

In [31]:
print(cross_prod)

   customer_id product_id       date
0   customer_1  product_1 2020-01-01
1   customer_1  product_1 2020-01-02
2   customer_1  product_2 2020-01-01
3   customer_1  product_2 2020-01-02
4   customer_2  product_1 2020-01-01
5   customer_2  product_1 2020-01-02
6   customer_2  product_2 2020-01-01
7   customer_2  product_2 2020-01-02
8   customer_3  product_1 2020-01-01
9   customer_3  product_1 2020-01-02
10  customer_3  product_2 2020-01-01
11  customer_3  product_2 2020-01-02
