In [1]:
import datetime
import pandas as pd
import numpy as np
from faker import Faker

from exhibit import exhibit as xbt
from exhibit.core.spec import Spec, UUIDColumn, CategoricalColumn, DateColumn

### User-defined functions with custom logic to generate categorical and date values
When using Exhibit as an importable library and building the specification using Python objects, you can use custom functions in place of the `anonymising_set` attribute for categorical and date columns. This feature provides a lot of flexibility when it comes to data generation and is a quick and easy way to augment the dataset without adding custom ML models. Custom functions have only two restrictions: they must define an argument for the dataset row (even if it's unused) and they must return a single value. 

#### Basic example using dates

In [2]:
def increment_date(row):
    '''
    Basic function to increase a date by a random whole number of days
    between 1 and 10 (both ends inclusive).

    Parameters
    ----------
    row : pd.Series
        the anonymising_set function returns one value at a time
        and has access to the current row in the DF generated so far.
        This argument is mandatory to include, even if it's unused.

    Returns
    ----------
    Scalar value
        the value of row["date"] shifted forward by the random offset.
    '''

    rng = np.random.default_rng()

    # note the use of row argument to get the value in the date column that was already in the generated dataset
    # columns are generated in the order they appear (or are added) in the spec so if we reversed the order, we
    # wouldn't have been able to access the "date" column. Similarly, numerical and geo columns are generated after
    # the categorical ones so those values are also not yet available at this point.
    cur_date = row["date"]

    # Generator.integers excludes the upper bound by default, which would make
    # the largest possible offset 9; endpoint=True makes 10 reachable so the
    # behaviour matches the documented 1-10 range.
    offset_days = int(rng.integers(1, 10, endpoint=True))

    return cur_date + datetime.timedelta(days=offset_days)

In [3]:
# Obtain a specification dictionary from Spec.generate(); the lines below fill it in.
spec = Spec()
spec_dict = spec.generate()

# Dataset-level metadata entries: number of rows, the names of the date columns and the spec id.
spec_dict["metadata"]["number_of_rows"] = 100
spec_dict["metadata"]["date_columns"] = ["date", "future_date"]
spec_dict["metadata"]["id"] = "main"

# "date" is added before "future_date" on purpose: increment_date reads row["date"],
# so that column must already be part of the row when future_date is generated.
spec_dict["columns"]["date"]        = DateColumn("date", uniques=200, from_date="2023-01-01", cross_join=False)
spec_dict["columns"]["future_date"] = DateColumn("future_date", uniques=200, cross_join=False, anonymising_set=increment_date)

# Pass the in-memory spec to Exhibit (command="fromspec") and request the result as a dataframe.
exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe")
anon_df = exhibit_data.generate()

In [4]:
# Bare last expression: the generated dataframe is rendered as this cell's output.
anon_df

Unnamed: 0,date,future_date
0,2023-02-20,2023-02-28
1,2023-01-17,2023-01-20
2,2023-02-25,2023-03-04
3,2023-01-09,2023-01-14
4,2023-02-15,2023-02-21
...,...,...
95,2023-01-06,2023-01-13
96,2023-03-17,2023-03-21
97,2023-06-22,2023-06-27
98,2023-03-13,2023-03-20


#### Using external libraries to generate realistic data
Faker is a well-known library that generates fake data for testing purposes. It has a number of providers and fake datasets. In this example, we'll use Faker to generate the name and address details and augment them with a unique ID and a smoker attribute using Exhibit.

In [5]:
# Notebook-level Faker instance; the fake_name / fake_address helpers call its methods.
fake = Faker()
# Sample call to show the kind of value Faker returns for a name.
fake.name()

'Arthur Washington'

In [6]:
# remember that we need to include a function argument for the dataset row, even if it's unused.
# Under the hood, Exhibit will attempt to pass the function to Pandas' apply, so if you don't include
# a placeholder argument, you will get an error.
def fake_name(_):
    '''
    Produce a single name by calling fake.name().

    The only argument is the placeholder for the dataset row; it is
    accepted but never read.
    '''
    generated_name = fake.name()
    return generated_name

def fake_address(_):
    '''
    Produce a single address by calling fake.address().

    The only argument is the placeholder for the dataset row; it is
    accepted but never read.
    '''
    generated_address = fake.address()
    return generated_address

In [7]:
# Obtain a fresh specification dictionary; spec and spec_dict are rebound here.
spec = Spec()
spec_dict = spec.generate()

# Dataset-level metadata entries: number of rows, the uuid column, the categorical columns and the spec id.
spec_dict["metadata"]["number_of_rows"] = 100
spec_dict["metadata"]["uuid_columns"] = ["id"]
spec_dict["metadata"]["categorical_columns"] = ["name", "address", "smoker"]
spec_dict["metadata"]["id"] = "main"

# Values for the smoker column paired with their probabilities (the listed probabilities sum to 1;
# "Missing Data" is given 0).
smoker_data = pd.DataFrame(data={
    "smoker":             ["Y", "N", "No Answer", "Missing Data"],
    "probability_vector": [0.2, 0.7, 0.1, 0]
})

# name and address get their values from the Faker-backed functions instead of original values.
# NOTE(review): miss_proba=0.1 presumably leaves roughly 10% of addresses empty (blank cells are
# visible in the rendered output) - confirm against the Exhibit documentation.
# NOTE(review): smoker uses uniques=3 although smoker_data has four rows; presumably the
# "Missing Data" row is not counted - confirm.
spec_dict["columns"]["id"]      = UUIDColumn(anon_set="range")
spec_dict["columns"]["name"]    = CategoricalColumn("name", uniques=100, original_values=None, anon_set=fake_name)
spec_dict["columns"]["address"] = CategoricalColumn("address", uniques=100, original_values=None, anon_set=fake_address, miss_proba=0.1)
spec_dict["columns"]["smoker"]  = CategoricalColumn("smoker", uniques=3, original_values=smoker_data)

# Pass the in-memory spec to Exhibit (command="fromspec") and request the result as a dataframe.
exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe")
anon_df = exhibit_data.generate()

In [8]:
# Bare last expression: the generated dataframe is rendered as this cell's output.
anon_df

Unnamed: 0,id,name,address,smoker
0,22,Mckenzie Cruz,"762 Baker Point\nPort Kevin, MN 42282",N
1,64,Michael Williams,"481 Madison Fords\nNew Donnaview, CO 27959",N
2,11,Jeanne Smith,,Y
3,46,Joanna Franklin,,No Answer
4,82,Daniel Martinez,"479 Jean Falls Suite 185\nDeanbury, WV 72875",N
...,...,...,...,...
95,5,Jack Harrison,Unit 0802 Box 5382\nDPO AP 08329,N
96,40,Courtney Sanchez,"04326 Wallace Circles\nNorth Anthonybury, IN 8...",N
97,56,Thomas Anderson,"28821 Clark Drive Apt. 170\nPort John, CO 44092",N
98,76,Julie Flowers,"74248 Ball Land Apt. 027\nPowersfurt, RI 70556",N
