In [None]:
%%capture
# Install the Gretel Python client from the tip of the `main` branch on GitHub;
# the %%capture cell magic above keeps pip's output out of the notebook.
# NOTE(review): the install is unpinned (@main) -- consider pinning a tag or
# commit so the notebook is reproducible.
%pip install git+https://github.com/gretelai/gretel-python-client.git@main

In [None]:
from gretel_client.navigator_client import Gretel
from rich.console import Console

# Rich console handle (not referenced by the cells visible in this notebook).
console = Console()

# Gretel client used by the synthesis cell further down.
# NOTE(review): api_key="prompt" presumably asks for the key interactively
# instead of hard-coding it -- confirm against the client docs.
# NOTE(review): the endpoint host contains "dev"; confirm this is the intended
# environment before sharing the notebook.
gretel = Gretel(
    api_key="prompt",
    endpoint="https://api.dev.gretel.ai",
)

In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so the CSV below is reachable from the Colab runtime.
drive.mount("/content/drive")

# Input dataset location. A public sample is kept (commented out) as an
# alternative source that does not depend on Drive.
#ds = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/sample_data/sample-patient-events.csv"
ds = "/content/drive/My Drive/hipaa_patients.csv"
df = pd.read_csv(ds)

# Report the size of the frame, then show the first rows as the cell output.
print(f"Number of rows: {len(df)}")
df.head()

In [None]:
# Transform configuration, kept as YAML text and parsed with yaml.safe_load in
# the synthesis cell.
#
# * globals.classify -- turns classification on and lists the entity labels to
#   look for; globals.ner sets ner_threshold to 0.3; locale is en_US.
# * steps -- a single step that defines a per-run `row_seed` variable and a list
#   of row `update` rules. Each rule is selected by `column.entity` and skips
#   missing values via `not (this | isna)`. The final rule applies
#   `fake_entities` to text columns that have no entity assigned.
#
# Fix: the entity label was misspelled as `unique_identifer`; it is now
# `unique_identifier`.
#
# NOTE(review): `country`, `age`, `unique_identifier` and `date` are listed
# under classify.entities but no update rule in this step matches them, so
# columns with those entities are left unchanged by these rules -- confirm
# that is intended.
# NOTE: this is a regular (non-raw) Python string, so "\\d" below reaches the
# YAML text as \d.
hipaa_safe_config_yaml = """
globals:
  classify:
    enable: true
    entities:
      - first_name
      - last_name
      - name
      - street_address
      - city
      - state
      - postcode
      - country
      - address
      - latitude
      - longitude
      - coordinate
      - age
      - phone_number
      - fax_number
      - email
      - ssn
      - unique_identifier
      - medical_record_number
      - health_plan_beneficiary_number
      - account_number
      - certificate_license_number
      - vehicle_identifier
      - license_plate
      - device_identifier
      - biometric_identifier
      - url
      - ipv4
      - ipv6
      - date
  ner:
    ner_threshold: 0.3
  locales: [en_US]
steps:
  - vars:
      row_seed: random.random()
    rows:
      update:
        - condition: column.entity == "first_name" and not (this | isna)
          value: fake.persona(row_index=vars.row_seed + index).first_name
        - condition: column.entity == "last_name" and not (this | isna)
          value: fake.persona(row_index=vars.row_seed + index).last_name
        - condition: column.entity == "name" and not (this | isna)
          value: column.entity | fake
        - condition: (column.entity == "street_address" or column.entity == "city" or column.entity == "state" or column.entity == "postcode" or column.entity == "address") and not (this | isna)
          value: column.entity | fake
        - condition: column.entity == "latitude" and not (this | isna)
          value: fake.location_on_land()[0]
        - condition: column.entity == "longitude" and not (this | isna)
          value: fake.location_on_land()[1]
        - condition: column.entity == "coordinate" and not (this | isna)
          value: fake.location_on_land()
        - condition: column.entity == "email" and not (this | isna)
          value: fake.persona(row_index=vars.row_seed + index).email
        - condition: column.entity == "ssn" and not (this | isna)
          value: column.entity | fake
        - condition: column.entity == "phone_number" and not (this | isna)
          value: (fake.random_number(digits=3) | string) + "-" + (fake.random_number(digits=3) | string) + "-" + (fake.random_number(digits=4) | string)
        - condition: column.entity == "fax_number" and not (this | isna)
          value: (fake.random_number(digits=3) | string) + "-" + (fake.random_number(digits=3) |
            string) + "-" + (fake.random_number(digits=4) | string)
        - condition: column.entity == "vehicle_identifier" and not (this | isna)
          value: fake.vin()
        - condition: column.entity == "license_plate" and not (this | isna)
          value: column.entity | fake
        - condition: (column.entity == "medical_record_number" or column.entity == "health_plan_beneficiary_number" or column.entity == "account_number" or column.entity == "certificate_license_number" or column.entity == "device_identifier" or column.entity == "biometric_identifier") and not (this | isna)
          value: fake.bothify(re.sub("\\d", "#", re.sub("[A-Z]", "?", (this | string))))
        - condition: (column.entity == "url" or column.entity == "ipv4" or column.entity == "ipv6") and not (this | isna)
          value: column.entity | fake
        - condition: column.entity is none and column.type == "text"
          value: this | fake_entities
"""


In [None]:
# Settings passed to the "tabular_ft" synthesis step in the pipeline below.
#   train.params.num_input_records_to_sample -- set to 10000.
#   train.privacy_params.dp                  -- differential-privacy switch, off.
#
# Fix: `dp` was the string "false". A non-empty string is truthy in Python and
# only behaves as "off" if the receiving side coerces it; use a real boolean.
tabular_ft_config = {
    "train": {
        "params": {
            "num_input_records_to_sample": 10000
        },
        "privacy_params": {
            "dp": False
        }
    }
}


import yaml

# Build and launch the pipeline in one fluent chain:
#   1. use the loaded DataFrame as the data source,
#   2. apply the transform config (parsed from its YAML text),
#   3. synthesize 1000 records with the "tabular_ft" model and its config,
#   4. create the job.
synthetic_dataset = (
    gretel.safe_synthetic_dataset
    .from_data_source(df)
    .transform(yaml.safe_load(hipaa_safe_config_yaml))
    .synthesize("tabular_ft", tabular_ft_config, num_records=1000)
    .create()
)

In [None]:
# Preview the first rows of the final output dataset.
# (presumably `dataset.df` is a pandas DataFrame -- confirm against the client)
synthetic_dataset.dataset.df.head()

In [None]:
# Display the `table` attribute of the job's report as the cell output.
synthetic_dataset.report.table

In [None]:
# Inspect the output of the "transform" step on its own.
# NOTE(review): this displays the whole frame rather than a `.head()` preview.
synthetic_dataset.get_step_output("transform").df

In [None]:
import IPython

# Download the report as HTML, decode the bytes as UTF-8 and render the page
# inline; `isolated=True` is passed through as display metadata.
IPython.display.HTML(
    synthetic_dataset.download_report(format="html").read().decode("utf-8"),
    metadata={"isolated": True},
)