# Example Transform V2 Configuration

In this blueprint, we will use Gretel's Transform V2 model to apply a variety of Privacy Enhancing Techniques (PETs) to a dataset.


## 💾  Install Gretel SDK

In [2]:
%%capture

!pip install -U gretel-client

## 🌐 Configure your Gretel Session


In [3]:
# Specify your Gretel API key

from gretel_client import configure_session

configure_session(api_key="prompt",
                  cache="yes",
                  validate=True
                  )


Gretel API Key: ··········
Caching Gretel config to disk.
Using endpoint https://api.gretel.cloud
Logged in as carlyn@gretel.ai ✅


## 🔬 Preview input data

In [18]:
import pandas as pd
from io import StringIO

# Set the option to display more columns
pd.set_option('display.max_columns', None)

# Set the option to display wide columns
pd.set_option('max_colwidth', None)

# CSV data
csv_data = """first_name,last_name,full_name,suffix,date_of_birth,ssn,social_security_number,minimum_payment_amount,credit_limit,marital_status,full_address,address,city,postal_code,state,phone_number,email,loan_application_id,loan_id,bank_account_number,vehicle_identifier,license_plate,device_serial_number,url,ip_address,image,unique_identifier
John,Smith,John Smith,III,2010-02-16,123-45-6789,123-45-6789,500,5000.00,single,"48764 Howard Forge Apt. 421\nVanessaside, VT 79393",48764 Howard Forge Apt. 421,Vanessaside,79393,VT,123-456-7890,john.smith@gmail.com,A-1234,ABC12345,MYNB48764759382421,RT3GZYSKXXNDZ9J97,487-YNB,A1B2-C3D4-E5F6-7890,https://green.info/,171.174.170.81,https://picsum.photos/788/861,e3e70682-c209-4cac-a29f-6fbed82c07cd
Ada,Lovelace,Ada Lovelace,,1815-12-10,012-34-5678,012-34-5678,1000,10000.00,married,"778 Brown Plaza\nNorth Jenniferfurt, VT 88077",778 Brown Plaza,Jenniferfurt,88077,VT,234-567-8901,ada.lovelace@gmail.com,B-2345,DEF67890,TZIR92411578156593,G0K75MX77NULDXVG4,ZJ9 P3L,1A2B-3C4D-5E6F-7A8B,https://howard-snow.com/,95.25.112.121,https://dummyimage.com/530x995,f728b4fa-4248-4e3a-8a5d-2f346baa9455
Eric,Jones,Eric Jones,PhD,1999-07-05,234-56-7890,234-56-7890,1500,25000.00,married,"513 John Divide Suite 115\nRodriguezside, LA 93111",513 John Divide Suite 11,Rodriguezside,93111,LA,345-678-9012,eric.jones@yahoo.com,C-3456,HIJ09876,RPOQ40801609753513,16S1YMFL25CEF0V66,194-EJEY,9F8E-7D6C-5B4A-3C2D,http://campos.com/,51.105.121.194,https://dummyimage.com/621x976,eb1167b3-67a9-4378-bc65-c1e582e2e662
Grace,Hopper,Grace Hopper,,1906-12-09,456-78-9012,456-78-9012,2000,50000.00,single,"98 Wallace Ranch Suite 593\nIvanburgh, AZ 80818",98 Wallace Ranch Suite 593,Ivanburgh,80818,AZ,456-789-0123,grace.hopper@gmail.com,D-4567,KLM54321,SHHZ28711587148418,LZY7KJ8M0DJV6RLFJ,WZT-241,4E5F-6A7B-8C9D-0E1F,http://blair.com/,195.110.164.126,https://dummyimage.com/447x285,f7c1bd87-4da5-4709-9471-3d60c8a70639
"""

# Use StringIO to simulate reading from a file
data = StringIO(csv_data)

# Create DataFrame
df = pd.read_csv(data)

# Display DataFrame
df.head()

Unnamed: 0,first_name,last_name,full_name,suffix,date_of_birth,ssn,social_security_number,minimum_payment_amount,credit_limit,marital_status,full_address,address,city,postal_code,state,phone_number,email,loan_application_id,loan_id,bank_account_number,vehicle_identifier,license_plate,device_serial_number,url,ip_address,image,unique_identifier
0,John,Smith,John Smith,III,2010-02-16,123-45-6789,123-45-6789,500,5000.0,single,"48764 Howard Forge Apt. 421\nVanessaside, VT 79393",48764 Howard Forge Apt. 421,Vanessaside,79393,VT,123-456-7890,john.smith@gmail.com,A-1234,ABC12345,MYNB48764759382421,RT3GZYSKXXNDZ9J97,487-YNB,A1B2-C3D4-E5F6-7890,https://green.info/,171.174.170.81,https://picsum.photos/788/861,e3e70682-c209-4cac-a29f-6fbed82c07cd
1,Ada,Lovelace,Ada Lovelace,,1815-12-10,012-34-5678,012-34-5678,1000,10000.0,married,"778 Brown Plaza\nNorth Jenniferfurt, VT 88077",778 Brown Plaza,Jenniferfurt,88077,VT,234-567-8901,ada.lovelace@gmail.com,B-2345,DEF67890,TZIR92411578156593,G0K75MX77NULDXVG4,ZJ9 P3L,1A2B-3C4D-5E6F-7A8B,https://howard-snow.com/,95.25.112.121,https://dummyimage.com/530x995,f728b4fa-4248-4e3a-8a5d-2f346baa9455
2,Eric,Jones,Eric Jones,PhD,1999-07-05,234-56-7890,234-56-7890,1500,25000.0,married,"513 John Divide Suite 115\nRodriguezside, LA 93111",513 John Divide Suite 11,Rodriguezside,93111,LA,345-678-9012,eric.jones@yahoo.com,C-3456,HIJ09876,RPOQ40801609753513,16S1YMFL25CEF0V66,194-EJEY,9F8E-7D6C-5B4A-3C2D,http://campos.com/,51.105.121.194,https://dummyimage.com/621x976,eb1167b3-67a9-4378-bc65-c1e582e2e662
3,Grace,Hopper,Grace Hopper,,1906-12-09,456-78-9012,456-78-9012,2000,50000.0,single,"98 Wallace Ranch Suite 593\nIvanburgh, AZ 80818",98 Wallace Ranch Suite 593,Ivanburgh,80818,AZ,456-789-0123,grace.hopper@gmail.com,D-4567,KLM54321,SHHZ28711587148418,LZY7KJ8M0DJV6RLFJ,WZT-241,4E5F-6A7B-8C9D-0E1F,http://blair.com/,195.110.164.126,https://dummyimage.com/447x285,f7c1bd87-4da5-4709-9471-3d60c8a70639


## 󠁘🟰 Define TV2 Configuration

In [23]:
# Example Tranform V2 configuration
config = """schema_version: "1.0"
name: "my-tv2-config"
models:
  - transform_v2:
      globals:
        classify:
          enable: false
          entities: []
        locales: [en_US]
      steps:
        - rows:
            update:
              # (1) Replace first name
              #     Replaces a first name with a synthetic first name.
              #     E.g. Andrew -> Aamir
              - name: first_name
                value: fake.first_name()
              # (2) Replace last name
              #     Replaces a last name with a synthetic last name.
              #     E.g. Johnson -> Khan
              - name: last_name
                value: fake.last_name()
              # (3) Replace full name consistently
              #     Replaces full name with syntehtic name dervied from transformed first and last name columns for consistency.
              #     E.g. Andrew Johnson -> Aamir Khan
              - name: full_name
                value: row.first_name ~ " " ~ row.last_name # if consistency is not requried, use value: fake.name()
              # (4) Replace nullable value while retaining nulls.
              #     Replaces value only if not null. Nulls are retained.
              #     E.g. PhD -> III
              #.    E.g. null -> null
              - condition: column.name == "suffix" and not (this | isna) # if suffix is not null, replace with synthetic suffix
                value: fake.suffix()
              # (5) Shift date
              #     Randomly shifts a date within a specified interval.
              #     E.g. 02/01/2000 -> 01/29/2000
              - name: date_of_birth
                value: this | date_shift("-7d", "+7d")
              # (6) Trim value
              #     Trim value in field to specific length.
              #     E.g. 123-45-6789 -> 6789
              - name: ssn
                value: this[-4:]
              # (7) Replace social security number
              #     Replace social security number with synthetic social security number.
              #     E.g. 123-45-6789 -> 456-78-9012
              - name: social_security_number
                value: fake.ssn()
              # (8) Redact partially
              #     Redact a portion of a value.
              #     E.g. 1234-5678-9012 -> 1234-XXXX-9012
              - name: minimum_payment_amount
                value: fake.pyint(1, 99999)
              # (9) Replace with random float
              #      Replaces the value with a random float (decimal) value within a specified range.
              #      E.g. 10000.41 -> 11000.23
              - name: credit_limit
                value: fake.pyfloat(right_digits=2, min_value=1.00, max_value=99999.00)
              # (10) Redact
              #      Redacts value
              #      E.g. married -> XXXXX
              - name: marital_status
                value: "XXXXX"
              # (11) Replace full address
              #      Replaces full address with synthetic full address.
              #      E.g. 48764 Howard Forge Apt. 421 Vanessaside, VT 79393 -> 778 Brown Plaza North Jenniferfurt, VT 88077
              - name: full_address
                value: fake.address()
              # (12) Replace street address
              #      Replaces street address with a street synthetic address.
              #      E.g. 487 Hull Village Suite 759 -> 242 Christine Glen
              - name: address
                value: fake.street_address()
              # (13) Replace city
              #      Replaces city with a synthetic city.
              #      E.g. Fort Collins -> Hullport
              - name: city
                value: fake.city()
              # (14) Replace postal code
              #      Replaces postal code with a synthetic postal code.
              #      E.g. 02145 -> 56789
              - name: postal_code
                value: fake.postalcode()
              # (15) Replace U.S. state abbreviation
              #      Replaces U.S. state abbreviation with synthetic U.S. state abbreviation.
              #      E.g. MA -> CA
              - name: state
                value: fake.state_abbr(include_territories=False, include_freely_associated_states=False)
              # (16) Replace phone number
              #      Replaces phone number with synthetic phone number.
              #      E.g. 123-456-7890 ->098-765-4321
              - name: phone_number
                value: fake.basic_phone_number()
              # (17) Replace email address
              #     Replaces email address with synthetic email address.
              #     E.g. andrew.johnson@gmail.com -> gwilliams@yahoo.com
              - name: email
                value: fake.ascii_email()
              # (18) Replace ID
              #     Replaces ID with synthetic ID of specified format
              #     E.g. A-1234 -> B-5678
              - name: loan_application_id
                value: fake.bothify(text="?-####")
              # (19) Generate unique entities
              #      Generates unique entities based on a unique identifier.
              #      E.g. Generate a loan ID that corresponds uniquely to an application loan ID
              - name: loan_id
                value: fake(seed=row.loan_application_id).bothify(text="???#####") # For every loan_application_id, a unique loan_id will be generated.
              # (20) Replace bank account number
              #     Replaces bank account number with synthetic bank account number.
              #     E.g. 1234567890 -> 0987654321
              - name: bank_account_number
                value: fake.bban() # bban = basic bank account number
              # (21) Replace vehicle identifier
              #     Replaces vehicle identifier with synthetic vehicle identifier.
              #     RT3GZYSKXXNDZ9J97 -> G0K75MX77NULDXVG4
              - name: vehicle_identifier
                value: fake.vin()
              # (22) Replace license plate
              #     Replaces license plate with synthetic license plate.
              #     E.g 487-YNB -> ZJ9 P3L
              - name: license_plate
                value: fake.license_plate()
              # (23) Replace serial number
              #     Replaces serial number with synthetic serial number.
              #     E.g. A1B2-C3D4-E5F6-7890 -> 9F8E-7D6C-5B4A-3C2D
              - name: device_serial_number
                value: fake.hexify(text="^^^^-^^^^-^^^^-^^^^", upper=False)
              # (24) Replace Web Universal Resource Locators (URLs)
              #     Replaces URL with synthetic URL.
              #     E.g. http://www.google.com -> https://green.info/
              - name: url
                value: fake.url()
              # (25) Replace Internet Protocol (IP) addresses
              #     Replaces IP address with synthetic IP address.
              #     E.g. 171.174.170.8 -> 95.25.112.121
              - name: ip_address
                value: fake.ipv4()
              # (26) Replace photographs and any comparable images
              #     Replaces images URLs with synthetic image URLs.
              #     E.g. https://picsum.photos/788/861 -> https://dummyimage.com/530x995
              - name: image
                value: fake.image_url()
              # (27) Replace universally unique identifying (UUID)
              #      Replaces UUID with synthetic UUID
              #      E.g. e3e70682-c209-4cac-a29f-6fbed82c07cd -> f728b4fa-4248-4e3a-8a5d-2f346baa9455
              - name: unique_identifier
                value: fake.uuid4()

"""


## 🏋 Train Gretel Model

In [24]:
import yaml

from gretel_client.projects import create_or_get_unique_project
from gretel_client.helpers import poll

# Create a project and model configuration.
project = create_or_get_unique_project(name="sample-tv2-job")

model = project.create_model_obj(
    model_config=yaml.safe_load(config), data_source=df
)

# Upload the training data.  Train the model.
model.submit_cloud()

poll(model, verbose=False)

Creating Transform V2 Model 
Generating Transform V2 data artifact... 
Loading model to device cuda:0 
Saving model archive 
Running model... 
Uploading artifacts to Gretel Cloud... 
Upload to Gretel Cloud is completed. 


In [25]:
# Use the model to generate synthetic data.
record_handler = model.create_record_handler_obj(data_source=df)

record_handler.submit_cloud()

poll(record_handler, verbose=False)

Loading model to worker 
Loading Transform V2 model... 
Loaded entities: {'first_name': None, 'last_name': None, 'full_name': None, 'suffix': None, 'date_of_birth': None, 'ssn': None, 'social_security_number': None, 'minimum_payment_amount': None, 'credit_limit': None, 'marital_status': None, 'full_address': None, 'address': None, 'city': None, 'postal_code': None, 'state': None, 'phone_number': None, 'email': None, 'loan_application_id': None, 'loan_id': None, 'bank_account_number': None, 'vehicle_identifier': None, 'license_plate': None, 'device_serial_number': None, 'url': None, 'ip_address': None, 'image': None, 'unique_identifier': None} 
Loading model to device cuda:0 
Running model... 
Uploading artifacts to Gretel Cloud... 
Upload to Gretel Cloud is completed. 


## 🔎 Compare Before and After

In [26]:
# Here is our "before"
df.head()

Unnamed: 0,first_name,last_name,full_name,suffix,date_of_birth,ssn,social_security_number,minimum_payment_amount,credit_limit,marital_status,full_address,address,city,postal_code,state,phone_number,email,loan_application_id,loan_id,bank_account_number,vehicle_identifier,license_plate,device_serial_number,url,ip_address,image,unique_identifier
0,John,Smith,John Smith,III,2010-02-16,123-45-6789,123-45-6789,500,5000.0,single,"48764 Howard Forge Apt. 421\nVanessaside, VT 79393",48764 Howard Forge Apt. 421,Vanessaside,79393,VT,123-456-7890,john.smith@gmail.com,A-1234,ABC12345,MYNB48764759382421,RT3GZYSKXXNDZ9J97,487-YNB,A1B2-C3D4-E5F6-7890,https://green.info/,171.174.170.81,https://picsum.photos/788/861,e3e70682-c209-4cac-a29f-6fbed82c07cd
1,Ada,Lovelace,Ada Lovelace,,1815-12-10,012-34-5678,012-34-5678,1000,10000.0,married,"778 Brown Plaza\nNorth Jenniferfurt, VT 88077",778 Brown Plaza,Jenniferfurt,88077,VT,234-567-8901,ada.lovelace@gmail.com,B-2345,DEF67890,TZIR92411578156593,G0K75MX77NULDXVG4,ZJ9 P3L,1A2B-3C4D-5E6F-7A8B,https://howard-snow.com/,95.25.112.121,https://dummyimage.com/530x995,f728b4fa-4248-4e3a-8a5d-2f346baa9455
2,Eric,Jones,Eric Jones,PhD,1999-07-05,234-56-7890,234-56-7890,1500,25000.0,married,"513 John Divide Suite 115\nRodriguezside, LA 93111",513 John Divide Suite 11,Rodriguezside,93111,LA,345-678-9012,eric.jones@yahoo.com,C-3456,HIJ09876,RPOQ40801609753513,16S1YMFL25CEF0V66,194-EJEY,9F8E-7D6C-5B4A-3C2D,http://campos.com/,51.105.121.194,https://dummyimage.com/621x976,eb1167b3-67a9-4378-bc65-c1e582e2e662
3,Grace,Hopper,Grace Hopper,,1906-12-09,456-78-9012,456-78-9012,2000,50000.0,single,"98 Wallace Ranch Suite 593\nIvanburgh, AZ 80818",98 Wallace Ranch Suite 593,Ivanburgh,80818,AZ,456-789-0123,grace.hopper@gmail.com,D-4567,KLM54321,SHHZ28711587148418,LZY7KJ8M0DJV6RLFJ,WZT-241,4E5F-6A7B-8C9D-0E1F,http://blair.com/,195.110.164.126,https://dummyimage.com/447x285,f7c1bd87-4da5-4709-9471-3d60c8a70639


In [27]:
# And here is our "after"
transformed = pd.read_csv(record_handler.get_artifact_link("data"), compression="gzip")
transformed.head()

Unnamed: 0,first_name,last_name,full_name,suffix,date_of_birth,ssn,social_security_number,minimum_payment_amount,credit_limit,marital_status,full_address,address,city,postal_code,state,phone_number,email,loan_application_id,loan_id,bank_account_number,vehicle_identifier,license_plate,device_serial_number,url,ip_address,image,unique_identifier
0,Matthew,Livingston,Matthew Livingston,MD,2010-02-11,6789,270-37-1204,41228,50499.8,XXXXX,"98718 Brown Hollow Apt. 658\nJoshuafurt, IA 90836",683 Larry Plains,Annstad,9375,NV,(811)626-8513,connerpenny@harrison.com,s-0187,bsK86000,BOVS71343462072808,JJNUB5VH7P9R0T5SU,PMS 269,4328-3151-7818-625c,http://www.wilson.com/,73.109.57.71,https://dummyimage.com/614x719,4903c836-8dbb-4ce5-abb8-3ed3eee64ef6
1,Diane,Cox,Diane Cox,,1815-12-07,5678,471-02-7718,63215,85851.81,XXXXX,"0857 Edwards Glens Suite 496\nSouth Waynemouth, NM 77637",8427 Lori Highway,South Gregorymouth,28694,NV,202-648-1404,smithanita@gmail.com,X-1792,Yhx63422,OVTO50871680074570,GC5FDFLE93Y1RNHBZ,3WG 369,6443-d64c-04a1-5003,https://christensen-collins.com/,19.179.156.67,https://picsum.photos/927/336,8d97ed3e-7e18-4d10-ab67-cd3f72d009a0
2,Stephen,Rodriguez,Stephen Rodriguez,DDS,1999-07-10,7890,005-49-7324,53205,90472.89,XXXXX,"1134 Ashley Cove Suite 892\nRiceton, ME 26148",479 John Gardens,Port Rebecca,14817,NM,8547803741,qreyes@yahoo.com,M-1796,gZO41490,PUWS32582244096773,L03DKE6K0B76M595D,OF4 Z0I,090b-ac80-b2b9-e8f0,http://www.hunt.info/,8.73.44.178,https://placekitten.com/848/874,09c11044-3446-43f6-939b-c2874cf3a2b3
3,Robert,Bell,Robert Bell,,1906-12-02,9012,826-76-3628,72428,92856.0,XXXXX,"42095 Marissa Valley\nWalshville, LA 13788",4228 David Walks Suite 809,West Jonview,74034,IN,2352397822,chenkeith@lewis.com,J-8691,Gpl35182,RKAA70533857887717,VHV71FD4XG5RLJ0UR,740-SMZ,c4ba-01af-184a-2b4b,https://www.moody.net/,218.248.28.46,https://picsum.photos/730/968,51e17d9e-e591-4794-9b10-848b1f1f37fb
