In [None]:
import json

import yaml
from smart_open import open
import pandas as pd

from gretel_client import create_project, submit_docker_local

# Remote CSV used as the input data set; it is passed both when the model is
# created and when the record handler is created further down.
data_source = "https://gretel-public-website.s3.us-west-2.amazonaws.com/datasets/USAdultIncome5k.csv"

# Classification policy, kept as YAML text and parsed with yaml.safe_load when
# the model is created. The labels listed are the "sensitive PII" entity types
# as defined by
# https://www.experian.com/blogs/ask-experian/what-is-personally-identifiable-information/
# NOTE(review): the "_" data_source looks like a placeholder that is superseded
# by the data_source argument given at model creation — confirm.
config = """
schema_version: 1.0
models:
  - classify:
      data_source: "_"
      labels:
        - person_name
        - credit_card_number
        - phone_number
        - us_social_security_number
        - email_address
"""


In [None]:
# Create the project object that the classification model is built from.
# NOTE(review): called with no arguments, so credentials/endpoint presumably
# come from gretel_client's own configuration — confirm it is set up first.
project = create_project()

In [None]:
# Build the classification model from the YAML policy and run it locally.
# Per the notebook's intent, this pass only covers a sample of the data set,
# which is enough to check that the model behaves as expected before the
# full classification is run.
classify_config = yaml.safe_load(config)
classify = project.create_model_obj(model_config=classify_config, data_source=data_source)

# Artifacts produced by the local run are written under tmp/.
run = submit_docker_local(
    classify,
    output_dir="tmp/",
)

In [None]:
# Review the sampled classification report found under tmp/.
# open() here is smart_open's open (see the imports cell); it is used as a
# context manager so the handle is closed as soon as the report has been
# parsed, rather than being left open for the garbage collector.
with open("tmp/report_json.json.gz") as report_file:
    report = json.load(report_file)

# Tabulate the per-field metadata from the report; as the last expression in
# the cell it is rendered as the cell output.
pd.DataFrame(report["metadata"]["fields"])

In [None]:
# Now classify the records in data_source with the model created above:
# first build a record handler on that model ...
classify_records = classify.create_record_handler_obj(
    data_source=data_source,
)

# ... then run it locally, pointing it at the model archive under tmp/ and
# writing its output to the same directory.
run = submit_docker_local(
    classify_records, model_path="tmp/model.tar.gz", output_dir="tmp/"
)

In [None]:
# Load the report found under tmp/ after the record-handler run.
# open() here is smart_open's open (see the imports cell); the context
# manager guarantees the handle is closed once the JSON has been parsed.
with open("tmp/report_json.json.gz") as report_file:
    report = json.load(report_file)

# Tabulate the per-field metadata from the report; as the last expression in
# the cell it is rendered as the cell output.
pd.DataFrame(report["metadata"]["fields"])