# Install dependencies

Your notebook runtime will restart after the installation is complete.

**DO NOT run any other cells until this one has finished executing and the runtime has restarted.**


In [None]:
!apt -q install poppler-utils tesseract-ocr
!pip install -Uqq gretel_client langchain_community unstructured[pdf]
# Restart kernel to complete installation
get_ipython().kernel.do_shutdown(True)

Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  poppler-utils tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 4 newly installed, 0 to remove and 45 not upgraded.
Need to get 5,003 kB of archives.
After this operation, 16.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.4 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 5,003 kB in 2s (2,053 kB/s)
Selecting previously unselected package poppler-utils.
(Rea

{'status': 'ok', 'restart': True}

# Choose the PDF files to anonymize

In [None]:
# @title Run this cell then click on the button below to select your PDF files
# Detect Google Colab by trying to import its helper package. Only a failed import
# means "not Colab"; any other error is allowed to surface instead of being hidden.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

if IN_COLAB: # Use Colab widgets
  # Opens Colab's file picker; the returned mapping is keyed by file name.
  uploaded = google.colab.files.upload()
else: # Use vanilla Jupyter widgets
  import ipywidgets as widgets
  uploader = widgets.FileUpload(
      accept=".pdf",  # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
      multiple=True  # True to accept multiple files upload else False
  )
  # The selection is read from uploader.value by the validation cell that follows.
  display(uploader)

Saving invoice.pdf to invoice.pdf


In [None]:
# @title Run this cell to validate and save the selected files
NOT_PDF_ERROR = TypeError(
    "Please try uploading again.\n"
    "This notebook is designed to work with PDFs, and at least one of the selected files is not a valid PDF.\n"
    "If you're interested in anonymizing files of other types, feel free to make a copy of this notebook and replace the UnstructuredPDFLoader with a different document loader: https://python.langchain.com/docs/modules/data_connection/document_loaders/."
    )

if IN_COLAB:
  # Colab's upload helper returns a mapping keyed by file name; only the names are checked here.
  if any(not f.lower().endswith(".pdf") for f in uploaded):
    raise NOT_PDF_ERROR
  files = list(uploaded.keys())
else:
  selected = uploader.value
  if isinstance(selected, dict):
    # ipywidgets 7: {name: {"metadata": {"type": ...}, "content": bytes}}
    entries = [(name, value["metadata"]["type"], value["content"]) for name, value in selected.items()]
  else:
    # ipywidgets 8: a tuple of flat dicts with "name", "type" and "content" keys
    entries = [(value["name"], value["type"], value["content"]) for value in selected]

  files = []
  for name, mime_type, content in entries:
    if mime_type != "application/pdf":
      raise NOT_PDF_ERROR
    # Persist the upload next to the notebook so the PDF loader can open it by name.
    with open(name, "wb") as fp:
      fp.write(content)
    files.append(name)

# Applies to both environments: nothing selected means nothing to anonymize.
if not files:
  raise ValueError("Please click on Upload above and choose one or more PDF files before running this section.")

files

['invoice.pdf']

# Extract text contents from the uploaded files

We use the [unstructured](https://github.com/Unstructured-IO/unstructured) library to extract the text and perform OCR (Optical Character Recognition) if necessary.

Unstructured supports many other file types, in addition to PDF. Feel free to make a copy of this notebook and experiment with changing the loader to anonymize images, Office documents, or other file formats.

In [None]:
#@title Use UnstructuredPDFLoader to extract text from PDFs
from langchain_community.document_loaders import UnstructuredPDFLoader
import pandas as pd
pd.set_option('display.max_colwidth', 2048)

# One (filename, extracted text) pair per uploaded PDF; the text is taken from the
# first document returned by the loader.
data = [
    (filename, UnstructuredPDFLoader(filename).load()[0].page_content)
    for filename in files
]

input_df = pd.DataFrame(data, columns=["filename", "content"])
input_df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0,filename,content
0,invoice.pdf,"Invoice no: 40378170\n\nDate of issue: 10/15/2012\n\nSeller: Client:\n\nPatel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228\n\nJackson, Odonnell and Jackson 267 John Track Suite 841 Jenniferville, PA 98601\n\nTax Id: 958-74-3511\n\nTax Id: 998-87-7723 IBAN: GB77WRBQ31965128414006\n\nITEMS No. Description Qty UM Net price Net worth VAT [%] Gross worth i, Leed's Wine Companion Bottle 1,00 each 7,50 7,50 10% 8,25 Corkscrew Opener Gift Box Set\n\nwith Foil Cutter\n\nSUMMARY\n\nVAT [%] Net worth VAT Gross worth\n\n10% 7,50 0,75 8,25\n\nTotal $ 7,50 $0,75 $ 8,25"


# Anonymize contents using Gretel Transform v2

In [None]:
#@title Configure the Gretel session
from getpass import getpass
from gretel_client import configure_session, ClientConfig

# Prompt for the API key with getpass so it is typed at run time rather than written into
# the notebook, and pass it straight into the client configuration without binding it to
# a notebook variable. The session targets the Gretel Cloud API endpoint.
configure_session(ClientConfig(api_key=getpass(prompt="Enter your Gretel API key "),
                               endpoint="https://api.gretel.cloud"))

Enter your Gretel API key ··········


In [None]:
#@title Build a Transform v2 configuration

#@markdown Which PII entity types are you looking to detect? Enter a comma-separated list below. Feel free to try arbitrary entity types unique to your business (YMMV).
entity_types = "invoice_number, company, address, iban, ssn, product_name" # @param {type:"string"}

#@markdown What would you like to replace detected PII entities with?
#@markdown * `redact_entities` replaces with the entity type, for example `Sally` becomes `<first_name>`
#@markdown * `hash_entities` replaces with a salted hash of the detected content, for example `Sally` could become `515acf74f`
#@markdown * `fake_entities` replaces with a random fake value of the same entity type, for example `Sally` could become `Joe` (as a fallback, entity types that are not available in Faker are redacted)
#@markdown * `label_entities` replaces with the entity type and value, for example `Sally` becomes `<entity type="first_name" value="Sally">` (this is useful for downstream processing, for example to generate an HTML report showing both the entity type and value)
goal = "label_entities" # @param ["redact_entities", "label_entities", "hash_entities", "fake_entities"]
#@markdown What should the detection confidence threshold be? A lower threshold increases the number of entities detected, while a higher threshold decreases them. 0.2-0.5 is a good starting range.
threshold = 0.3 # @param {type:"slider", min:0, max:1, step:0.05}


def split_entity_types(raw):
  """Split a comma-separated string into stripped, non-empty entity type names.

  Blank entries (for example from a trailing or doubled comma) are dropped so they
  do not end up as empty items in the YAML entity list.
  """
  return [name for name in (part.strip() for part in raw.split(",")) if name]


sanitized_entities = split_entity_types(entity_types)

# Each entity becomes its own "- name" item under `entities:`; the leading empty string
# makes the join start with the separator so the first item also lands on a new line.
# NOTE(review): the key below is spelled "ner_treshold" (not "ner_threshold") - confirm the
# expected spelling against the Transform v2 config schema before changing it.
config = """
schema_version: "1.0"
models:
  - transform_v2:
      globals:
        classify:
          entities: {}
          ner_treshold: {}
      steps:
        - rows:
            update:
              - name: content
                value: this | {}
""".format("\n            - ".join([""] + sanitized_entities), threshold, goal).strip()
print(config)

#@markdown Below is the resulting Transform v2 model config:

schema_version: "1.0"
models:
  - transform_v2:
      globals:
        classify:
          entities: 
            - invoice_number
            - company
            - address
            - iban
            - tax_registration_number
            - product_name
          ner_treshold: 0.3
      steps:
        - rows:
            update:
              - name: content
                value: this | label_entities


In [None]:
#@title Run Transform v2 and print the transformed contents
import yaml
from gretel_client.projects.projects import tmp_project
from gretel_client.helpers import poll

def transform(df: pd.DataFrame, config: str) -> pd.DataFrame:
    """Run a Gretel model built from ``config`` over ``df`` and return its data preview.

    The frame is written as JSON lines to ``data.jsonl`` in the working directory and used
    as the model's data source. The model is created in the project yielded by
    ``tmp_project()``, submitted to Gretel Cloud, and polled; its ``data_preview`` artifact
    is then read back into a new DataFrame.

    Args:
        df: Input records to transform.
        config: Model configuration as a YAML string.

    Returns:
        The contents of the model's ``data_preview`` artifact.
    """
    with tmp_project() as project:
      data_source = "data.jsonl"
      df.to_json(data_source, orient="records", lines=True)
      model = project.create_model_obj(model_config=yaml.safe_load(config), data_source=data_source)
      model.submit_cloud()
      poll(model)
      # The preview artifact is gzip-compressed JSON lines; it is read while the project is still open.
      output_df = pd.read_json(model.get_artifact_link("data_preview"), compression="gzip", lines=True)
    return output_df

# Run the extracted PDF text through the configured model; the bare name on the last
# line makes the notebook display the transformed frame.
output_df = transform(input_df, config)
output_df

INFO: Starting poller


{
    "uid": "6657e5120827467d45c1f5d9",
    "guid": "model_2hAUJqWGetqA1HmNSqi18PN0Pxo",
    "model_name": "nutty-coordinated-lion",
    "model_key": "",
    "runner_mode": "cloud",
    "user_id": "616dc68cbff62105ec179221",
    "user_guid": "user_26hlnkM1DY9YgOkZoTtEBQpHGpo",
    "billing_domain": "gretel.ai",
    "billing_domain_guid": "domain_28eujAnf9EFme26oSFok8xCUT4n",
    "project_id": "6657e50f661fbc1e68d05508",
    "project_guid": "proj_2hAUJStxz3FPzrSC70miSM424P4",
    "cluster_guid": null,
    "status_history": {
        "created": "2024-05-30T02:31:46.345680Z"
    },
    "last_modified": "2024-05-30T02:31:46.449091Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "provenance": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/models/transform_v2@sha256:b13a2fe2ca6290ff3c77ee781bfcd861585bad3467775ffa02da285a59133f1a"

INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2024-05-30T02:32:04.643178Z  Creating Transform V2 Model
2024-05-30T02:32:04.644230Z  Generating Transform V2 data artifact...
2024-05-30T02:32:24.674666Z  Saving model archive
2024-05-30T02:32:24.728511Z  Running model...
2024-05-30T02:32:27.521340Z  Uploading artifacts to Gretel Cloud...
2024-05-30T02:32:28.464870Z  Upload to Gretel Cloud is completed.


Unnamed: 0,filename,content
0,invoice.pdf,"<entity type=""invoice_number"" value=""Invoice no: 40378170"">\n\nDate of issue: 10/15/2012\n\nSeller: Client:\n\n<entity type=""company"" value=""Patel, Thompson and Montgomery""> <entity type=""address"" value=""356 Kyle Vista New James, MA 46228"">\n\n<entity type=""company"" value=""Jackson, Odonnell and Jackson""> <entity type=""address"" value=""267 John Track Suite 841 Jenniferville, PA 98601"">\n\n<entity type=""tax_registration_number"" value=""Tax Id: 958-74-3511"">\n\n<entity type=""tax_registration_number"" value=""Tax Id: 998-87-7723""> <entity type=""iban"" value=""IBAN: GB77WRBQ31965128414006"">\n\nITEMS No. Description Qty UM Net price Net worth VAT [%] Gross worth i, <entity type=""product_name"" value=""Leed's Wine Companion Bottle""> 1,00 each 7,50 7,50 10% 8,25 <entity type=""product_name"" value=""Corkscrew Opener Gift Box Set\n\nwith Foil Cutter"">\n\nSUMMARY\n\nVAT [%] Net worth VAT Gross worth\n\n10% 7,50 0,75 8,25\n\nTotal $ 7,50 $0,75 $ 8,25"


# (Optional) Create an HTML report with the PII entities tagged within the text

Note: this requires selecting the `label_entities` goal above.

In [None]:
# @title Define utility functions to extract entities to spaCy Doc object and create HTML report
import itertools
import random
import re
from IPython.display import display, HTML
from spacy import displacy


def make_entity_counts_html(docs):
    """Build an HTML box listing how many times each entity label occurs in ``docs``.

    Args:
        docs: Iterable of dicts, each with an ``"ents"`` list whose items carry a ``"label"``.

    Returns:
        An HTML string with one list item per label, ordered by descending count
        (labels with equal counts keep first-seen order), followed by a ``<br/>``.
    """
    from collections import Counter
    from html import escape

    label_counts = Counter(ent["label"] for doc in docs for ent in doc["ents"])

    parts = [
        "<div style='border: 1px solid #ccc; padding: 10px;'>",
        "<h3>Entity Type Counts</h3>",
        "<ul>",
    ]
    for entity_type, count in label_counts.most_common():
        # Labels originate from the processed text, so escape them before embedding in markup.
        parts.append(f"<li><span style='font-weight: bold;'>{escape(entity_type)}:</span> {count}</li>")
    parts.extend(["</ul>", "</div>", "<br/>"])
    return "".join(parts)


def render_entity_counts(docs):
  """Show the entity-count summary for ``docs`` as rendered HTML in the notebook."""
  display(HTML(make_entity_counts_html(docs)))


def make_colors(min_intensity=96, max_intensity=255, intensity_step=32, min_total_intensity=384, allow_gray=False):
  """Return a randomly ordered list of CSS ``rgb(r, g, b)`` color strings.

  Every channel takes each value in ``range(min_intensity, max_intensity + 1, intensity_step)``.
  Combinations whose channels sum to less than ``min_total_intensity`` are dropped, and
  so are colors with three equal channels unless ``allow_gray`` is true.
  """
  levels = list(range(min_intensity, max_intensity + 1, intensity_step))
  candidates = list(itertools.product(levels, repeat=3))
  # The full candidate list is shuffled first; filtering happens afterwards.
  random.shuffle(candidates)

  palette = []
  for red, green, blue in candidates:
    if red == green == blue and not allow_gray:
      continue
    if red + green + blue < min_total_intensity:
      continue
    palette.append(f"rgb({red}, {green}, {blue})")
  return palette


def render_entities_html(docs, **kwargs):
    """Render ``docs`` with displaCy's entity visualizer, one color per entity label.

    Args:
        docs: Dicts with ``"text"`` and ``"ents"`` keys, passed to displaCy with ``manual=True``.
        **kwargs: ``page`` (default False) and ``jupyter`` (default True) are forwarded to
            ``displacy.render`` as arguments; any other keyword overrides or extends the
            visualizer ``options``.

    Returns:
        Whatever ``displacy.render`` returns for the given arguments.
    """
    # `page` and `jupyter` are arguments of displacy.render itself rather than visualizer
    # options, so take them out of kwargs instead of letting them be merged into `options`
    # (where a requested page=True would have no effect).
    page = kwargs.pop("page", False)
    jupyter = kwargs.pop("jupyter", True)

    colors = make_colors()
    num_colors = len(colors)
    all_entity_types = set(ent["label"] for doc in docs for ent in doc["ents"])
    options = {
      "ents": list(all_entity_types),
      # Cycle through the palette if there are more labels than colors.
      "colors": {label: colors[i % num_colors] for i, label in enumerate(all_entity_types)},
      "compact": True
    }
    options.update(kwargs)
    return displacy.render(docs, style="ent", page=page, manual=True, jupyter=jupyter, options=options)


def create_spacy_entities_doc(text):
  """Convert text containing ``<entity type="..." value="...">`` tags into a text-plus-spans dict.

  Each tag is replaced by its value. The returned dict holds the untagged ``"text"`` and
  an ``"ents"`` list with one entry per tag (``label``, ``text``, ``start``, ``end``), where
  the offsets index into the untagged text.
  """
  tag_pattern = re.compile("<entity type=\"(.+?)\" value=\"(.+?)\">", re.S)
  pieces = []    # alternating plain-text chunks and entity values, in order
  spans = []
  cursor = 0     # how much of the tagged input has been consumed
  out_len = 0    # length of the untagged text assembled so far
  for match in tag_pattern.finditer(text):
    label, value = match.groups()
    plain = text[cursor:match.start()]
    pieces.append(plain)
    out_len += len(plain)
    span_end = out_len + len(value)
    spans.append({
        "label": label,
        "text": value,
        "start": out_len,
        "end": span_end
        })
    pieces.append(value)
    out_len = span_end
    cursor = match.end()
  pieces.append(text[cursor:])
  return {"text": "".join(pieces), "ents": spans}

In [None]:
# @title Build, save, and display the report
# The report is reconstructed from <entity ...> tags, which only the label_entities goal emits.
if goal != "label_entities":
    raise ValueError("Please select the label_entities goal and re-run Transform v2")

docs = [create_spacy_entities_doc(d) for d in output_df["content"].tolist()]

# Write the report explicitly as UTF-8 so non-ASCII document text does not depend on the
# platform's default encoding.
with open("report.html", "w", encoding="utf-8") as html_file:
    contents = make_entity_counts_html(docs)
    contents += render_entities_html(docs, page=True, jupyter=False)
    html_file.write(contents)

# Show the same summary and highlighted text inline in the notebook.
render_entity_counts(docs)
render_entities_html(docs, jupyter=True)