Skip to content
This repository has been archived by the owner on Feb 14, 2024. It is now read-only.

Commit

Permalink
Add redact_pii.py to code
Browse files Browse the repository at this point in the history
add code to redact pii info from a dataframe
  • Loading branch information
DJensen94 committed Jan 26, 2022
1 parent a76ccf4 commit cc524d4
Showing 1 changed file with 202 additions and 0 deletions.
202 changes: 202 additions & 0 deletions src/pe_reports/redact_pii.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
"""Functions to redact PII from a dataframe."""
# NOTE: pandas is not imported in this module; the dataframe is supplied by the caller.
# Standard Python Libraries
import re

# Third-Party Libraries
import scrubadub
import scrubadub.detectors.date_of_birth

# Per-state driver's-license number patterns (regex source strings).
# The variable names appear to be US state abbreviations; a name such as
# HI_NE_VA presumably means the format is shared by several states.
#
# Every pattern is fenced by the same two lookarounds: the candidate must
# begin at the start of the string or right after whitespace, and must end at
# the end of the string or right before whitespace.
_DL_LEAD = r"(?:(?<=\s)|(?<=^))"
_DL_TRAIL = r"(?=$|\s)"


def _dl_patterns(*cores):
    """Return a new list with each core pattern wrapped in the shared fences."""
    return [_DL_LEAD + core + _DL_TRAIL for core in cores]


CA = _dl_patterns(r"[a-zA-Z]\d{7}")
CO = _dl_patterns(r"\d{2}-\d{3}-\d{4}")
FL = _dl_patterns(
    r"[a-zA-Z] \d{3} \d{3} \d{3} \d{3}",
    r"[a-zA-Z]\d{3}-\d{3}-\d{2}-\d{3}-\d",
    r"[a-zA-Z]-\d{3}-\d{3}-\d{3}-\d{3}",
)
HI_NE_VA = _dl_patterns(r"[a-zA-Z]\d{8}")
ID = _dl_patterns(r"[a-zA-Z]{2}\d{6}[a-zA-Z]")
IL = _dl_patterns(
    r"[a-zA-Z]\d{3}-\d{4}-\d{4}",
    r"[a-zA-Z]\d{11}",
)
# NOTE(review): "IO" is not a USPS state code -- confirm which state this
# format is meant for.
IO = _dl_patterns(r"\d{4}-\d{2}-\d{4}")
IA = _dl_patterns(r"\d{3}[a-zA-Z]{2}\d{4}")
KS = _dl_patterns(r"[a-zA-Z]\d{2}-\d{2}-\d{4}")
KY = _dl_patterns(r"[a-zA-Z]\d{2}-\d{3}-\d{3}")
MD = _dl_patterns(r"[a-zA-Z]-\d{3}-\d{3}-\d{3}-\d{3}")
MI = _dl_patterns(r"[a-zA-Z]\s\d{3}\s\d{3}\s\d{3}\s\d{3}")
MN_FL_MD_MI = _dl_patterns(r"[a-zA-Z]\d{12}")
MO_OK = _dl_patterns(r"[a-zA-Z]\d{9}")
NH = _dl_patterns(
    r"([0][1-9]|[1][0-2])[a-zA-Z]{3}\d{2}(0[1-9]|[1-2][0-9]|3[0-1])\d",
)
NJ = _dl_patterns(
    r"[a-zA-Z]\d{4}-\d{5}-\d{5}",
    r"[a-zA-Z]\d{14}",
)
NY = _dl_patterns(r"\d{3} \d{3} \d{3}")
# ND and OH use the same format but stay separate list objects.
ND = _dl_patterns(r"[a-zA-Z]{3}-\d{2}-\d{4}")
OH = _dl_patterns(r"[a-zA-Z]{3}-\d{2}-\d{4}")
PA = _dl_patterns(r"\d{2}\s\d{3}\s\d{3}")
VT = _dl_patterns(r"\d{7}[a-zA-Z]")
VA = _dl_patterns(r"[a-zA-Z]\d{2}-\d{2}-\d{4}")
WA = _dl_patterns(r"[a-zA-Z]{3}\*\*[a-zA-Z]{2}\d{3}[a-zA-Z]\d")
WV = _dl_patterns(r"[a-zA-Z]\d{6}")
WI = _dl_patterns(r"[a-zA-Z]\d{3}-\d{4}-\d{4}-\d{2}")
WY = _dl_patterns(r"\d{6}-\d{3}")

# Flatten every per-state pattern list into one list of regex source strings.
# The order of the groups is kept as-is (NH is deliberately last in the
# sequence, matching the original concatenation).
all_DL = [
    pattern
    for state_patterns in (
        CA,
        CO,
        FL,
        HI_NE_VA,
        ID,
        IL,
        IO,
        IA,
        KS,
        KY,
        MD,
        MI,
        MN_FL_MD_MI,
        MO_OK,
        NJ,
        NY,
        ND,
        OH,
        PA,
        VT,
        VA,
        WA,
        WV,
        WI,
        WY,
        NH,
    )
    for pattern in state_patterns
]
# all_DL = IA
# Build a detector to find Drivers License ID


class DLFilth(scrubadub.filth.Filth):
    """Filth class, labelled ``drivers_license``, for driver's license matches."""

    type = "drivers_license"


class DLDetector(scrubadub.detectors.RegexDetector):
    """Regex detector for driver's license numbers.

    The pattern is the alternation of every entry in ``all_DL``, compiled
    case-insensitively. Matches are reported as ``DLFilth``.
    """

    filth_cls = DLFilth
    name = "drivers_license"
    regex = re.compile("|".join(all_DL), re.IGNORECASE)


# Build a detector to find credit card numbers
class CreditFilth(scrubadub.filth.Filth):
    """Filth class, labelled ``credit_card``, for credit card number matches."""

    type = "credit_card"


class CreditDetector(scrubadub.detectors.RegexDetector):
    """Regex detector for credit card numbers.

    The pattern looks for a word-bounded run that starts with one of the
    prefixes ``4ddd``, ``51dd``-``55dd``, ``2ddd`` or ``34d``/``37d`` and
    continues with further digit groups optionally separated by whitespace or
    hyphens. Matches are reported as ``CreditFilth``.
    """

    # NOTE(review): the name contains a space while CreditFilth.type uses an
    # underscore; left unchanged because callers may look the detector up by
    # this exact name.
    name = "credit card"
    # The pattern has to be a raw string. In an ordinary string literal "\b"
    # is the backspace character (0x08), not the regex word-boundary
    # assertion, so a non-raw pattern would only match card numbers that are
    # wrapped in literal backspace characters -- i.e. effectively never.
    regex = re.compile(
        r"\b((4\d{3}|5[1-5]\d{2}|2\d{3}|3[47]\d{1,2})[\s\-]?\d{4,6}[\s\-]?\d{4,6}?([\s\-]\d{3,4})?(\d{3})?)\b",
        re.IGNORECASE,
    )
    filth_cls = CreditFilth


# Build a detector to find Social security numbers with no spaces
class SSNFilth(scrubadub.filth.Filth):
    """Filth class, labelled ``social_security_number``, for SSN matches."""

    type = "social_security_number"


class SSNDetector(scrubadub.detectors.RegexDetector):
    """Regex detector for keyword-introduced SSNs written without separators.

    A match needs, at the start of the string or after whitespace, one of the
    keywords (for example ``social security number``, ``social`` or ``ssn``),
    then any run of non-word characters, then nine consecutive digits followed
    by whitespace or the end of the string. Negative lookaheads reject the
    values 219099999 and 078051120, area numbers 666, 000 and 9xx, group
    number 00 and serial number 0000. Matching is case-insensitive and
    matches are reported as ``SSNFilth``.
    """

    filth_cls = SSNFilth
    name = "no_space_ssn"
    regex = re.compile(
        r"(?:(?<=\s)|(?<=^))(social security number|Social Security No|Social Security #|social security number|social|Social|ssn)\W*(?!219099999|078051120)(?!666|000|9\d{2})\d{3}(?!00)\d{2}(?!0{4})\d{4}(?=$|\s)",
        re.IGNORECASE,
    )


# Build a detector that finds passport numbers based off of previous context
class PassportFilth(scrubadub.filth.Filth):
    """Filth class, labelled ``passport``, for passport number matches."""

    type = "passport"


class PassportDetector(scrubadub.detectors.RegexDetector):
    """Regex detector for passport numbers identified by a preceding keyword.

    A match is one of the passport keywords (``Passport Number``,
    ``Passport No``, ``Passport #``, ``Passport#``, ``PassportID``,
    ``Passportno`` or ``passportnumber``), any run of non-word characters,
    and then nine digits. Matching is case-insensitive and matches are
    reported as ``PassportFilth``.
    """

    filth_cls = PassportFilth
    name = "passport"
    regex = re.compile(
        r"(Passport Number|Passport No|Passport #|Passport#|PassportID|Passportno|passportnumber)\W*\d{9}",
        re.IGNORECASE,
    )


# Build a detector that identifies Alien Id numbers
class AlienIdFilth(scrubadub.filth.Filth):
    """Filth class, labelled ``alien id``, for alien identification numbers."""

    type = "alien id"


class AlienIdDetector(scrubadub.detectors.RegexDetector):
    """Regex detector for alien identification numbers.

    Two layouts are accepted: three letters followed by ten digits, or three
    letters followed by whitespace-separated digit groups of two, three and
    five. Matches are reported as ``AlienIdFilth``.
    """

    filth_cls = AlienIdFilth
    name = "alien id"
    # NOTE(review): the pattern is anchored with ^ and $ and compiled without
    # re.MULTILINE, so it can only match when the whole scanned string is the
    # identifier -- confirm that this is intended rather than matching IDs
    # embedded in longer text.
    regex = re.compile(
        r"^(([A-Za-z]{3}[0-9]{10})|([A-Za-z]{3}(\s)([0-9]{2}(\s)[0-9]{3}(\s)[0-9]{5})))$",
        re.IGNORECASE,
    )


# Standalone regex source strings. None of them is referenced by any active
# code in this file as shown here.

# Email address: a dot-separated or double-quoted local part, "@", then a
# dot-separated domain or a bracketed literal.
emails = r"\b([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])\b"
# Credit card number: word-bounded digit groups starting with 4ddd, 51dd-55dd,
# 2ddd or 34d/37d, optionally separated by whitespace or hyphens.
all_cards = r"\b((4\d{3}|5[1-5]\d{2}|2\d{3}|3[47]\d{1,2})[\s\-]?\d{4,6}[\s\-]?\d{4,6}?([\s\-]\d{3,4})?(\d{3})?)\b"
# US phone number with an optional leading "1"/"l" prefix. The character class
# also accepts the letters O, l, Z, S and B where digits are expected --
# presumably to tolerate look-alike characters; confirm against intended input.
US_phones = r"((\+|\b)[1l][\-\. ])?\(?\b[\dOlZSB]{3,5}([\-\. ]|\) ?)[\dOlZSB]{3}[\-\. ][\dOlZSB]{4}\b"
# US street address: 1-8 digits, 10-100 arbitrary characters (non-greedy), a
# two-letter state/DC abbreviation, one whitespace character and five digits.
US_street_address = r"\d{1,8}\b[\s\S]{10,100}?\b(AK|AL|AR|AZ|CA|CO|CT|DC|DE|FL|GA|HI|IA|ID|IL|IN|KS|KY|LA|MA|MD|ME|MI|MN|MO|MS|MT|NC|ND|NE|NH|NJ|NM|NV|NY|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VA|VT|WA|WI|WV|WY)\b\s\d{5}"


def redact_pii(df, column_list=None):
    """Redact PII from the selected columns of a dataframe.

    Args:
        df: Dataframe to scrub. Each processed column is reassigned on this
            object by ``scrub``, so the caller's dataframe is modified.
        column_list: Column labels to scrub. When omitted, ``None`` or empty,
            every column in ``df.columns`` is scrubbed.

    Returns:
        The dataframe returned by the last ``scrub`` call, or ``df`` itself
        when there are no columns to process.
    """
    # A None default (instead of a list literal) avoids sharing one mutable
    # default object across calls; an empty list still means "all columns".
    columns = column_list if column_list else df.columns
    for column in columns:
        df = scrub(df, column)
    return df


def scrub(df, column):
    """Run one dataframe column through a scrubadub scrubber.

    A fresh ``scrubadub.Scrubber`` is built on every call. The date-of-birth
    detector and this module's SSN, passport, alien-id, driver's-license and
    credit-card detectors are added to it, and the detectors named ``url``,
    ``email`` and ``phone`` are removed.

    Args:
        df: Dataframe containing ``column``; the column is reassigned on it.
        column: Label of the column to clean.

    Returns:
        The same dataframe, with string cells of ``column`` replaced by the
        scrubber's cleaned text. Non-string cells (NaN, None, numbers, ...)
        are left untouched.
    """
    # This sets a class attribute, so it applies to every DateOfBirthFilth in
    # the process, not only to this scrubber.
    # NOTE(review): presumably the minimum age for a date to count as a date
    # of birth -- confirm against the scrubadub documentation.
    scrubadub.filth.date_of_birth.DateOfBirthFilth.min_age_years = 5

    scrubber = scrubadub.Scrubber()
    scrubber.add_detector(scrubadub.detectors.date_of_birth.DateOfBirthDetector())
    for detector_cls in (
        SSNDetector,
        PassportDetector,
        AlienIdDetector,
        DLDetector,
        CreditDetector,
    ):
        scrubber.add_detector(detector_cls)
    for detector_name in ("url", "email", "phone"):
        scrubber.remove_detector(detector_name)

    def _clean_cell(value):
        """Clean text cells; pass anything that is not a str through as-is."""
        # Scrubber.clean works on text, so missing values and numbers in a
        # mixed column must not be handed to it.
        return scrubber.clean(value) if isinstance(value, str) else value

    df[column] = df[column].apply(_clean_cell)
    return df

0 comments on commit cc524d4

Please sign in to comment.