In [1]:
# Import necessary libraries
import os

import pandas as pd
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

In [26]:
# Service key and endpoint of Azure Text Analytics.
# Both values are read from environment variables so that real credentials never have
# to be typed into (and saved with) the notebook. The placeholder strings are used only
# when the corresponding variable is not set.
key = os.environ.get(
    "AZURE_TEXT_ANALYTICS_KEY",
    "PLEASE_ENTER_YOUR_OWNED_AZURE_TEXT_ANALYTICS_SERVICE_KEY",
)
endpoint = os.environ.get(
    "AZURE_TEXT_ANALYTICS_ENDPOINT",
    "https://PLEASE_ENTER_YOUR_OWNED_AZURE_TEXT_ANALYTICS_INSTANCE_NAME.cognitiveservices.azure.com/",
)

In [3]:
# Build the Azure Text Analytics client used by the rest of the notebook
def authenticate_client():
    """Return a TextAnalyticsClient for the module-level ``endpoint``.

    The client is authenticated with an AzureKeyCredential wrapping the
    module-level ``key``.
    """
    return TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))


client = authenticate_client()

In [4]:
# Small hand-written batch used to try out PII redaction on a few value formats
documents = [
    # Visa card number variations
    "4111 1111 1111 1111, 4111111111111111, 4111-1111-1111-1111",
    # Master Card number variations
    "5105 1051 0510 5100, 5105105105105100, 5105-1051-0510-5100",
    # Phone number variations
    "56681231, +862056681231, 85256681231, 5668 1231",
    # Name variations
    "My name is Eason Lai, My name is Eason, Eason, Mr. Lai, Miss. Kwan, Mrs Lai, Dr Lai",
    # Email address
    "easonlai@msn.com",
]
response = client.recognize_pii_entities(documents, language="en")

# Keep only the documents the service processed without error and print their redacted text
result = []
for doc in response:
    if doc.is_error:
        continue
    result.append(doc)
    print("PII Scrubbed Text: {}".format(doc.redacted_text))

PII Scrubbed Text: *******************, ****************, *******************
PII Scrubbed Text: *******************, ****************, *******************
PII Scrubbed Text: ********, +************, ***********, *********
PII Scrubbed Text: My name is *********, My name is *****, *****, Mr. ***, Miss. ****, Mrs ***, Dr ***
PII Scrubbed Text: ****************


In [5]:
# Load the pipe-delimited PII sample file into a pandas DataFrame
pii_df = pd.read_csv(
    filepath_or_buffer='data/pii-sample-data.csv',
    sep='|',
)

In [6]:
# Preview the first five rows of the PII sample DataFrame
pii_df.head(5)

Unnamed: 0,PII-Sample-Data
0,5555 5537 5304 8194
1,5555553753048194
2,5555-5537-5304-8194
3,5555 5555 5555 4444
4,5555555555554444


In [7]:
# Number of non-null entries in each column of the sample DataFrame
pii_df.count(axis=0)

PII-Sample-Data    63
dtype: int64

First level of PII Data Scrubbing by Azure Text Analytics

In [8]:
# First level of PII data scrubbing by Azure Text Analytics. PII such as Visa, Master Card and
# American Express card numbers, phone numbers, names, addresses and email addresses is expected
# to be scrubbed by the service.
# Each row is sent as its own single-document request and the service's redacted text is kept.
scrubbed_pii_df = []
scrub_failures = []  # (row index, service error) for every document the service rejected

for index, headers in pii_df.iterrows():
    pii_row = [str(headers['PII-Sample-Data'])]
    response = client.recognize_pii_entities(pii_row, language="en")
    for doc in response:
        if doc.is_error:
            # An errored document has no redacted text. Record it instead of dropping it
            # silently, so a shorter output frame never goes unnoticed.
            scrub_failures.append((index, doc.error))
            continue
        scrubbed_pii_df.append([doc.redacted_text])

scrubbed_pii_df = pd.DataFrame(scrubbed_pii_df, columns=['Scrubbed-PII-Sample-Data'])
print('Scrubbed PII Dataframe Created')
if scrub_failures:
    # Skipped rows are absent from scrubbed_pii_df, so its row positions no longer match pii_df.
    print('WARNING: {} row(s) could not be scrubbed and were skipped: {}'.format(
        len(scrub_failures), scrub_failures))

Scrubbed PII Dataframe Created


In [9]:
# Preview the first five rows of the scrubbed PII DataFrame
scrubbed_pii_df.head(5)

Unnamed: 0,Scrubbed-PII-Sample-Data
0,*******************
1,****************
2,*******************
3,*******************
4,****************


In [10]:
# Flag, per row, whether the scrubbed text contains the mask character '*', then count the flags.
# The pattern is a raw string so the regex escape \* is not parsed as an invalid string escape.
# NOTE: Series.count() returns the number of non-null flags (True and False alike), i.e. the
# number of rows; scrubbed_count.sum() would give the number of rows that contain a mask.
scrubbed_count = scrubbed_pii_df['Scrubbed-PII-Sample-Data'].str.contains(r"\*")
scrubbed_count.count()

63

In [11]:
# Display every row whose text does not contain the mask character '*'.
# The pattern is a raw string so the regex escape \* is not parsed as an invalid string escape.
scrubbed_pii_df[~scrubbed_pii_df['Scrubbed-PII-Sample-Data'].str.contains(r"\*")]

Unnamed: 0,Scrubbed-PII-Sample-Data
57,A123459(2)
58,Z126659 3
59,A123459-3
60,Z 123459-3
61,my id card number is A123459(2)
62,my hk id is Z126659 3


Second level of PII Data Scrubbing with the Python re library

In [13]:
# Export the scrubbed PII DataFrame to a CSV file.
# A forward slash is used as the path separator: it works on Windows, Linux and macOS, whereas a
# backslash is only a separator on Windows and would otherwise become part of the file name.
scrubbed_pii_df.to_csv('data/pii-scrubbed-sample-data.csv', encoding='utf-8', index=False)

In [14]:
# Import necessary libraries
import pandas as pd
import re

In [15]:
# Read the first-level scrubbed data back into a pandas DataFrame.
# The file is written by DataFrame.to_csv with its default comma delimiter, so it is read with
# the default delimiter too; reading it with sep='|' would split any text that contains '|'.
pii_df_2 = pd.read_csv('data/pii-scrubbed-sample-data.csv')

In [16]:
# Preview the first five rows of the re-loaded scrubbed DataFrame
pii_df_2.head(5)

Unnamed: 0,Scrubbed-PII-Sample-Data
0,*******************
1,****************
2,*******************
3,*******************
4,****************


In [17]:
# Number of non-null entries in each column of the re-loaded DataFrame
pii_df_2.count(axis=0)

Scrubbed-PII-Sample-Data    63
dtype: int64

In [19]:
# Second level of PII data scrubbing with the re library, aimed at values such as the
# Hong Kong Identity Card (HK ID) number. Every run of digits (optionally joined by hyphens)
# in a row's text is replaced with a single '*'.
scrubbed_pii_df_2 = pd.DataFrame(
    [
        [re.sub(r'(\d+(?:-\d+)*)', '*', str(row['Scrubbed-PII-Sample-Data']))]
        for _, row in pii_df_2.iterrows()
    ],
    columns=['Scrubbed-PII-Sample-Data'],
)
print('Scrubbed PII Dataframe Created')

Scrubbed PII Dataframe Created


In [22]:
# Preview the first five rows of the second-level scrubbed DataFrame
scrubbed_pii_df_2.head(5)

Unnamed: 0,Scrubbed-PII-Sample-Data
0,*******************
1,****************
2,*******************
3,*******************
4,****************


In [23]:
# Flag, per row, whether the scrubbed text contains the mask character '*', then count the flags.
# The pattern is a raw string so the regex escape \* is not parsed as an invalid string escape.
# NOTE: Series.count() returns the number of non-null flags (True and False alike), i.e. the
# number of rows; scrubbed_count_2.sum() would give the number of rows that contain a mask.
scrubbed_count_2 = scrubbed_pii_df_2['Scrubbed-PII-Sample-Data'].str.contains(r"\*")
scrubbed_count_2.count()

63

In [24]:
# Display every row whose text does not contain the mask character '*'.
# The pattern is a raw string so the regex escape \* is not parsed as an invalid string escape.
scrubbed_pii_df_2[~scrubbed_pii_df_2['Scrubbed-PII-Sample-Data'].str.contains(r"\*")]

Unnamed: 0,Scrubbed-PII-Sample-Data


In [25]:
# Export the second-level scrubbed PII DataFrame to a CSV file.
# A forward slash is used as the path separator: it works on Windows, Linux and macOS, whereas a
# backslash is only a separator on Windows and would otherwise become part of the file name.
scrubbed_pii_df_2.to_csv('data/pii-scrubbed-sample-data-2.csv', encoding='utf-8', index=False)