<a href="https://www.kaggle.com/code/scr0ll0/modeling-pii-data-detection?scriptVersionId=160013021" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

We will be using Presidio: a Microsoft framework for PII detection. Credit to pjmathematician for both the idea and the baseline code: https://www.kaggle.com/code/pjmathematician/pii-eda-presidio-baseline

# Downloading and Loading Libraries

In [1]:
%%capture
!pip install presidio_analyzer --no-index --find-links=file:///kaggle/input/presidio-wheels/presidio

In [2]:
import json
import pandas as pd
from tqdm.auto import tqdm
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, PatternRecognizer, Pattern, RecognizerResult
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
from presidio_analyzer.predefined_recognizers import PhoneRecognizer
from presidio_analyzer.recognizer_registry import RecognizerRegistry
import spacy

# Loading Data

In [3]:
# Read both competition files with context managers so the file handles are
# closed deterministically; JSON is decoded as UTF-8 regardless of locale.
with open('/kaggle/input/pii-detection-removal-from-educational-data/train.json', encoding='utf-8') as train_file:
    train = json.load(train_file)
with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json', encoding='utf-8') as test_file:
    test = json.load(test_file)

# Functions

In [4]:
#Functions sourced from pjmathematician: https://www.kaggle.com/code/pjmathematician/pii-eda-presidio-baseline

def tokens2index(row):
    """Locate every token of a document inside its raw text.

    Tokens are matched left to right: each token is searched for starting at
    the end of the previous match, so repeated tokens map to successive
    occurrences.

    Args:
        row: mapping with ``'tokens'`` (sequence of token strings) and
            ``'full_text'`` (the string the tokens were cut from).

    Returns:
        ``(start_ind, end_ind)``: two lists, one entry per token, holding the
        character offset where the token begins and the offset just past its
        last character.

    Raises:
        ValueError: if a token does not occur in ``full_text`` at or after the
            end of the previous token.
    """
    text = row['full_text']
    start_ind = []
    end_ind = []
    cursor = 0
    for tok in row['tokens']:
        # Search from `cursor` instead of slicing the text: same offsets,
        # without copying the remainder of the document for every token.
        start = text.index(tok, cursor)
        end = start + len(tok)
        start_ind.append(start)
        end_ind.append(end)
        cursor = end
    return start_ind, end_ind

In [5]:
def find_or_next_larger(arr, target):
    """Binary-search a sorted sequence for ``target``.

    Returns the index of an element equal to ``target`` when one is found;
    otherwise the index of the first element greater than ``target``
    (``len(arr)`` when every element is smaller).
    """
    lo = 0
    hi = len(arr) - 1
    while lo <= hi:
        middle = (lo + hi) // 2
        candidate = arr[middle]
        if candidate == target:
            return middle
        if candidate < target:
            # Everything up to `middle` is too small.
            lo = middle + 1
            continue
        hi = middle - 1
    return lo

In [6]:
def count_trailing_whitespaces(word):
    """Return how many whitespace characters sit at the end of ``word``."""
    trimmed = word.rstrip()
    return len(word) - len(trimmed)

# Modeling

In [7]:
# Terms handed to `analyzer.analyze(..., allow_list=ALLOW_LIST)` in the
# training cell. The duplicated 'mobile' entry has been dropped.
ALLOW_LIST = [
    'phone', 'number', 'telephone', 'cell', 'cellphone',
    'mobile', 'call', 'ph', 'tel', 'Email',
]

# Terms handed to the custom URL recognizer as `deny_list`.
# NOTE(review): Presidio documents a PatternRecognizer `deny_list` as a list
# of words to *detect*, so these terms are presumably reported as URL_CUSTOM
# rather than excluded from it -- confirm that this is the intended effect.
DENY_LIST = [
    "wikipedia", "coursera", ".pdf", ".PDF", "article",
    ".png", ".gov", ".work", ".ai", ".firm", ".arts",
    ".store", ".rec", ".biz", ".travel",
]

In [8]:
# Credit to leonshangguan for the pattern recognizers.

# Street address: a house number, one or more words, then a street-type
# abbreviation (st, ave, rd, blvd, ln, ct, dr) with an optional trailing dot.
address_regex = r'\b\d+\s+\w+(\s+\w+)*\s+((st(\.)?)|(ave(\.)?)|(rd(\.)?)|(blvd(\.)?)|(ln(\.)?)|(ct(\.)?)|(dr(\.)?))\b'
address_pattern = Pattern(name="address", regex=address_regex, score=0.5)
address_recognizer = PatternRecognizer(supported_entity="ADDRESS_CUSTOM", patterns = [address_pattern], context=["st", "Apt"])

# Email address. The top-level-domain class is `[A-Za-z]`: the previous
# `[A-Z|a-z]` also accepted a literal '|' because '|' is not an alternation
# operator inside a character class.
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
email_pattern = Pattern(name="email address", regex=email_regex, score=0.5)
email_recognizer = PatternRecognizer(supported_entity="EMAIL_CUSTOM", patterns = [email_pattern])

# URL: anything starting with http://, https:// or www. up to the next whitespace.
# NOTE(review): Presidio documents `deny_list` as a list of words to *detect*,
# so the DENY_LIST terms are presumably reported as URL_CUSTOM rather than
# filtered out -- confirm that this is the intended effect.
url_regex = r'https?://\S+|www\.\S+'
url_pattern = Pattern(name="url", regex=url_regex, score=0.5)
url_recognizer = PatternRecognizer(supported_entity="URL_CUSTOM", patterns = [url_pattern], deny_list=DENY_LIST)

# Phone recognizer with custom context words (duplicate 'mobile' removed).
phone_recognizer = PhoneRecognizer(context=['phone', 'number', 'telephone', 'cell', 'cellphone', 'mobile', 'call', 'ph', 'tel', 'Email'])

In [9]:
# Credit to Microsoft for the numbers recognizer, which flags numeric tokens
# including numbers written out as words such as "Fifty":
# https://microsoft.github.io/presidio/tutorial/03_rule_based/

class NumbersRecognizer(EntityRecognizer):
    """Rule-based recognizer that reports number-like tokens as ``NUMBER``."""

    # Score attached to every result produced by this recognizer.
    expected_confidence_level = 0.7

    def load(self) -> None:
        """Nothing to load for this recognizer."""
        pass

    def analyze(self, text: str, entities: list[str], nlp_artifacts: NlpArtifacts) -> list[RecognizerResult]:
        """
        Analyze text to find tokens which represent numbers (either 123 or One Two Three).

        One result is produced per token in ``nlp_artifacts.tokens`` whose
        ``like_num`` flag is set; ``text`` and ``entities`` are not consulted.
        """
        return [
            RecognizerResult(
                entity_type="NUMBER",
                start=token.idx,
                end=token.idx + len(token),
                score=self.expected_confidence_level,
            )
            for token in nlp_artifacts.tokens
            if token.like_num
        ]

new_numbers_recognizer = NumbersRecognizer(supported_entities=["NUMBER"])

In [10]:
# More credit to Microsoft for the engine configuration:
# https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/

# spaCy engine using the large English model for the 'en' language code.
configuration = dict(
    nlp_engine_name="spacy",
    models=[dict(lang_code="en", model_name="en_core_web_lg")],
)
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

In [11]:
# Start from Presidio's predefined recognizers, then register the custom ones
# in the same order as before.
registry = RecognizerRegistry()
registry.load_predefined_recognizers()
for _custom_recognizer in (
    address_recognizer,
    email_recognizer,
    url_recognizer,
    new_numbers_recognizer,
    phone_recognizer,
):
    registry.add_recognizer(_custom_recognizer)

In [12]:
# Context enhancer settings are kept in one place so they are easy to tune;
# see Presidio's LemmaContextAwareEnhancer for the meaning of each parameter.
context_enhancer = LemmaContextAwareEnhancer(
    context_similarity_factor=0.8,
    min_score_with_context_similarity=0.4,
)

# English-only analyzer wired to the custom registry and the spaCy engine.
analyzer = AnalyzerEngine(
    supported_languages=['en'],
    registry=registry,
    nlp_engine=nlp_engine,
    context_aware_enhancer=context_enhancer,
)

## Training

In [13]:
# Presidio entity type -> competition label. The keys (in this order) are also
# the entity types requested from the analyzer.
ENTITY_TO_LABEL = {
    "PHONE_NUMBER": "PHONE_NUM",
    "PERSON": "NAME_STUDENT",
    "URL_CUSTOM": "URL_PERSONAL",
    "EMAIL_ADDRESS": "EMAIL",
    "EMAIL_CUSTOM": "EMAIL",
    "ADDRESS_CUSTOM": "STREET_ADDRESS",
    "US_SSN": "ID_NUM",
    "US_ITIN": "ID_NUM",
    "US_PASSPORT": "ID_NUM",
    "US_BANK_NUMBER": "ID_NUM",
    "USERNAME": "USERNAME",
}

preds = []

# Character offsets of every token, used to map Presidio's character spans
# back onto token indices.
for i in tqdm(range(len(train))):
    start, end = tokens2index(train[i])
    train[i]['start'] = start
    train[i]['end'] = end

# Only the first 10 training documents are analysed here.
for i, d in enumerate(tqdm(train[:10])):
    results = analyzer.analyze(text=d['full_text'],
                               entities=list(ENTITY_TO_LABEL),
                               language='en',
                               allow_list=ALLOW_LIST)
    token_starts, token_ends = d['start'], d['end']
    n_tokens = len(token_starts)
    pre_preds = []
    for r in results:
        label = ENTITY_TO_LABEL.get(r.entity_type)
        if label is None:
            # No competition label for this entity type: nothing to record
            # (previously this reused a stale label or raised NameError).
            continue

        word = d['full_text'][r.start:r.end]
        span_end = r.end - count_trailing_whitespaces(word)

        # First token starting at or after the span start.
        s = find_or_next_larger(token_starts, r.start)
        if s >= n_tokens or token_starts[s] >= span_end:
            # No token begins inside the span; recording `s` would label a
            # token outside the detected text (or a non-existent token).
            continue

        # Extend over the following tokens that end inside the span. The
        # explicit bound replaces a bare `except:` that hid the IndexError
        # raised at the last token (and any other error).
        temp_preds = [s]
        while s + 1 < n_tokens and token_ends[s + 1] <= span_end:
            s += 1
            temp_preds.append(s)

        if i == 0:
            print(temp_preds, r.entity_type, word, r.score)

        for p in temp_preds:
            # "I-" only when the previously recorded token is the immediate
            # predecessor and came from the same Presidio entity type.
            continues_previous = (
                bool(pre_preds)
                and pre_preds[-1]['rlabel'] == r.entity_type
                and p - pre_preds[-1]['token'] == 1
            )
            pre_preds.append({
                "document": d['document'],
                "token": p,
                "label": ("I-" if continues_previous else "B-") + label,
                "rlabel": r.entity_type,
            })
    preds.extend(pre_preds)

  0%|          | 0/6807 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

[9, 10] PERSON Nathalie Sylla

 0.85
[52, 53] PERSON Buzan T. 0.85
[55, 56] PERSON Buzan B. 0.85
[60, 61, 62, 63, 64] PERSON Dessine-moi  l'intelligence 0.85
[482, 483] PERSON Nathalie Sylla 0.85
[741, 742] PERSON Nathalie Sylla 0.85


## Testing

In [14]:
# Start from an empty list so the rows collected from the training sample
# above do not end up in the submission built from `preds`.
preds = []

In [15]:
# Attach the character offsets of every token to each test document.
for doc in tqdm(test):
    doc['start'], doc['end'] = tokens2index(doc)

  0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
# Presidio entity type -> competition label. The keys (in this order) are also
# the entity types requested from the analyzer.
ENTITY_TO_LABEL = {
    "PHONE_NUMBER": "PHONE_NUM",
    "PERSON": "NAME_STUDENT",
    "URL_CUSTOM": "URL_PERSONAL",
    "EMAIL_ADDRESS": "EMAIL",
    "EMAIL_CUSTOM": "EMAIL",
    "ADDRESS_CUSTOM": "STREET_ADDRESS",
    "US_SSN": "ID_NUM",
    "US_ITIN": "ID_NUM",
    "US_PASSPORT": "ID_NUM",
    "US_BANK_NUMBER": "ID_NUM",
    "USERNAME": "USERNAME",
}

for i, d in enumerate(tqdm(test)):
    # `allow_list` is passed here as well so that inference uses the same
    # analyzer settings as the run on the training sample.
    results = analyzer.analyze(text=d['full_text'],
                               entities=list(ENTITY_TO_LABEL),
                               language='en',
                               allow_list=ALLOW_LIST)
    token_starts, token_ends = d['start'], d['end']
    n_tokens = len(token_starts)
    pre_preds = []
    for r in results:
        label = ENTITY_TO_LABEL.get(r.entity_type)
        if label is None:
            # No competition label for this entity type: nothing to record
            # (previously this reused a stale label or raised NameError).
            continue

        word = d['full_text'][r.start:r.end]
        span_end = r.end - count_trailing_whitespaces(word)

        # First token starting at or after the span start.
        s = find_or_next_larger(token_starts, r.start)
        if s >= n_tokens or token_starts[s] >= span_end:
            # No token begins inside the span; recording `s` would label a
            # token outside the detected text (or a non-existent token).
            continue

        # Extend over the following tokens that end inside the span. The
        # explicit bound replaces a bare `except:` that hid the IndexError
        # raised at the last token (and any other error).
        temp_preds = [s]
        while s + 1 < n_tokens and token_ends[s + 1] <= span_end:
            s += 1
            temp_preds.append(s)

        for p in temp_preds:
            # "I-" only when the previously recorded token is the immediate
            # predecessor and came from the same Presidio entity type.
            continues_previous = (
                bool(pre_preds)
                and pre_preds[-1]['rlabel'] == r.entity_type
                and p - pre_preds[-1]['token'] == 1
            )
            pre_preds.append({
                "document": d['document'],
                "token": p,
                "label": ("I-" if continues_previous else "B-") + label,
                "rlabel": r.entity_type,
            })
    preds.extend(pre_preds)

  0%|          | 0/10 [00:00<?, ?it/s]

# Submission

In [17]:
# Build the submission table from the collected prediction records.
# - Columns are selected by name (the helper 'rlabel' key is left out), which
#   also yields a correctly shaped empty frame when `preds` is empty.
# - Identical (document, token, label) rows are dropped: two Presidio entity
#   types can map to the same label (e.g. EMAIL_ADDRESS and EMAIL_CUSTOM), so
#   one token may have been recorded twice.
# - `row_id` is the running row number after de-duplication.
submission = (
    pd.DataFrame(preds, columns=['document', 'token', 'label'])
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('row_id')
    .reset_index()
)
submission

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,52,B-NAME_STUDENT
3,3,7,53,I-NAME_STUDENT
4,4,7,55,B-NAME_STUDENT
...,...,...,...,...
106,106,123,1591,B-URL_PERSONAL
107,107,123,1648,B-URL_PERSONAL
108,108,123,1649,I-URL_PERSONAL
109,109,123,1690,B-URL_PERSONAL


In [18]:
# Write the submission file; the DataFrame index is not written because
# `row_id` is already a regular column.
submission.to_csv('submission.csv', index = False)