# Model & Evaluation

## 1. Load Packages and Datasets

In [1]:
import ast
import json
import numpy as np
import pandas as pd
import spacy
from tqdm.auto import tqdm
from dateutil import parser
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, PatternRecognizer, Pattern, RecognizerResult
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
from presidio_analyzer.predefined_recognizers import EmailRecognizer, UrlRecognizer, PhoneRecognizer
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from sklearn.model_selection import train_test_split

from collections import Counter
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Avoiding usage of TEST dataset as it is the first 10 rows of TRAIN dataset
train = pd.read_json('./../data/train.json')
#test = pd.read_json('./../data/test.json')

# Load pre-processed train and test set from EDA notebook
preprocessed_train_df = pd.read_json('./../data/preprocess_train.json')
#preprocessed_test_df = pd.read_json('./../data/preprocess_test.json')


us_names = pd.read_csv('./../data/NationalNames.csv')['Name'].str.upper().unique()

nltk_female = open('./../data/names/female.txt').read().split('\n')
nltk_male = open('./../data/names/male.txt').read().split('\n')

nltk_female = [i.upper() for i in nltk_female]
nltk_male = [i.upper() for i in nltk_male]

french_dept = pd.read_csv('./../data/departmental_names.csv')['name'].str.upper().unique()
french_nat = pd.read_csv('./../data/national_names.csv')['name'].str.upper().unique()

wikipedia = pd.read_csv('./../data/people_wiki.csv')['name'].str.upper().tolist()

In [3]:
def find_index(row):
    tokens  = row['tokens']
    start_ind = []
    end_ind = []
    prev_ind = 0
    for tok in tokens:
        start = prev_ind + row['full_text'][prev_ind:].index(tok)
        end = start+len(tok)
        start_ind.append(start)
        end_ind.append(end)
        prev_ind = end
    return start_ind, end_ind

def find_larger(arr, target):
    left, right = 0, len(arr) - 1

    while left <= right:
        mid = (left + right) // 2

        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return left

def count_trailing_whitespaces(word):
    return len(word) - len(word.rstrip())

def is_valid_date(text):
    try:
        parsed_date = parser.parse(text)
        return True
    except:
        return False
    

def pii_fbeta_score(pred_df, gt_df,beta=5):

    df = pred_df.merge(gt_df,how='outer',on=['document',"token"],suffixes=('_pred','_gt'))

    df['cm'] = ""

    df.loc[df.label_gt.isna(),'cm'] = "FP"
    df.loc[df.label_pred.isna(),'cm'] = "FN"
    df.loc[(df.label_gt.notna()) & (df.label_gt!=df.label_pred),'cm'] = "FN"

    df.loc[(df.label_pred.notna()) & (df.label_gt.notna()) & (df.label_gt==df.label_pred),'cm'] = "TP"
    
    FP = (df['cm']=="FP").sum()
    FN = (df['cm']=="FN").sum()
    TP = (df['cm']=="TP").sum()
    
    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    f1 = precision * recall / (precision + recall)
    
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1-Score " + str(f1))

    s_micro = (1+(beta**2))*TP/(((1+(beta**2))*TP) + ((beta**2)*FN) + FP)

    return s_micro

In [4]:
x = pd.DataFrame(preprocessed_train_df)
y = x['labels']
x = x.drop(columns='labels')
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.25, random_state=0)

In [5]:
print("Train_X: ")
train_x.info()
print("Val_X: ")
val_x.info()
print("Train_Y: ")
train_y.info()
print("Val_Y: ")
val_y.info()

Train_X: 
<class 'pandas.core.frame.DataFrame'>
Index: 5105 entries, 276 to 2732
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   document             5105 non-null   int64 
 1   full_text            5105 non-null   object
 2   tokens               5105 non-null   object
 3   trailing_whitespace  5105 non-null   object
 4   tokens_processed     5105 non-null   object
 5   tokens_count         5105 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 279.2+ KB
Val_X: 
<class 'pandas.core.frame.DataFrame'>
Index: 1702 entries, 3561 to 2448
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   document             1702 non-null   int64 
 1   full_text            1702 non-null   object
 2   tokens               1702 non-null   object
 3   trailing_whitespace  1702 non-null   object
 4   tokens_processed     1702 non-nu

In [6]:
ALLOW_LIST = []
DENY_LIST_EMAIL = []
DENY_LIST_ADDRESS = []
DENY_LIST_URL = []
DENY_LIST_NAME = []
DENY_LIST_PHONE = []
DENY_LIST_ID = []

In [7]:
all_stopwords = list(stopwords.words())
words = Counter()
for doc in preprocessed_train_df.tokens_processed:
    words.update(doc)
#Disabled for not using the lesser test records
#for doc in preprocessed_test_df.tokens_processed:
#    words.update(doc)
all_stopwords  += [str(w).lower() for w, i in words.items() if i > 55]
all_stopwords = list(sorted(set(all_stopwords)))
del words

ALLOW_LIST.extend(all_stopwords)

In [8]:
PHONE_LIST = ['phone', 'number', 'telephone', 'cell', 'cellphone',
              'mobile', 'call', 'ph', 'tel', 'mobile', 'Email']
URL_LIST = ["wikipedia", "coursera", ".pdf", ".PDF", "article",
             ".png",".gov", ".work", ".ai", ".firm", ".arts",".store",
              ".rec", ".biz", ".travel",'.ru', 'designabetterbusiness',
               '.tools', 'designorate','designresearchtechniques', 
               'ec', '.europa', 'forbes', 'google','ideas', 'trello', '.edu']

In [9]:
#"Training"
#Iterate through all of the data, identify tokens that are labels, add to allow/deny list

DENY_LIST_NAME.extend(us_names)
DENY_LIST_NAME.extend(nltk_female)
DENY_LIST_NAME.extend(nltk_male)
DENY_LIST_NAME.extend(french_dept)
DENY_LIST_NAME.extend(french_nat)

tokens = train_x['tokens'].apply(pd.Series).stack().reset_index(drop=True).tolist()
labels = train_y.apply(pd.Series).stack().reset_index(drop=True).tolist()

for i in set(labels):
    indices = [j for j in range(len(labels)) if labels[j] == i]
    if i == 'O':
        ALLOW_LIST.extend([tokens[i] for i in indices])
    if i == 'B-EMAIL':
        DENY_LIST_EMAIL.extend([tokens[i] for i in indices])
    elif i in ['B-STREET_ADDRESS', 'I-STREET_ADDRESS']:
        DENY_LIST_ADDRESS.extend([tokens[i] for i in indices])
    elif i in ['B-URL_PERSONAL', 'I-URL_PERSONAL']:
        DENY_LIST_URL.extend([tokens[i] for i in indices])
    elif i in ['B-NAME_STUDENT', 'I-NAME_STUDENT']:
    #elif i in ['I-NAME_STUDENT']:
        DENY_LIST_NAME.extend([tokens[i] for i in indices])
    elif i in ['B-PHONE_NUM', 'I-PHONE_NUM']:
        DENY_LIST_PHONE.extend([tokens[i] for i in indices])
    elif i in ['B-ID_NUM', 'I-ID_NUM']:
        DENY_LIST_ID.extend([tokens[i] for i in indices])
    else:
        continue


In [10]:
id_regex = r'([A-Za-z]{2}[.?]:)?\d{12,12}'
id_pattern = Pattern(name="id", regex=id_regex, score = 0.5)
id_recognizer = PatternRecognizer(supported_entity="ID_CUSTOM", patterns = [id_pattern])

address_regex = r'\b\d+\s+\w+(\s+\w+)*\s+((st(\.)?)|(ave(\.)?)|(cir(\.)?)|(rd(\.)?)|(blvd(\.)?)|(ln(\.)?)|(ct(\.)?)|(dr(\.)?))\b'
address_pattern = Pattern(name="address", regex=address_regex, score=0.5)
address_recognizer = PatternRecognizer(supported_entity="ADDRESS_CUSTOM", patterns = [address_pattern], context=["st", "Apt"])

email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
email_pattern = Pattern(name="email address", regex=email_regex, score=0.5)
email_recognizer = PatternRecognizer(supported_entity="EMAIL_CUSTOM", patterns = [email_pattern])

url_regex = r'((https?)|(http?)|(ftp?))://\S+|www\.\S+'
url_pattern = Pattern(name="url", regex=url_regex, score=0.5)
url_recognizer = PatternRecognizer(supported_entity="URL_CUSTOM", patterns = [url_pattern])

phone_regex = r'^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$'
phone_pattern = Pattern(name='phone', regex=phone_regex, score=0.5)
phone_recognizer = PatternRecognizer(supported_entity='PHONE_CUSTOM', patterns=[phone_pattern])

In [11]:
#Microsoft Number Recognizer to consider lettered-numbers as well
class NumbersRecognizer(EntityRecognizer):

    expected_confidence_level = 0.7  # expected confidence level for this recognizer

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(self, text: str, entities: list[str], nlp_artifacts: NlpArtifacts) -> list[RecognizerResult]:
        """
        Analyzes test to find tokens which represent numbers (either 123 or One Two Three).
        """
        results = []

        # iterate over the spaCy tokens, and call `token.like_num`
        for token in nlp_artifacts.tokens:
            if token.like_num:
                result = RecognizerResult(
                    entity_type="NUMBER",
                    start=token.idx,
                    end=token.idx + len(token),
                    score=self.expected_confidence_level,
                )
                results.append(result)
        return results

new_numbers_recognizer = NumbersRecognizer(supported_entities=["NUMBER"])

In [12]:
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
}
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

In [13]:
dictionary = RecognizerRegistry()
dictionary.load_predefined_recognizers()
dictionary.add_recognizer(address_recognizer)
dictionary.add_recognizer(email_recognizer)
dictionary.add_recognizer(url_recognizer)
dictionary.add_recognizer(phone_recognizer)
dictionary.add_recognizer(id_recognizer)

In [14]:
analyzer = AnalyzerEngine(supported_languages=['en'],
                          registry=dictionary,
                          nlp_engine=nlp_engine,
                          context_aware_enhancer=LemmaContextAwareEnhancer(
                              context_similarity_factor=0.6,
                              min_score_with_context_similarity=0.4
                          ))

Trainnig and Testing

In [15]:
preds = []
#Testing on the split documents, as they are higher in count than the original test.json file

#test = preprocessed_test_df
test = val_x

temp = test.apply(lambda x: find_index(x), axis=1)
test['start'] = temp.apply(lambda x: x[0])
test['end'] = temp.apply(lambda x: x[1])
#pii_labels = [
#    'B-NAME_STUDENT', 'I-NAME_STUDENT',
#    'B-URL_PERSONAL', 'I-URL_PERSONAL',
#    'B-ID_NUM', 'I-ID_NUM',
#    'B-EMAIL', 'I-EMAIL',
#    'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
#    'B-PHONE_NUM', 'I-PHONE_NUM',
#    'B-USERNAME', 'I-USERNAME'
#]



In [16]:
for i, d in enumerate(tqdm(test.iterrows())):
    results = analyzer.analyze(text=d[1]['full_text'],
                               entities= ["PHONE_CUSTOM", "PERSON", "URL_CUSTOM", "EMAIL_ADDRESS",
                                          "EMAIL_CUSTOM", "ADDRESS_CUSTOM", "US_SSN", "US_ITIN",
                                          "US_PASSPORT", "US_BANK_NUMBER", "USERNAME", "ID_CUSTOM"],                                         
                               allow_list=ALLOW_LIST,
                               language='en', 
                               score_threshold=0.005)
    pre_preds = []
    for r in results:
        s = find_larger(d[1]['start'], r.start)
        end = r.end
        word = d[1]['full_text'][r.start:r.end]
        end = end - count_trailing_whitespaces(word)
        temp_preds = [s]
        try:
            while d[1]['end'][s+1] <= end:
                temp_preds.append(s+1)
                s +=1
        except:
            pass
        
        tmp = False
        
        #'B-PHONE_NUM', 'I-PHONE_NUM',
        if r.entity_type in ('PHONE_CUSTOM', 'B-PHONE_NUM', 'I-PHONE_NUM'):
            if is_valid_date(word):
                continue
            for w in PHONE_LIST:
                if w in d[1]['full_text'][max(r.start-50, 0):min(r.end+50, len(d[1]['full_text']))]:
                    tmp = False
                    break
                else:
                    tmp = True 
            label =  'PHONE_NUM'
        #'B-NAME_STUDENT', 'I-NAME_STUDENT',
        if r.entity_type in ('PERSON', 'B-NAME_STUDENT', 'I-NAME_STUDENT'):
            if str(i).upper() in wikipedia:
                tmp = True
                break
            label =  'NAME_STUDENT'
        #'B-URL_PERSONAL', 'I-URL_PERSONAL',
        if r.entity_type in ('URL_CUSTOM', 'B-URL_PERSONAL', 'I-URL_PERSONAL'):
            for w in URL_LIST:
                if w in word:
                    tmp = True
                    break
            label = 'URL_PERSONAL'
        #'B-EMAIL', 'I-EMAIL', 
        if r.entity_type in ('B-EMAIL', 'I-EMAIL','EMAIL_ADDRESS', 'EMAIL_CUSTOM'):
            label = "EMAIL"
        #'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
        if r.entity_type in ('B-STREET_ADDRESS', 'ADDRESS_CUSTOM', 'I-STREET_ADDRESS'):
            label = 'STREET_ADDRESS'
        #'B-ID_NUM', 'I-ID_NUM',
        if r.entity_type in ['B-ID_NUM', 'I-ID_NUM', 'US_SSN', 'US_ITIN', 'US_PASSPORT', 'US_BANK_NUMBER', 'ID_CUSTOM']:
            label = 'ID_NUM'
        #'B-USERNAME', 'I-USERNAME'
        if r.entity_type in ['B-USERNAME', 'I-USERNAME', 'USERNAME']:
            label =  'USERNAME'
        if tmp:
            continue
        for p in temp_preds:
            if len(pre_preds) > 0:
                if pre_preds[-1]['rlabel'] == r.entity_type and ((p - pre_preds[-1]['token'])==1):
                    label_f = "I-"+label
                else:
                    label_f = "B-"+label
            else:
                label_f = "B-"+label
            pre_preds.append(({
                    "document":d[1]['document'],
                    "token":p,
                    "label":label_f,
                    "rlabel":r.entity_type
                }))
    preds.extend(pre_preds)

1702it [03:37,  7.82it/s]


In [17]:
final_results = pd.DataFrame(preds).iloc[:,:-1].reset_index()
final_results.columns = ['row_id','document', 'token', 'label']

temp = val_x[['document']].join(val_y)

dictionary = temp['labels'].apply(lambda x: {'indx': list(range(len(x))), 'vals': x})
indices = dictionary.apply(lambda x: x['indx']).explode()
values = dictionary.apply(lambda x: x['vals']).explode()

ground_truth = pd.concat([indices, values], axis=1).reset_index()
ground_truth['document'] = ground_truth['index'].apply(lambda x: temp['document'][x])
ground_truth = ground_truth.drop(columns='index')
ground_truth.columns = ['token', 'label', 'document']
ground_truth = ground_truth[ground_truth['label'] != 'O']
ground_truth = ground_truth.reset_index(names=['row_id'])

In [18]:
print(pii_fbeta_score(final_results, ground_truth, 5))

Precision: 0.27905004240882103
Recall: 0.8255959849435383
F1-Score 0.20855784469096672
0.7677601759188619
