## Inference - PII NER

The provided eval data needs to be tagged and exported to an Excel file.
Notes:
- `Apt. xxx` patterns seem to be confused with plates; this can be easily fixed by adding such examples to the training data.
- Some Phone_number labels are identified as Address. We might have imbalanced the classes by adding too much fake data; in the next iteration, we could reduce the volume of addresses or add more data for phone numbers.

In [1]:
# imports
import pandas as pd
import numpy as np

# local files
import rules
import preprocess
import datagen

from flair.data import Sentence
from flair.models import SequenceTagger

In [2]:
# Path of the trained tagger checkpoint used for inference
model_path = '../model/taggers/pii-ner-v1/final-model.pt'
model = SequenceTagger.load(model_path)

# Rules engine; its pin_text() is applied to each text before tagging
engine = rules.RulesEngine()

2021-02-01 14:55:57,963 loading file ../model/taggers/pii-ner-v1/final-model.pt


In [3]:
# Helper function for getting the predictions

def get_predictions(text, rules_engine, model):
    '''
    Tag ``text`` and return only its most confident entity.

    The text is first rewritten by ``rules_engine.pin_text`` and the result
    is tagged with ``model``. For every entity found, only its first label
    is considered.

    Returns a dict with the keys ``tagged_sentence`` (the pinned text that
    was tagged), ``label``, ``entity`` and ``confidence``. When nothing is
    tagged, ``label`` is the string "None", ``entity`` is empty and
    ``confidence`` is 0.0.
    '''
    # Pin known patterns before handing the text to the tagger
    pinned = rules_engine.pin_text(text)
    sentence = Sentence(pinned)
    model.predict(sentence)
    tagged = sentence.to_dict(tag_type="ner")

    candidates = []
    for span in tagged.get('entities'):
        first_label = span.get('labels')[0]
        candidates.append({
            "tagged_sentence": tagged.get('text'),
            "label": first_label.value,
            "entity": span.get('text'),
            "confidence": first_label.score,
        })

    # Nothing tagged: report the placeholder "None" label
    if not candidates:
        return {
            "label": "None", "entity": "",
            "confidence": 0.0, "tagged_sentence": tagged.get('text')
        }

    # Stable ascending sort: the best candidate ends up last, and among
    # equal confidences the entity that appeared last in the text wins
    candidates.sort(key=lambda cand: cand['confidence'])
    return candidates[-1]

# Example invocation on a single sentence that contains a phone number
sample_text = "Accept ready address firm continue phone camera. Discussion party party one. Worker voice foreign general everything yeah travel. Experience +1-821-995-3406x094 through somebody benefit."
get_predictions(sample_text, engine, model)

{'tagged_sentence': 'Accept ready address firm continue phone camera. Discussion party party one. Worker voice foreign general everything yeah travel. Experience ppppp +1-821-995-3406x094 hhhhh through somebody benefit.',
 'label': 'Phone_number',
 'entity': '1-821-995-3406x094',
 'confidence': 0.9986434578895569}

In [4]:
# Load the eval data: skip the first row of the sheet and read only
# spreadsheet columns A-C; cells containing 'NA' are treated as missing
eval_data = pd.read_excel(
    "../data/PII_Train_Large_Data_Test_Data.xlsx",
    sheet_name="PII Test Data - PII Test Data",
    skiprows=1,
    index_col=None,
    na_values=['NA'],
    usecols="A,B,C",
)
eval_data.head()

Unnamed: 0,Text,Label,PII
0,Term although process suddenly parent. Poor go...,,
1,"356 Collins Highway New Kathleen, NM 10160 Rem...",,
2,Appear job opportunity job. Piece 405 Callahan...,,
3,During half leave simple west lose piece 859 D...,,
4,Peace when Apt. 910 enter left speak agree. Le...,,


In [None]:
# Perform inference: build one record per eval row
predictions = []
for index, row in eval_data.iterrows():
    pred = get_predictions(row['Text'], engine, model)
    # Keep the source text in every record. The export cell selects the
    # 'Text', 'Label' and 'PII' columns, and pandas raises a KeyError when
    # a requested column is missing from the frame.
    predictions.append(
        {
            "Text": row['Text'],
            "Label": pred['label'],
            "PII": pred['entity']
        })

In [None]:
# Turn the collected records into a frame and write the selected columns,
# in this order, to the predictions workbook
predictions_df = pd.DataFrame(predictions)
predictions_df.to_excel("../data/PII_Predictions_v1.xlsx", columns=['Text','Label','PII'])