# FLANER model evaluation

### Config

In [1]:
CHECKPOINT_PATH = '/home/ubuntu/gil/flanner_trainer/models/v3/batch_size_64/checkpoint-897'
TEST_LABELED_DATA_PATH = '/home/ubuntu/gil/flanner_trainer/val_prompt_dataset_final_corrected15.csv'
TEST_SET_PREDICTIONS_PATH = '/home/ubuntu/gil/flanner_trainer/'
TASK_PREFIX = 'ner_last_name:'

In [2]:
DATA_CLASSES_TAGS = {
    "LN": "last_name"
}

GROUND_TRUTH_DATA_CLASSES_TAGS = {
    "last_name": "last_name"  

}

In [3]:
ENTITY_DELIMITER = " #### "
VALUES_DELIMITER = ","

In [4]:
from model_config import *

### Dependencies

In [5]:
import pandas as pd
import numpy as np
import torch
import re
from transformers import pipeline
from sklearn.metrics import classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from torch.utils.data import Dataset
from tqdm.auto import tqdm

class ListDataset(Dataset):
    """
    https://discuss.huggingface.co/t/progress-bar-for-hf-pipelines/20498/2
    """
    def __init__(self, original_list):
        self.original_list = original_list
    def __len__(self):
        return len(self.original_list)
    def __getitem__(self, i):
        return self.original_list[i]

# Model Evaluation

### Load model

In [7]:
# need to move the tokenizer before
model_pipeline = pipeline(
    task='text2text-generation',
    model=CHECKPOINT_PATH,
    tokenizer=CHECKPOINT_PATH,
    device=0 if torch.cuda.is_available() else -1,
    torch_dtype=torch.bfloat16
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.79s/it]


### Load test dataset

In [8]:
df = pd.read_csv(TEST_LABELED_DATA_PATH)
df.shape

(1196, 29)

In [9]:
df.sample(5)

Unnamed: 0,ROW_NUM,TOTAL_TENANTS,annotation_id,annotator,context,created_at,fc_uid,id,industry,isMock,...,tokenization,updated_at,useDataForResearch,verified,vmc_uid,full_name,first_name,last_name,maiden_name,middle_name
320,4,160,47150,25,15829892\nDate of issue: 02/01/2021\nSeller: C...,2024-10-28T14:46:30.457792Z,c6526920-38b7-40c7-80a4-dfb2aca50a24,144921,Other,False,...,Plain,2024-10-28T14:46:30.457812Z,True,,518ce260-a56d-4fb2-b393-f26102e021fa,[],[],[],[],[]
611,3,160,43048,25,BLVD|||ORCHARD PARK|NY|US|14127|7166626500|Fac...,2024-10-14T09:01:21.979874Z,147862f2-501e-487c-a18a-2444a4a09099,139701,Energy,False,...,Plain,2024-10-14T09:01:21.979896Z,True,,72250f71-5178-4dca-b431-285906c86653,[],[],[],[],[]
690,2,160,43190,25,Complete:\nPS Material Request #:\n\n\n\nPS Re...,2024-10-14T09:40:42.105803Z,f98e1c15-c40b-4caf-ac24-759b42a50785,139823,Consumer,False,...,Plain,2024-10-14T09:40:42.105832Z,True,,c6110f71-73d0-4d18-b210-96d70ce6e8c9,['Sampath Kumar'],[],[],[],[]
29,5,160,46858,25,"162,051.35 78,207.00 1,266,811.15 ...",2024-10-28T11:50:35.565295Z,d0f76fd1-089d-4fa6-bdaa-5047e5308c29,144629,Insurance,False,...,Plain,2024-10-28T11:50:35.565323Z,True,,14b33344-5350-414c-94ba-daea6e57640d,"['Lombardi, Cindy']",[],[],[],[]
656,1,192,43125,25,Núm. 736\n\n\nISBN: 978-970-10-7027-7\n\n\nTra...,2024-10-14T09:18:18.588750Z,d0d8a3cb-a192-4081-9ee8-da32c4d3b0dd,139765,Call Centers & Business Centers,False,...,Plain,2024-10-14T09:18:18.588773Z,True,,f210700a-62aa-4418-bc15-4e27db20953b,[],[],[],[],[]


In [10]:
CONTROL_CHARS_PATTERN = re.compile("[\u0000-\u0008\u000b\u000c\u000e-\u001f\u007f-\u00a0]+")

def clean_text(context):
    context = str(context)
    context = context.replace('\n', ' ')
    context = context.replace('\t', ' ')
    context = context.replace('\\r\\n', ' ')
    context = CONTROL_CHARS_PATTERN.sub(' ', context).strip()
    context = re.sub(' +', ' ', context)
    return context


In [11]:
# text has been cleaned prior to data tagging

df['clean_context'] = df['context'].apply(lambda x: clean_text(x))

In [12]:
model_inputs = df['clean_context'].to_list()

In [13]:
model_inputs[0:5]

['Client: Hall-Johnston Walker, Rojas and Wells',
 'C12004000197OP010412Brooklyn NY2131WVitaliy Gavrylyuk WCLVitaliy Gavrylyuk 11218X',
 'Sent: Tuesday, April 01, 2014 10:32 AM To: OGrady.Elizabeth; Agencyauthos',
 'Statement Period Feb 1 - Feb 28, 2021 Relationship Manager Cabrera,Ana',
 'this proponent again? Yes, currently working with Ellis Don on construction a $370+M DBF']

In [14]:
# add task prefix

model_inputs = [TASK_PREFIX+x for x in model_inputs]

In [15]:
model_inputs[0:5]

['ner_last_name:Client: Hall-Johnston Walker, Rojas and Wells',
 'ner_last_name:C12004000197OP010412Brooklyn NY2131WVitaliy Gavrylyuk WCLVitaliy Gavrylyuk 11218X',
 'ner_last_name:Sent: Tuesday, April 01, 2014 10:32 AM To: OGrady.Elizabeth; Agencyauthos',
 'ner_last_name:Statement Period Feb 1 - Feb 28, 2021 Relationship Manager Cabrera,Ana',
 'ner_last_name:this proponent again? Yes, currently working with Ellis Don on construction a $370+M DBF']

### Run text generation on test set

In [16]:
BATCH_SIZE = 32
GENERATION_MAX_LEN = 100

In [17]:
# convert to ListDataset to see progress bar with tqdm
dataset = ListDataset(model_inputs)

In [18]:
# model_outputs = model_pipeline(
#     model_inputs,
#     batch_size=BATCH_SIZE,
#     max_length=GENERATION_MAX_LEN
# )

model_outputs = []
for out in tqdm(model_pipeline(dataset, batch_size=BATCH_SIZE, max_new_tokens=GENERATION_MAX_LEN)):
     model_outputs.append(out[0])

1196it [00:48, 24.54it/s]                      


In [19]:
df['model_generated_text'] = [output['generated_text'] for output in model_outputs]

### Parse model output

In [20]:
# this is the function you should use instead of the one in the notebook
def model_output_str_to_dict(label_str: str) -> dict:

    entity_dict = dict()

    if label_str in (['[]', np.nan]):
        return entity_dict
    # split classes
    classes_list = label_str.split(ENTITY_DELIMITER)
    # parse each class to a dictionary: {class name: [class values]}
    for class_str in classes_list:
        try:
            key, value = class_str.split(":", 1)
            if value.strip() == "": 
                value = "[]"
            if value.strip()[-1] != "]":
                value = value.replace(ENTITY_DELIMITER.strip(), " ").strip() 
                if value[-1] != "]":
                    value = value + "]"
            entity_dict[key.strip()] = eval(value.strip())
        except:
            print(label_str)
            continue
    return entity_dict


In [21]:
df['entity_dict'] = df['model_generated_text'].apply(
    lambda label_str: model_output_str_to_dict(label_str))


LN:['Haralambous', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitchcock', 'Hitch


In [23]:
df.sample(10)[['model_generated_text', 'entity_dict']]

Unnamed: 0,model_generated_text,entity_dict
607,LN:['Fienup'],{'LN': ['Fienup']}
1091,LN:[],{'LN': []}
190,"LN:['Roth', 'Rogers', 'Meadows', 'Durham', 'Ba...","{'LN': ['Roth', 'Rogers', 'Meadows', 'Durham',..."
266,LN:[],{'LN': []}
1117,LN:[],{'LN': []}
58,LN:[],{'LN': []}
636,LN:[],{'LN': []}
1100,LN:[],{'LN': []}
869,LN:[],{'LN': []}
1189,"LN:['Kirchner', 'Syed']","{'LN': ['Kirchner', 'Syed']}"


In [24]:
df[df['entity_dict'] != {}][['model_generated_text', 'entity_dict']]

Unnamed: 0,model_generated_text,entity_dict
0,"LN:['Walker', 'Rojas', 'Wells']","{'LN': ['Walker', 'Rojas', 'Wells']}"
1,LN:[],{'LN': []}
2,LN:[],{'LN': []}
3,LN:['Cabrera'],{'LN': ['Cabrera']}
4,LN:[],{'LN': []}
...,...,...
1191,LN:[],{'LN': []}
1192,LN:[],{'LN': []}
1193,LN:[],{'LN': []}
1194,LN:[],{'LN': []}


In [25]:
DATA_CLASSES_TAGS

{'LN': 'last_name'}

In [26]:
# create columns per class with matched text

def parse_classes_matches_from_dict(
    df: pd.DataFrame,
    data_classes: list = DATA_CLASSES_TAGS.keys(),
) -> pd.DataFrame:

    for data_class in data_classes:
        df[f'{data_class}_model'] = df['entity_dict'].apply(lambda d: d.get(data_class, np.nan))

    return df


In [27]:
df = parse_classes_matches_from_dict(
    df=df,
    data_classes=DATA_CLASSES_TAGS.keys(),
)


In [29]:
df.head(5)

Unnamed: 0,ROW_NUM,TOTAL_TENANTS,annotation_id,annotator,context,created_at,fc_uid,id,industry,isMock,...,vmc_uid,full_name,first_name,last_name,maiden_name,middle_name,clean_context,model_generated_text,entity_dict,LN_model
0,1,160,46829,25,Client:\n Hal...,2024-10-28T11:25:12.139476Z,1630c088-5024-440b-a90b-c58dfd29dd21,144600,Software & Internet,False,...,38761ba2-f72a-4831-bdb0-ffbb20a22ba4,[],[],"['Hall-Johnston', 'Walker', 'Rojas', 'Wells']",[],[],"Client: Hall-Johnston Walker, Rojas and Wells","LN:['Walker', 'Rojas', 'Wells']","{'LN': ['Walker', 'Rojas', 'Wells']}","[Walker, Rojas, Wells]"
1,2,160,46830,25,C12004000197OP010412Brooklyn NY...,2024-10-28T11:26:33.361343Z,d2e4e0a0-bd86-4873-851b-4a43984e48d3,144601,Software & Internet,False,...,c9c71dee-82e7-4248-9aed-e0be4d1e46ee,['Vitaliy Gavrylyuk'],[],[],[],[],C12004000197OP010412Brooklyn NY2131WVitaliy Ga...,LN:[],{'LN': []},[]
2,1,160,46831,25,"Sent: Tuesday, April 01...",2024-10-28T11:27:40.128534Z,88bb02e1-2751-43e6-aa6a-12850225cab4,144602,Data Collection & Internet Portals,False,...,31c79119-b5f3-43b6-b411-cac2a20ed509,['OGrady.Elizabeth'],[],[],[],[],"Sent: Tuesday, April 01, 2014 10:32 AM To: OGr...",LN:[],{'LN': []},[]
3,1,160,46832,25,"Statement Period\nFeb 1 - Feb 28, 2021\nRelati...",2024-10-28T11:27:55.355294Z,87439ba6-cd05-4507-aed9-7300efe40709,144603,professional training coaching,False,...,93a172c4-b5a8-4409-9010-230b3ac80799,"['Cabrera,Ana']",[],[],[],[],"Statement Period Feb 1 - Feb 28, 2021 Relation...",LN:['Cabrera'],{'LN': ['Cabrera']},[Cabrera]
4,2,192,46833,25,"this proponent again?\n\nYes, currently workin...",2024-10-28T11:28:04.388250Z,f6263e4c-4d86-4cd0-8304-3d16e9f028c3,144604,Healthcare,False,...,da4defd2-c747-4614-915b-ec31caa0a0f4,[],[],[],[],[],"this proponent again? Yes, currently working w...",LN:[],{'LN': []},[]


In [38]:
# replace nan values
for data_class in DATA_CLASSES_TAGS.keys():
    df[data_class + "_model"] = df[data_class + "_model"].apply(lambda x: x if isinstance(x, list) else [])


In [69]:
def add_entities_counts(
    df: pd.DataFrame,
    data_classes: list = DATA_CLASSES_TAGS.keys(),
    col_suffix = "_model",
    entity_count_suffix="_model"
) -> pd.DataFrame:
    
    # count each entity's appearances
    for data_class in data_classes:
        df[f'{data_class}_count{col_suffix}'] = df[f'{data_class}{col_suffix}'].apply(lambda l: len(set(l)) if isinstance(l, list) else len(eval(l)))
        
    # count total entities
    class_count_cols = [f'{data_class}_count{col_suffix}' for data_class in data_classes]
    df[f'entity_count{entity_count_suffix}'] = df[class_count_cols].sum(axis=1)
    df[f'entity_types_count{entity_count_suffix}'] = df[class_count_cols].clip(upper=1).sum(axis=1)

    return df
    

In [70]:
df = add_entities_counts(
    df=df,
    data_classes=DATA_CLASSES_TAGS.keys(),
)

In [47]:
df = add_entities_counts(
    df=df,
    data_classes=GROUND_TRUTH_DATA_CLASSES_TAGS.keys(),
    col_suffix="",
    entity_count_suffix=""
)

In [71]:
df[(df['LN_count_model'] == 0) & (df['last_name_count'] > 0)][
    ['clean_context', 'last_name', 'LN_model', 'LN_count_model', 'last_name_count']
]

Unnamed: 0,clean_context,last_name,LN_model,LN_count_model,last_name_count
37,Jones-Beck 38147 Robinson Knoll Suite 402 4479...,['Jones-Beck'],[],0,1
45,Invoice no: 30663418 Date of issue: 06/29/2019...,"['Wilson', 'Wright', 'Villarreal']",[],0,3
53,Rosenberg $125.00 $125.00 Organization Aditya ...,['Rosenberg'],[],0,1
66,SEED_EMAIL=ckinsella@tines.io SEED_FIRST_NAME=...,['Kinsella'],[],0,1
89,"""ParentAffiliate"": null, ""FirstName"": ""Emily"",...",['Daelo'],[],0,1
92,Extended-B 1E800..1E8DF; Mende Kikakui 1E900.....,['Adlam'],[],0,1
200,"Haralambous, Greg Hitchcock,\r Jeremie Hornus,...",['Haralambous'],[],0,1
259,Welcome to the Superhero Talent Agency Family!...,['Epic'],[],0,1
262,Gross and Rivera 286 Marilyn Crescent 299 Mega...,"['Gross', 'Rivera']",[],0,2
263,02-01-2018 11-01-2018 CHRIST MEDICAL (PART B) ...,['Rice'],[],0,1


### Evaluation per class

In [76]:
df.sort_values([f'{data_class}_count_model'], ascending=[False]).head(2)

Unnamed: 0,ROW_NUM,TOTAL_TENANTS,annotation_id,annotator,context,created_at,fc_uid,id,industry,isMock,...,clean_context,model_generated_text,entity_dict,LN_model,LN_count_model,entity_count_model,entity_types_count_model,last_name_count,entity_count,entity_types_count
110,5,160,46939,25,"From: Weinand, Mariane\r\nSent: Fri Apr 19 01:...",2024-10-28T12:23:53.105909Z,b127cd0b-9162-4138-9896-2985f9a5b354,144710,Insurance,False,...,"From: Weinand, Mariane\r Sent: Fri Apr 19 01:0...","LN:['Weinand', 'Lee', 'Rowe', 'Danzis', 'Nilfo...","{'LN': ['Weinand', 'Lee', 'Rowe', 'Danzis', 'N...","[Weinand, Lee, Rowe, Danzis, Nilforoshan, Corc...",7,7,1,0,0,0
395,3,160,47225,25,http://www.smart-cities.eu/model.html. 05/09/2...,2024-10-28T15:03:59.793323Z,b926d496-4a15-4df3-87ef-ccedced3d26c,144996,Software & Internet,False,...,http://www.smart-cities.eu/model.html. 05/09/2...,"LN:['Al-Hader', 'Rodzi', 'Sharif', 'Ahmad', 'S...","{'LN': ['Al-Hader', 'Rodzi', 'Sharif', 'Ahmad'...","[Al-Hader, Rodzi, Sharif, Ahmad, Smart Fan, Ha...",7,7,1,7,7,1


In [77]:
for data_class in GROUND_TRUTH_DATA_CLASSES_TAGS.keys():
    df[f'{data_class}_count_any'] = df[f'{data_class}_count'].apply(lambda x: 1 if x > 0 else x)
    if max(df[f'{data_class}_count_any'].tolist()) != 1:
        raise ValueError(f"Any count for {data_class} is not correct")

In [78]:
for model_data_class in DATA_CLASSES_TAGS.keys():
    df[f'{model_data_class}_count_model_any'] = df[f'{model_data_class}_count_model'].apply(lambda x: 1 if x > 0 else x)
    if max(df[f'{model_data_class}_count_model_any'].tolist()) != 1:
        raise ValueError(f"Any count for {model_data_class} is not correct")

In [80]:
data_class = "LN"

In [81]:
print(classification_report(
    y_true=df[f'{DATA_CLASSES_TAGS[data_class]}_count_any'], 
    y_pred=df[f'{data_class}_count_model_any']
))

              precision    recall  f1-score   support

           0       0.97      0.78      0.87      1095
           1       0.24      0.74      0.36       101

    accuracy                           0.78      1196
   macro avg       0.61      0.76      0.61      1196
weighted avg       0.91      0.78      0.82      1196

