# Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json

In [3]:
import sys
# Put the parent directory first on the module search path; presumably this is
# what lets the project-local `src` package imported in later cells resolve.
sys.path.insert(0, '..')

from loguru import logger

# Constants

In [4]:
# Identifier of the LLM extraction run to load; it is interpolated into the
# output file name in the "Load data" section.
# NOTE(review): looks like a YYYYMMDDHHMMSS timestamp — confirm.
VERSION = '20240706095040'

# Load data

In [5]:
# LLM extraction output for the run selected by VERSION.
# The file name suggests spaCy-style span annotations; the records are later
# indexed by their 'text' key.
llm_output_spacy_fp = f"../data/output/llm_extract_output_{VERSION}_spacy.json"
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode (or fail on) non-ASCII review text.
with open(llm_output_spacy_fp, "r", encoding="utf-8") as f:
    llm_output = json.load(f)

In [6]:
# Holdout split: records that must be kept out of the LLM output used downstream.
holdout_spacy_fp = "../data/output/llm_extract_output_holdout_spacy.json"
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode (or fail on) non-ASCII review text.
with open(holdout_spacy_fp, "r", encoding="utf-8") as f:
    holdout = json.load(f)

# Exclude holdout examples from the LLM output

In [7]:
# Display the loaded holdout records for a quick visual check.
# NOTE(review): this renders the whole list; a slice such as `holdout[:3]`
# would keep the saved output short.
holdout

[{'id': '53745975096161334446695170010345366844',
  'text': 'The restaurant looks out over beautiful green lawns to the Hudson River and the Statue of Liberty.',
  'label': [[15, 51, 'VIEW'], [59, 97, 'VIEW']],
  'Comments': []},
 {'id': '14008356808251230170256576349676573112',
  'text': "Also, the sandwiches (nearing $7) didn't come with anything like chips or a side.",
  'label': [[10, 33, 'PRICE'], [10, 80, 'FOOD']],
  'Comments': []},
 {'id': '196846222781357184059657962635349503302',
  'text': 'Luckily we saved room for the BBQ Salmon, Sea Bass and Crispy Duck.',
  'label': [[30, 66, 'FOOD']],
  'Comments': []},
 {'id': '122902927917699489714278789756481778575',
  'text': "I've been to Naples 45 for dinner twice.",
  'label': [[0, 39, 'SERVICE']],
  'Comments': []},
 {'id': '209028716300947491053334478581226250868',
  'text': 'Haru serves very fresh fish, has a trendy, modern ambiance, prime location on Park Avenue South and friendly service.',
  'label': [[5, 27, 'FOOD'],
   [35

In [8]:
# Texts of the holdout records; a set gives constant-time membership checks
# in the filter below (built directly, without an intermediate list).
holdout_texts = {e['text'] for e in holdout}

# Keep only the LLM-output records whose text is not part of the holdout.
llm_output_exc = [e for e in llm_output if e['text'] not in holdout_texts]

# Sanity check: exactly one LLM-output record should have been dropped per
# holdout record. A mismatch means a holdout text is missing from the LLM
# output or a holdout text occurs more than once on either side.
n_removed = len(llm_output) - len(llm_output_exc)
assert n_removed == len(holdout), (
    f"removed {n_removed} records from llm_output but holdout has "
    f"{len(holdout)} records ({len(holdout_texts)} unique texts)"
)

# Convert to IOB2

In [9]:
from src.convert_ner_format.spacy_to_iob2 import convert_from_spacy_to_iob2

In [10]:
# Run the spaCy -> IOB2 converter over both splits, holdout first and then the
# holdout-excluded LLM output, and unpack the two results in that order.
holdout_iob2, llm_output_exc_iob2 = (
    convert_from_spacy_to_iob2(records)
    for records in (holdout, llm_output_exc)
)

2024-07-09 10:58:17.226 | INFO     | src.utils.time.timer:timed:23 - convert_from_spacy_to_iob2 runtime: 0.043s
2024-07-09 10:58:23.701 | INFO     | src.utils.time.timer:timed:23 - convert_from_spacy_to_iob2 runtime: 6.474s


In [11]:
# Log the length of each converted split as a quick size check.
logger.info(f"{len(holdout_iob2)=}, {len(llm_output_exc_iob2)=}")

2024-07-09 10:58:23.716 | INFO     | __main__:<module>:1 - len(holdout_iob2)=10, len(llm_output_exc_iob2)=1988


# Add metadata

In [12]:
from src.convert_ner_format.spacy_to_iob2 import add_metadata

In [13]:
# Pair each IOB2 result with the spaCy-format records it was converted from
# and pass both to add_metadata: holdout first, then the holdout-excluded
# LLM output.
holdout_iob2_conll, llm_output_exc_iob2_conll = (
    add_metadata(iob2_items, spacy_items)
    for iob2_items, spacy_items in (
        (holdout_iob2, holdout),
        (llm_output_exc_iob2, llm_output_exc),
    )
)

# Convert Tags to Int

In [14]:
from src.convert_ner_format.spacy_to_iob2 import build_ner_tags_label
from pprint import pprint

In [15]:
# Tag labels as returned by build_ner_tags_label for the holdout-excluded
# LLM output.
ner_tags_label = build_ner_tags_label(llm_output_exc_iob2_conll)

# Integer id -> tag, numbered by position in `ner_tags_label`.
ner_tags_label_mapper = dict(enumerate(ner_tags_label))
# Tag -> integer id: the inverse lookup, derived from the forward mapping.
ner_tags_label_reverse_mapper = {
    tag: idx for idx, tag in ner_tags_label_mapper.items()
}

print("ner_tags_label_mapper:")
pprint(ner_tags_label_mapper)

print("ner_tags_label_reverse_mapper:")
pprint(ner_tags_label_reverse_mapper)

ner_tags_label_mapper:
{0: 'O',
 1: 'B-AMBIENCE',
 2: 'I-AMBIENCE',
 3: 'B-BEVERAGE',
 4: 'I-BEVERAGE',
 5: 'B-FOOD',
 6: 'I-FOOD',
 7: 'B-LOCATION',
 8: 'I-LOCATION',
 9: 'B-OVERALL',
 10: 'I-OVERALL',
 11: 'B-PRICE',
 12: 'I-PRICE',
 13: 'B-SERVICE',
 14: 'I-SERVICE',
 15: 'B-STAFF',
 16: 'I-STAFF',
 17: 'B-VALUE',
 18: 'I-VALUE',
 19: 'B-VIEW',
 20: 'I-VIEW'}
ner_tags_label_reverse_mapper:
{'B-AMBIENCE': 1,
 'B-BEVERAGE': 3,
 'B-FOOD': 5,
 'B-LOCATION': 7,
 'B-OVERALL': 9,
 'B-PRICE': 11,
 'B-SERVICE': 13,
 'B-STAFF': 15,
 'B-VALUE': 17,
 'B-VIEW': 19,
 'I-AMBIENCE': 2,
 'I-BEVERAGE': 4,
 'I-FOOD': 6,
 'I-LOCATION': 8,
 'I-OVERALL': 10,
 'I-PRICE': 12,
 'I-SERVICE': 14,
 'I-STAFF': 16,
 'I-VALUE': 18,
 'I-VIEW': 20,
 'O': 0}
