# Set up

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell execution, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import json

In [3]:
# Put the parent directory first on the module search path.
# Presumably this is what lets the `src.*` imports further down resolve;
# it assumes the notebook's working directory sits one level below the project root.
import sys
sys.path.insert(0, '..')

# Constants

In [4]:
# Version tag interpolated into the LLM output file names used in this notebook.
# NOTE(review): looks like a YYYYMMDDhhmmss timestamp of the extraction run — confirm.
VERSION = '20240706095040'

# Load holdout Label Studio data

In [5]:
# Path to the holdout annotations exported from Label Studio (JSON).
HOLDOUT_LABEL_STUDIO_FP = '../data/output/llm_extract_output_holdout_label_studio.json'
# Read the file explicitly as UTF-8: JSON is UTF-8 by specification, while open()
# otherwise falls back to the platform locale encoding (e.g. cp1252 on Windows),
# which can mis-decode or fail on non-ASCII review text.
with open(HOLDOUT_LABEL_STUDIO_FP, 'r', encoding='utf-8') as f:
    holdout_raw = json.load(f)

In [6]:
# Preview the first five loaded holdout records.
holdout_raw[:5]

[{'id': 996,
  'annotations': [{'id': 15,
    'completed_by': 1,
    'result': [{'value': {'start': 15,
       'end': 51,
       'text': 'looks out over beautiful green lawns',
       'labels': ['VIEW']},
      'id': '780b56b2-376f-4afb-b7ac-df001a0ae76e',
      'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'origin': 'prediction'},
     {'value': {'start': 59,
       'end': 97,
       'text': 'Hudson River and the Statue of Liberty',
       'labels': ['VIEW']},
      'id': 'bc96d157-1925-4b9a-8d5b-7a420b4fede2',
      'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'origin': 'prediction'}],
    'was_cancelled': False,
    'ground_truth': False,
    'created_at': '2024-07-09T03:42:06.515569Z',
    'updated_at': '2024-07-09T03:42:06.515591Z',
    'draft_created_at': None,
    'lead_time': 25.821,
    'prediction': {'id': 3091,
     'model_version': 'one',
     'created_ago': '0\xa0minutes',
     'result': [{'id': '780b56b2-376f-

In [7]:
# Inspect the first holdout record on its own to see its full structure.
holdout_raw[0]

{'id': 996,
 'annotations': [{'id': 15,
   'completed_by': 1,
   'result': [{'value': {'start': 15,
      'end': 51,
      'text': 'looks out over beautiful green lawns',
      'labels': ['VIEW']},
     'id': '780b56b2-376f-4afb-b7ac-df001a0ae76e',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'labels',
     'origin': 'prediction'},
    {'value': {'start': 59,
      'end': 97,
      'text': 'Hudson River and the Statue of Liberty',
      'labels': ['VIEW']},
     'id': 'bc96d157-1925-4b9a-8d5b-7a420b4fede2',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'labels',
     'origin': 'prediction'}],
   'was_cancelled': False,
   'ground_truth': False,
   'created_at': '2024-07-09T03:42:06.515569Z',
   'updated_at': '2024-07-09T03:42:06.515591Z',
   'draft_created_at': None,
   'lead_time': 25.821,
   'prediction': {'id': 3091,
    'model_version': 'one',
    'created_ago': '0\xa0minutes',
    'result': [{'id': '780b56b2-376f-4afb-b7ac-df001a0ae76e',
      

# Load all data

In [10]:
# Path to the LLM extraction output for the configured VERSION (JSON).
LLM_OUTPUT_REWRITE_FP = f'../data/output/llm_extract_output_{VERSION}_rewrited.json'

# Read the file explicitly as UTF-8 rather than relying on the platform's locale
# default encoding, so the load behaves the same on every OS.
with open(LLM_OUTPUT_REWRITE_FP, 'r', encoding='utf-8') as f:
    llm_output = json.load(f)

In [11]:
# Flatten the batched output into one list of items. Every record is a mapping whose
# values are the individual items of a single LLM prompt (10 inputs per prompt,
# per the original note), so collect all values across all records.
llm_output_raw = [item for record in llm_output for item in record.values()]

In [12]:
# Preview the first five flattened LLM output items.
llm_output_raw[:5]

[{'text': 'The service is not consistently excellent -- just decent.',
  'entities': [['service is not consistently excellent', 'SERVICE', 0.4, -0.3],
   ['just decent', 'SERVICE', 0.5, 0.2]]},
 {'text': "I went with 5 friends and we lingered at the table for a bit and didn't feel rushed at all even though there was a wait.",
  'entities': [["didn't feel rushed", 'AMBIENCE', 0.6, 0.4]]},
 {'text': 'Food was very good as well, considering that we tried the budget selection (though I wish the pork belly that I ordered was roasted a bit longer, so that fat was more of a melt-in-your-mouth experience).',
  'entities': [['food was very good', 'FOOD', 0.7, 0.5],
   ['pork belly that I ordered', 'FOOD', 0.6, -0.2]]},
 {'text': 'It is about FOOD and Ambiance, and imagine how dreadful it will be if we only had to listen to an idle engine.',
  'entities': [['about FOOD and Ambiance', 'AMBIENCE', 0.5, 0.3],
   ['listen to an idle engine', 'AMBIENCE', 0.4, -0.6]]},
 {'text': 'The bartender on my m

# Convert holdout Label Studio format to spaCy JSON format

In [13]:
from src.convert_ner_format.label_studio_to_spacy import convert_label_studio_to_spacy_format

In [14]:
# Convert the raw Label Studio export with the project's converter.
# NOTE(review): the preview of the result shows overlapping spans on one text
# (e.g. [10, 33, 'PRICE'] and [10, 80, 'FOOD']); spaCy's `doc.ents` does not accept
# overlapping entities — confirm how downstream training handles these.
holdout_spacy = convert_label_studio_to_spacy_format(holdout_raw)

In [15]:
# Preview the first five converted holdout records.
holdout_spacy[:5]

[{'id': '53745975096161334446695170010345366844',
  'text': 'The restaurant looks out over beautiful green lawns to the Hudson River and the Statue of Liberty.',
  'label': [[15, 51, 'VIEW'], [59, 97, 'VIEW']],
  'Comments': []},
 {'id': '14008356808251230170256576349676573112',
  'text': "Also, the sandwiches (nearing $7) didn't come with anything like chips or a side.",
  'label': [[10, 33, 'PRICE'], [10, 80, 'FOOD']],
  'Comments': []},
 {'id': '196846222781357184059657962635349503302',
  'text': 'Luckily we saved room for the BBQ Salmon, Sea Bass and Crispy Duck.',
  'label': [[30, 66, 'FOOD']],
  'Comments': []},
 {'id': '122902927917699489714278789756481778575',
  'text': "I've been to Naples 45 for dinner twice.",
  'label': [[0, 39, 'SERVICE']],
  'Comments': []},
 {'id': '209028716300947491053334478581226250868',
  'text': 'Haru serves very fresh fish, has a trendy, modern ambiance, prime location on Park Avenue South and friendly service.',
  'label': [[5, 27, 'FOOD'],
   [35

In [16]:
# Save the converted holdout set to disk as JSON.
holdout_spacy_fp = "../data/output/llm_extract_output_holdout_spacy.json"
with open(holdout_spacy_fp, mode="w") as f:
    json.dump(obj=holdout_spacy, fp=f)

# Convert LLM output to spaCy JSON format

In [17]:
from src.convert_ner_format.llm_output_to_spacy import convert_from_llm_output_to_spacy

In [18]:
# Convert the flattened LLM items with the project's converter.
# NOTE(review): comparing the previews, the entity 'food was very good' (lowercase)
# has no span in the converted record whose text starts with 'Food was very good'.
# The converter appears to match entity text case-sensitively and drop misses —
# confirm this is intended, since it silently loses labels.
llm_output_spacy = convert_from_llm_output_to_spacy(llm_output_raw)

2024-07-09 10:57:51.400 | INFO     | src.utils.time.timer:timed:23 - convert_from_llm_output_to_spacy runtime: 0.007s


In [19]:
# Number of records produced by the conversion.
len(llm_output_spacy)

1998

In [20]:
# Preview the first five converted LLM records.
llm_output_spacy[:5]

[{'id': '251698800132815913556804367643027318268',
  'text': 'The service is not consistently excellent -- just decent.',
  'label': [[4, 41, 'SERVICE'], [45, 56, 'SERVICE']],
  'Comments': []},
 {'id': '220536303835705033511694647871179001291',
  'text': "I went with 5 friends and we lingered at the table for a bit and didn't feel rushed at all even though there was a wait.",
  'label': [[65, 83, 'AMBIENCE']],
  'Comments': []},
 {'id': '159991645213750204725390416162905872638',
  'text': 'Food was very good as well, considering that we tried the budget selection (though I wish the pork belly that I ordered was roasted a bit longer, so that fat was more of a melt-in-your-mouth experience).',
  'label': [[94, 119, 'FOOD']],
  'Comments': []},
 {'id': '129042515009119996187744609637264735274',
  'text': 'It is about FOOD and Ambiance, and imagine how dreadful it will be if we only had to listen to an idle engine.',
  'label': [[6, 29, 'AMBIENCE'], [85, 109, 'AMBIENCE']],
  'Comments': [

In [21]:
# Save the converted LLM output to disk as JSON, tagged with the run VERSION.
llm_output_spacy_fp = f"../data/output/llm_extract_output_{VERSION}_spacy.json"
with open(llm_output_spacy_fp, mode="w") as f:
    json.dump(obj=llm_output_spacy, fp=f)