# Set up

In [1]:
import json

# Load extraction results

In [2]:
LLM_ASPECTS_EXTRACT_FP = '../data/output/llm_extract_output_20240706095040.jsonl'

In [3]:
fp = LLM_ASPECTS_EXTRACT_FP
with open(fp, 'r') as f:
    results = [json.loads(jline) for jline in f.readlines()]

# Sample 100

In [6]:
import numpy as np
np.random.seed(41)

sampled = np.random.choice(results, 100)
sampled_extracted = [record['record']['extra']['llm_extracted'] for record in sampled]
sampled_extracted

[{'11': {'text': 'The restaurant looks out over beautiful green lawns to the Hudson River and the Statue of Liberty.',
   'entities': [['looks out over beautiful green lawns', 'VIEW', 0.7, 0.5],
    ['Hudson River and the Statue of Liberty', 'VIEW', 0.8, 0.6]]},
  '12': {'text': "Also, the sandwiches (nearing $7) didn't come with anything like chips or a side.",
   'entities': [["sandwiches didn't come with anything", 'FOOD', 0.5, -0.4],
    ['chips or a side', 'FOOD', 0.6, -0.3]]},
  '13': {'text': 'Luckily we saved room for the BBQ Salmon, Sea Bass and Crispy Duck.',
   'entities': [['BBQ Salmon, Sea Bass and Crispy Duck', 'FOOD', 0.8, 0.5]]},
  '14': {'text': "I've been to Naples 45 for dinner twice.",
   'entities': [['been to Naples 45 for dinner', 'SERVICE', 0.4, 0.2],
    ['twice', 'SERVICE', 0.3, 0.1]]},
  '15': {'text': 'Haru serves very fresh fish, has a trendy, modern ambiance, prime location on Park Avenue South and friendly service.',
   'entities': [['serves very fresh fi

# Persist sampled

In [10]:
PERSIST_FP = '../data/output/llm_extract_output_20240706095040_100_sample.json'

fp = PERSIST_FP
with open(fp, 'w') as f:
    json.dump(sampled_extracted, f)

# Convert data to Label Studio format

In [24]:
import json
import uuid

# Sample source format data
source_data = [
    {
        "11": {
            "text": "The restaurant looks out over beautiful green lawns to the Hudson River and the Statue of Liberty.",
            "entities": [
                [
                    "looks out over beautiful green lawns",
                    "VIEW",
                    0.7,
                    0.5
                ],
                [
                    "Hudson River and the Statue of Liberty",
                    "VIEW",
                    0.8,
                    0.6
                ]
            ]
        }
    }
]

def find_entity_positions(text, entity_text):
    start = text.find(entity_text)
    if start == -1:
        return None, None
    end = start + len(entity_text)
    return start, end

def convert_format(source_data):
    target_data = []
    
    for item in source_data:
        for key, value in item.items():
            text = value["text"]
            entities = value["entities"]
            predictions = []
            scores = []

            for entity in entities:
                entity_text, entity_label, entity_score1, entity_score2 = entity
                start, end = find_entity_positions(text, entity_text)
                
                if start is not None and end is not None:
                    prediction = {
                        "id": str(uuid.uuid4()),
                        "from_name": "label",
                        "to_name": "text",
                        "type": "labels",
                        "value": {
                            "start": start,
                            "end": end,
                            "score": entity_score1,
                            "text": entity_text,
                            "labels": [
                                entity_label
                            ]
                        }
                    }
                    predictions.append(prediction)
                    scores.append(entity_score1)

            avg_score = np.mean(scores) if scores else 0.0
            target_item = {
                "data": {
                    "text": text
                },
                "predictions": [
                    {
                        "model_version": "one",
                        "score": avg_score,
                        "result": predictions
                    }
                ]
            }
            target_data.append(target_item)
    
    return target_data

# Convert the source data
converted_data = convert_format(source_data)

# Print the converted data in target format
print(json.dumps(converted_data, indent=2))

[
  {
    "data": {
      "text": "The restaurant looks out over beautiful green lawns to the Hudson River and the Statue of Liberty."
    },
    "predictions": [
      {
        "model_version": "one",
        "score": 0.75,
        "result": [
          {
            "id": "6c6883c6-4fa5-406f-8545-f3bf67872aab",
            "from_name": "label",
            "to_name": "text",
            "type": "labels",
            "value": {
              "start": 15,
              "end": 51,
              "score": 0.7,
              "text": "looks out over beautiful green lawns",
              "labels": [
                "VIEW"
              ]
            }
          },
          {
            "id": "94c4938c-2af4-4a08-9391-da4fcfaf79bd",
            "from_name": "label",
            "to_name": "text",
            "type": "labels",
            "value": {
              "start": 59,
              "end": 97,
              "score": 0.8,
              "text": "Hudson River and the Statue of Liberty",


In [25]:
source_data = sampled_extracted

# Convert the source data
converted_data = convert_format(source_data)

In [26]:
len(converted_data)

994

# Persist sampled Label Studio data

In [27]:
PERSIST_FP = '../data/output/llm_extract_output_20240706095040_100_sample_label_studio.json'

fp = PERSIST_FP
with open(fp, 'w') as f:
    json.dump(converted_data, f, indent=2)