# Set up

In [1]:
import json

# Load data

In [2]:
# Relative path of the JSON file holding the sampled results.
sample_persist_path = '../data/sampled_results_openai.json'
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# load does not depend on the platform's locale default.
with open(sample_persist_path, 'r', encoding='utf-8') as f:
    raw = json.load(f)

In [3]:
# Display the loaded records to inspect their structure.
raw

[{'text': 'The place was nice and calm.',
  'entities': [['place was nice and calm', 'AMBIENCE']]},
 {'text': 'Their sake martini is wonderful.',
  'entities': [['sake martini is wonderful', 'FOOD']]},
 {'text': 'Great for groups, great for a date, great for early brunch or a nightcap.',
  'entities': []},
 {'text': 'i recommend the thai popcorn :)',
  'entities': [['recommend the thai popcorn', 'FOOD']]},
 {'text': 'Most of the servers are very attentive, friendly and quite attractive.',
  'entities': [['servers are very attentive, friendly and quite attractive',
    'SERVICE']]},
 {'text': 'My boyfriend had the New England Chowder it was good but I think the award should go to the Lobster Bisque.',
  'entities': [['New England Chowder it was good', 'FOOD'],
   ['award should go to the Lobster Bisque', 'FOOD']]},
 {'text': 'They are not helpful in the least and will give you the grand run around so by the time the event date rolls around you will not only regret chosing this place, bu

# Convert to SpaCy JSON format

In [4]:
import hashlib

def deterministic_hash(text):
    """Return the MD5 digest of ``text`` as a non-negative integer.

    The string is encoded with ``str.encode()``'s default (UTF-8) and the
    16-byte digest is read as one big-endian number, so the same text always
    maps to the same value (unlike the built-in ``hash()``, which is salted
    per interpreter run for strings).
    """
    digest_bytes = hashlib.md5(text.encode()).digest()
    return int.from_bytes(digest_bytes, "big")

def convert_to_spacy_ner_format(data):
    """Turn records of entity substrings into records of character-offset spans.

    Args:
        data: Iterable of dicts, each with a ``'text'`` string and an
            ``'entities'`` sequence of ``[entity_text, entity_label]`` pairs.

    Returns:
        A list with one dict per input record, containing:
            ``"id"``: decimal string of ``deterministic_hash(text)`` (identical
                texts therefore share an id);
            ``"text"``: the original text;
            ``"label"``: list of ``[start, end, entity_label]`` where
                ``text[start:end] == entity_text`` (``end`` is exclusive);
            ``"Comments"``: always an empty list.

    Notes:
        Entities are located by exact, case-sensitive substring search.
        Entities whose text is empty or does not occur in ``text`` are dropped
        without any error. When the same entity text is listed several times
        in one record, each listing is matched to the next non-overlapping
        occurrence; once no later occurrence exists the search wraps back to
        the first one.
    """
    spacy_data = []
    for item in data:
        text = item['text']
        labels = []
        # entity_text -> offset just past its most recently used occurrence,
        # so a repeated entity text does not map onto the same span again.
        resume_from = {}
        for entity_text, entity_label in item['entities']:
            if not entity_text:
                # ''.find-style matches succeed at index 0 and would produce a
                # meaningless zero-length span.
                continue
            start_idx = text.find(entity_text, resume_from.get(entity_text, 0))
            if start_idx == -1:
                # No further occurrence: fall back to the first one, if any.
                start_idx = text.find(entity_text)
            if start_idx == -1:
                continue
            end_idx = start_idx + len(entity_text)
            resume_from[entity_text] = end_idx
            labels.append([start_idx, end_idx, entity_label])
        spacy_item = {
            "id": str(deterministic_hash(text)),
            "text": text,
            "label": labels,
            "Comments": []
        }
        spacy_data.append(spacy_item)
    return spacy_data

# Two hand-written records used to sanity-check the converter
data = [
    {
        'text': 'The place was nice and calm.',
        'entities': [['place was nice and calm', 'AMBIENCE']],
    },
    {
        'text': 'Their sake martini is wonderful.',
        'entities': [['sake martini is wonderful', 'FOOD']],
    },
]

# Run the conversion and show the resulting records
spacy_data = convert_to_spacy_ner_format(data)
print(spacy_data)

[{'id': '339593553392192228390882582566160737164', 'text': 'The place was nice and calm.', 'label': [[4, 27, 'AMBIENCE']], 'Comments': []}, {'id': '114101779017082109603328579450586119271', 'text': 'Their sake martini is wonderful.', 'label': [[6, 31, 'FOOD']], 'Comments': []}]


In [5]:
# Convert every loaded record into the offset-based format.
converted = convert_to_spacy_ner_format(raw)

In [6]:
# Display the converted records to check the computed spans.
converted

[{'id': '339593553392192228390882582566160737164',
  'text': 'The place was nice and calm.',
  'label': [[4, 27, 'AMBIENCE']],
  'Comments': []},
 {'id': '114101779017082109603328579450586119271',
  'text': 'Their sake martini is wonderful.',
  'label': [[6, 31, 'FOOD']],
  'Comments': []},
 {'id': '92175068642289365519888727785080877606',
  'text': 'Great for groups, great for a date, great for early brunch or a nightcap.',
  'label': [],
  'Comments': []},
 {'id': '275534048062756000231583449127142411586',
  'text': 'i recommend the thai popcorn :)',
  'label': [[2, 28, 'FOOD']],
  'Comments': []},
 {'id': '180532750388022609210582093855641757291',
  'text': 'Most of the servers are very attentive, friendly and quite attractive.',
  'label': [[12, 69, 'SERVICE']],
  'Comments': []},
 {'id': '29354025005625774664623066330713768130',
  'text': 'My boyfriend had the New England Chowder it was good but I think the award should go to the Lobster Bisque.',
  'label': [[21, 52, 'FOOD'], [69

# Save converted data as JSONL

In [7]:
def save_to_jsonl(data, filename):
    """Write ``data`` to ``filename`` in JSON Lines form.

    Each item of ``data`` is serialised with ``json.dumps`` and written on its
    own line, terminated by a newline. An existing file at ``filename`` is
    overwritten; an empty ``data`` produces an empty file.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in data)

In [8]:
# Relative path of the JSONL file that receives the converted records.
spacy_formatted_persist_path = '../data/sampled_results_openai_spacy_format.jsonl'
# Write one JSON object per line; an existing file at this path is overwritten.
save_to_jsonl(converted, spacy_formatted_persist_path)