# Set up

# Test

In [1]:
import spacy
from spacy.tokens import Doc
from spacy.training import Example

# Toy annotation in spaCy's character-offset format: each entity is a
# [start, end) character span over `text` together with its label.
spacy_json = dict(
    text="Barack Obama was born in Hawaii.",
    ents=[
        dict(start=0, end=12, label="PERSON"),
        dict(start=25, end=31, label="GPE"),
    ],
)

# English pipeline used to split the text into tokens
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over the sample sentence
text = spacy_json["text"]
doc = nlp(text)

# One tag per token, all starting out as "outside any entity"
iob2_tags = ["O" for _ in doc]

# Function to assign IOB2 tags
def assign_iob2_tags(doc, spacy_json):
    """Convert character-offset entity annotations into per-token IOB2 tags.

    Args:
        doc: Sequence of tokens (e.g. a spaCy ``Doc``). Each token must expose
            ``idx``, its starting character offset in the text.
        spacy_json: Mapping with an ``'ents'`` list; every entry carries
            ``'start'``/``'end'`` character offsets (end exclusive) and a
            ``'label'``.

    Returns:
        A new list with one tag per token: ``"B-<label>"`` for the first token
        whose start offset lies inside an entity span, ``"I-<label>"`` for the
        remaining tokens of that span, and ``"O"`` everywhere else.
    """
    # Build the tags locally instead of mutating a module-level list, so the
    # result never depends on (or leaks into) state left behind by another cell.
    tags = ["O"] * len(doc)

    for ent in spacy_json['ents']:
        start_char, end_char, label = ent['start'], ent['end'], ent['label']

        inside_entity = False
        for position, token in enumerate(doc):
            if start_char <= token.idx < end_char:
                # IOB2 requires every entity to open with B-, even when the
                # annotated start offset does not coincide with a token start.
                prefix = "I" if inside_entity else "B"
                tags[position] = f"{prefix}-{label}"
                inside_entity = True

    return tags

# Tag the sample sentence
iob2_tags = assign_iob2_tags(doc, spacy_json)

# Show every token next to its tag, tab-separated
for tok, tok_tag in zip(doc, iob2_tags):
    print(tok.text, tok_tag, sep="\t")

Barack	B-PERSON
Obama	I-PERSON
was	O
born	O
in	O
Hawaii	B-GPE
.	O


## With Reviews data

In [2]:
import json

def load_jsonl(file_path):
    """Read a JSON Lines file into a list of Python objects.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        List with the decoded object of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no record and would make json.loads raise.
            if not line:
                continue
            data.append(json.loads(line))
    return data

# Read the reviews annotated in spaCy's character-offset format
spacy_formatted_persist_path = "../data/sampled_results_openai_spacy_format.jsonl"
spacy_fmt_data = load_jsonl(file_path=spacy_formatted_persist_path)

In [3]:
# Peek at the first record to check the loaded schema
spacy_fmt_data[:1]

[{'id': '339593553392192228390882582566160737164',
  'text': 'The place was nice and calm.',
  'label': [[4, 27, 'AMBIENCE']],
  'Comments': []}]

### Convert

In [4]:
import spacy

# Load the small English spaCy pipeline. This rebinds `nlp`, which the first
# cell already loaded from the same model name.
nlp = spacy.load("en_core_web_sm")

def spacy_json_to_iob2(data, tokenize=None):
    """Convert spaCy-style character-span annotations to token-level IOB2 tags.

    Args:
        data: Iterable of records. Each record has a ``'text'`` string and a
            ``'label'`` list whose entries start with
            ``start_char, end_char, entity_label`` (``end_char`` exclusive).
        tokenize: Optional callable turning a text into a sequence of tokens
            that expose ``text`` and ``idx`` (start character offset).
            Defaults to the module-level ``nlp`` pipeline.

    Returns:
        One list per record, each holding ``(token_text, iob2_tag)`` tuples.
    """
    if tokenize is None:
        tokenize = nlp

    results = []
    for item in data:
        doc = tokenize(item['text'])

        # Every token starts outside of any entity
        iob2_tags = ["O"] * len(doc)

        for label in item['label']:
            start_char, end_char, entity_label = label[:3]

            inside_entity = False
            for position, token in enumerate(doc):
                if start_char <= token.idx < end_char:
                    # IOB2 requires each entity to open with B-, even when the
                    # annotated start offset is not exactly a token start.
                    prefix = "I" if inside_entity else "B"
                    iob2_tags[position] = f"{prefix}-{entity_label}"
                    inside_entity = True

        # Pair each token's text with its tag
        results.append([(token.text, tag) for token, tag in zip(doc, iob2_tags)])

    return results

# Tokenize every review and derive its per-token IOB2 tags
iob2_results = spacy_json_to_iob2(spacy_fmt_data)

In [5]:
# Show only the first few documents; displaying the whole list floods the output
iob2_results[:5]

[[('The', 'O'),
  ('place', 'B-AMBIENCE'),
  ('was', 'I-AMBIENCE'),
  ('nice', 'I-AMBIENCE'),
  ('and', 'I-AMBIENCE'),
  ('calm', 'I-AMBIENCE'),
  ('.', 'O')],
 [('Their', 'O'),
  ('sake', 'B-FOOD'),
  ('martini', 'I-FOOD'),
  ('is', 'I-FOOD'),
  ('wonderful', 'I-FOOD'),
  ('.', 'O')],
 [('Great', 'O'),
  ('for', 'O'),
  ('groups', 'O'),
  (',', 'O'),
  ('great', 'O'),
  ('for', 'O'),
  ('a', 'O'),
  ('date', 'O'),
  (',', 'O'),
  ('great', 'O'),
  ('for', 'O'),
  ('early', 'O'),
  ('brunch', 'O'),
  ('or', 'O'),
  ('a', 'O'),
  ('nightcap', 'O'),
  ('.', 'O')],
 [('i', 'O'),
  ('recommend', 'B-FOOD'),
  ('the', 'I-FOOD'),
  ('thai', 'I-FOOD'),
  ('popcorn', 'I-FOOD'),
  (':)', 'O')],
 [('Most', 'O'),
  ('of', 'O'),
  ('the', 'O'),
  ('servers', 'B-SERVICE'),
  ('are', 'I-SERVICE'),
  ('very', 'I-SERVICE'),
  ('attentive', 'I-SERVICE'),
  (',', 'I-SERVICE'),
  ('friendly', 'I-SERVICE'),
  ('and', 'I-SERVICE'),
  ('quite', 'I-SERVICE'),
  ('attractive', 'I-SERVICE'),
  ('.', 'O')],
 [('

### Add metadata

In [6]:
# Sanity check: the conversion must yield exactly one result per input record
assert len(iob2_results) == len(spacy_fmt_data)

In [41]:
# Merge each record's metadata (everything except the raw character-span
# 'label') with its token and tag sequences.
outputs = []
for i, token_tags in enumerate(iob2_results):
    # zip(*[]) yields nothing to unpack, so a record that produced no tokens
    # needs an explicit empty fallback.
    tokens, tags = zip(*token_tags) if token_tags else ((), ())
    output = {
        **{k: v for k, v in spacy_fmt_data[i].items() if k != 'label'},
        "tokens": tokens,
        "ner_tags": tags,
    }
    outputs.append(output)

In [42]:
# Inspect the first few merged records
outputs[:5]

[{'id': '339593553392192228390882582566160737164',
  'text': 'The place was nice and calm.',
  'Comments': [],
  'tokens': ('The', 'place', 'was', 'nice', 'and', 'calm', '.'),
  'ner_tags': ('O',
   'B-AMBIENCE',
   'I-AMBIENCE',
   'I-AMBIENCE',
   'I-AMBIENCE',
   'I-AMBIENCE',
   'O')},
 {'id': '114101779017082109603328579450586119271',
  'text': 'Their sake martini is wonderful.',
  'Comments': [],
  'tokens': ('Their', 'sake', 'martini', 'is', 'wonderful', '.'),
  'ner_tags': ('O', 'B-FOOD', 'I-FOOD', 'I-FOOD', 'I-FOOD', 'O')},
 {'id': '92175068642289365519888727785080877606',
  'text': 'Great for groups, great for a date, great for early brunch or a nightcap.',
  'Comments': [],
  'tokens': ('Great',
   'for',
   'groups',
   ',',
   'great',
   'for',
   'a',
   'date',
   ',',
   'great',
   'for',
   'early',
   'brunch',
   'or',
   'a',
   'nightcap',
   '.'),
  'ner_tags': ('O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',


## Convert tags to int

In [43]:
# Gather every distinct tag string that occurs anywhere in the records
ner_tags_set = {tag for entry in outputs for tag in entry['ner_tags']}

In [44]:
# 'O' is inserted at position 0 separately, so keep it out of the sorted labels.
# discard() keeps this cell re-runnable and tolerates data without any 'O' tag.
ner_tags_set.discard('O')

In [45]:
def key_sort_ner_tags(tag):
    """Build a sort key that puts the entity label before the B/I prefix.

    Examples: ``B-FOOD`` -> ``FOOD-B``, ``I-FOOD`` -> ``FOOD-I``.

    Args:
        tag: IOB2 tag of the form ``<prefix>-<label>``.

    Returns:
        The string ``<label>-<prefix>``.

    Raises:
        ValueError: If ``tag`` contains no hyphen (e.g. the bare ``'O'`` tag).
    """
    # Split on the first hyphen only, so labels that themselves contain a
    # hyphen (e.g. "B-MENU-ITEM") keep their full name.
    pre, suf = tag.split('-', 1)
    return f"{suf}-{pre}"

# 'O' always takes id 0; the remaining tags follow in key_sort_ner_tags order
ner_tags_label = ['O'] + sorted(ner_tags_set, key=key_sort_ner_tags)

# id -> tag name, and the inverse tag name -> id
ner_tags_label_mapper = dict(enumerate(ner_tags_label))
ner_tags_label_reverse_mapper = {
    name: idx for idx, name in ner_tags_label_mapper.items()
}

print(f"{ner_tags_label_mapper=}")
print(f"{ner_tags_label_reverse_mapper=}")

ner_tags_label_mapper={0: 'O', 1: 'B-AMBIENCE', 2: 'I-AMBIENCE', 3: 'B-FOOD', 4: 'I-FOOD', 5: 'B-LOCATION', 6: 'I-LOCATION', 7: 'B-PRICE', 8: 'I-PRICE', 9: 'B-SERVICE', 10: 'I-SERVICE'}
ner_tags_label_reverse_mapper={'O': 0, 'B-AMBIENCE': 1, 'I-AMBIENCE': 2, 'B-FOOD': 3, 'I-FOOD': 4, 'B-LOCATION': 5, 'I-LOCATION': 6, 'B-PRICE': 7, 'I-PRICE': 8, 'B-SERVICE': 9, 'I-SERVICE': 10}


In [50]:
# Replace tag names by their integer ids. Ids that are already integers are
# left alone, so re-running this cell does not raise KeyError on encoded data.
for output in outputs:
    output['ner_tags'] = [
        tag if isinstance(tag, int) else ner_tags_label_reverse_mapper[tag]
        for tag in output['ner_tags']
    ]

outputs[:5]

[{'id': '339593553392192228390882582566160737164',
  'text': 'The place was nice and calm.',
  'Comments': [],
  'tokens': ('The', 'place', 'was', 'nice', 'and', 'calm', '.'),
  'ner_tags': [0, 1, 2, 2, 2, 2, 0]},
 {'id': '114101779017082109603328579450586119271',
  'text': 'Their sake martini is wonderful.',
  'Comments': [],
  'tokens': ('Their', 'sake', 'martini', 'is', 'wonderful', '.'),
  'ner_tags': [0, 3, 4, 4, 4, 0]},
 {'id': '92175068642289365519888727785080877606',
  'text': 'Great for groups, great for a date, great for early brunch or a nightcap.',
  'Comments': [],
  'tokens': ('Great',
   'for',
   'groups',
   ',',
   'great',
   'for',
   'a',
   'date',
   ',',
   'great',
   'for',
   'early',
   'brunch',
   'or',
   'a',
   'nightcap',
   '.'),
  'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]},
 {'id': '275534048062756000231583449127142411586',
  'text': 'i recommend the thai popcorn :)',
  'Comments': [],
  'tokens': ('i', 'recommend', 'the', 'thai

### Persist

In [51]:
# Split to train and validation
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for validation; random_state is pinned so the
# split can be reproduced.
train_outputs, val_outputs = train_test_split(outputs, test_size=0.2, random_state=42)

# Report the size of each split
print(f"{len(train_outputs)=}, {len(val_outputs)=}")

len(train_outputs)=24, len(val_outputs)=6


In [52]:
def save_to_jsonl(data, filename):
    """Write records to a JSON Lines file, one JSON document per line.

    Args:
        data: Iterable of JSON-serializable objects.
        filename: Destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 so the file does not depend on the platform's default
    # encoding and matches the encoding load_jsonl reads with.
    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

In [53]:
# Path template; {split} is filled in with the split name
iob2_formatted_persist_path = '../data/sampled_results_openai_iob2_format_{split}.jsonl'

# Shallow copies of the two splits
iob2_train = list(train_outputs)
iob2_val = list(val_outputs)

# Write each split to its own JSONL file
for split_name, split_records in (('train', iob2_train), ('val', iob2_val)):
    save_to_jsonl(split_records, iob2_formatted_persist_path.format(split=split_name))

# Push dataset to HuggingFace Datasets

## Add Dataset features

In [76]:
from datasets import Features, ClassLabel, Sequence, Value

In [81]:
# Token-level label column: a sequence of class ids named after the IOB2 tags
# (the number of classes follows from the list of names)
ner_feature = Sequence(feature=ClassLabel(names=ner_tags_label))

# Explicit schema for every column of the persisted records
features = Features({
    'id': Value('string'),
    'text': Value('large_string'),
    'Comments': Sequence(Value('string')),
    'tokens': Sequence(Value('string')),
    'ner_tags': ner_feature,
})

In [82]:
from datasets import load_dataset, Dataset

# Load the persisted JSONL splits, applying the explicit feature schema
data_files = {
    split: iob2_formatted_persist_path.format(split=split)
    for split in ('train', 'val')
}
dataset = load_dataset('json', data_files=data_files, features=features)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

In [83]:
# Display the loaded splits with their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'Comments', 'tokens', 'ner_tags'],
        num_rows: 24
    })
    val: Dataset({
        features: ['id', 'text', 'Comments', 'tokens', 'ner_tags'],
        num_rows: 6
    })
})

## Push

In [84]:
from dotenv import load_dotenv

# Load environment variables from a .env file (presumably where
# HUGGINGFACE_WRITE_TOKEN is defined — TODO confirm)
load_dotenv()

True

In [85]:
# Log the Hugging Face CLI in with the write token taken from
# HUGGINGFACE_WRITE_TOKEN and store it in the git credential helper.
# NOTE(review): the token is passed as a command-line argument — confirm this is acceptable here.
!huggingface-cli login --token $HUGGINGFACE_WRITE_TOKEN --add-to-git-credential

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/dvquys/.cache/huggingface/token
Login successful


In [86]:
# Target dataset repository on the Hugging Face Hub
HUGGINGFACE_DS = 'dvquys/restaurant-reviews-public-sources'
# Upload the DatasetDict (train and val splits) to that repository
dataset.push_to_hub(HUGGINGFACE_DS, commit_message='Register Features')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/503 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dvquys/restaurant-reviews-public-sources/commit/f81a1e8cd7e5d7a7b65df4db63c5df71dc021c3a', commit_message='Register Features', commit_description='', oid='f81a1e8cd7e5d7a7b65df4db63c5df71dc021c3a', pr_url=None, pr_revision=None, pr_num=None)