In [13]:
def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        data = file.read().strip().split('\n')  # Splitting data by double newlines

    formatted_data = []
    for record in data:
        text, entities = '', []
        lines = record.split('\n')
        start = 0  # Initialize start of the first token
        entity_start = None  # Initialize the start of an entity
        entity_type = None  # Initialize the entity type
        for line in lines:
            parts = line.split(',')
            if len(parts) >= 2:  # Ensure there's at least a token and a tag
                token, tag = parts[0], parts[1]
                # Update text with token followed by space
                text += token + ' '
                end = start + len(token)  # End index of the current token
                if tag.startswith('B-') or tag.startswith('I-'):  # Check if the tag indicates an entity
                    if tag.startswith('B-'):
                        # If there was a previous entity, append it
                        if entity_start is not None:
                            entities.append((entity_start, prev_end, entity_type))
                        # Start new entity
                        entity_start = start
                        entity_type = tag.split('-')[1]
                    if tag.startswith('I-') and entity_start is not None:
                        # Continue the entity
                        entity_type = tag.split('-')[1]  # Update the type if needed (typically not necessary)
                else:
                    # If ending part of entity, append it
                    if entity_start is not None:
                        entities.append((entity_start, end, entity_type))
                        entity_start = None  # Reset for the next entity

                # Prepare for next token
                start = end + 1
                prev_end = end  # Remember the end of the last token

        # Check for any trailing entity at the end
        if entity_start is not None:
            entities.append((entity_start, prev_end, entity_type))

        # Append the parsed data if there are entities
        if entities:
            formatted_data.append((text.strip(), {'entities': entities}))

    return formatted_data

# Load the data
train_data = load_data('data/ner_multi/splits/train_correct.csv')
dev_data = load_data('data/ner_multi/splits/dev_correct.csv')
test_data = load_data('data/ner_multi/splits/test_correct.csv')


In [14]:
train_data

[('Датування', {'entities': [(0, 9, 'dating_criteria')]}),
 ('Розташування', {'entities': [(0, 12, 'other')]}),
 ('Датування', {'entities': [(0, 9, 'dating_criteria')]}),
 ('Розташування', {'entities': [(0, 12, 'other')]}),
 ('Опис', {'entities': [(0, 4, 'inscription')]}),
 ('дворядковий', {'entities': [(0, 11, 'other')]}),
 ('Графіті', {'entities': [(0, 7, 'inscription')]}),
 ('Опис', {'entities': [(0, 4, 'inscription')]}),
 ('Графіті', {'entities': [(0, 7, 'inscription')]}),
 ('Датування', {'entities': [(0, 9, 'dating_criteria')]}),
 ('Опис', {'entities': [(0, 4, 'inscription')]}),
 ('Датування', {'entities': [(0, 9, 'dating_criteria')]}),
 ('Розташування', {'entities': [(0, 12, 'other')]}),
 ('Графіті', {'entities': [(0, 7, 'inscription')]}),
 ('Опис', {'entities': [(0, 4, 'inscription')]}),
 ('Розташування', {'entities': [(0, 12, 'other')]}),
 ('Опис', {'entities': [(0, 4, 'inscription')]}),
 ('Опис', {'entities': [(0, 4, 'inscription')]}),
 ('Опис', {'entities': [(0, 4, 'inscripti

In [15]:
import spacy
from spacy.training import Example

# https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7

# Load a blank model or a pre-existing model for your language
nlp = spacy.blank('uk')  # Replace 'en' with your language code if different

# Create the NER component if it doesn't exist
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

# Add labels
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [16]:
from spacy.util import minibatch, compounding
import random
optimizer = nlp.initialize()

# Use the training data
for epoch in range(30):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=compounding(4., 32., 1.001))
    for batch in batches:
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
    print(f"Losses at epoch {epoch}: {losses}")

Losses at epoch 0: {'ner': 236.734013967865}
Losses at epoch 1: {'ner': 118.33432681282022}
Losses at epoch 2: {'ner': 76.53188738454494}
Losses at epoch 3: {'ner': 80.07914452805569}
Losses at epoch 4: {'ner': 72.8257672192506}
Losses at epoch 5: {'ner': 50.235987624758295}
Losses at epoch 6: {'ner': 39.29924046200534}
Losses at epoch 7: {'ner': 42.235757430188066}
Losses at epoch 8: {'ner': 41.407843206172686}
Losses at epoch 9: {'ner': 38.4689023039409}
Losses at epoch 10: {'ner': 31.849915553522766}
Losses at epoch 11: {'ner': 31.65479047269599}
Losses at epoch 12: {'ner': 32.629562987694264}
Losses at epoch 13: {'ner': 43.742464259905695}
Losses at epoch 14: {'ner': 20.789991479564083}
Losses at epoch 15: {'ner': 30.319847194094404}
Losses at epoch 16: {'ner': 32.84856757641957}
Losses at epoch 17: {'ner': 24.63846966672788}
Losses at epoch 18: {'ner': 17.95776680643473}
Losses at epoch 19: {'ner': 27.33032722451968}
Losses at epoch 20: {'ner': 29.42680495190561}
Losses at epoch 2

In [17]:
def transform_to_iob(nlp_model, data):
    transformed_data = []
    for text, annot in data:
        doc = nlp_model(text)
        # Extract IOB tokens from the SpaCy doc
        iob_tokens = [f"{t.ent_iob_}-{t.ent_type_}" if t.ent_type_ else t.ent_iob_ for t in doc]
        transformed_data.append(iob_tokens)
    return transformed_data

predictions_iob = transform_to_iob(nlp, test_data)

In [18]:
predictions_iob[:3]

[['B-inscription'], ['B-dating_criteria'], ['B-dating_criteria']]

In [19]:
def transform_annotations_to_iob(data, nlp):
    iob_annotations = []

    for text, annot in data:
        doc = nlp.make_doc(text)  # Create a doc from text to ensure correct offsets
        tags = ['O'] * len(doc)  # Initialize tags with 'O'

        entities = annot['entities']
        # Convert character offsets to token offsets
        for start_char, end_char, label in entities:
            start_token, end_token = None, None
            for token_index, token in enumerate(doc):
                # Check if we've found the start token
                if token.idx == start_char:
                    start_token = token_index
                # Check if the current token extends beyond the start of the end_char range
                if token.idx + len(token) == end_char:
                    end_token = token_index
                # If the token encompasses the end character
                if token.idx + len(token) > end_char and end_token is None:
                    end_token = token_index

            if start_token is not None and end_token is not None:
                tags[start_token] = f"B-{label}"
                for i in range(start_token + 1, end_token + 1):
                    tags[i] = f"I-{label}"

        iob_annotations.append(tags)
    return iob_annotations

references_iob = transform_annotations_to_iob(test_data, nlp)

In [20]:
from seqeval.metrics import classification_report, accuracy_score, f1_score

# Assuming predictions_iob and references_iob are ready
results = classification_report(references_iob, predictions_iob, digits=4)
accuracy = accuracy_score(references_iob, predictions_iob)
f1 = f1_score(references_iob, predictions_iob)

print("Classification Report:\n", results)
print("Accuracy Score:", accuracy)
print("F1 Score:", f1 * 100.0, "%")

Classification Report:
                     precision    recall  f1-score   support

   dating_criteria     1.0000    1.0000    1.0000        33
        decoration     1.0000    1.0000    1.0000        16
  inscripiton_type     1.0000    1.0000    1.0000         1
       inscription     1.0000    1.0000    1.0000        76
  inscription_type     1.0000    1.0000    1.0000         8
          material     1.0000    1.0000    1.0000         2
          monument     0.0000    0.0000    0.0000         1
       object_type     0.8571    1.0000    0.9231         6
             other     1.0000    1.0000    1.0000        32
preservation_state     1.0000    1.0000    1.0000         1

         micro avg     0.9943    0.9943    0.9943       176
         macro avg     0.8857    0.9000    0.8923       176
      weighted avg     0.9894    0.9943    0.9917       176

Accuracy Score: 0.9943181818181818
F1 Score: 99.43181818181817 %
