In [1]:
# # Correcting the initial generated file


# def process_file(input_filename, output_filename):
#     with open(input_filename, 'r', encoding='utf-8') as file:
#         lines = file.readlines()

#     with open(output_filename, 'w', encoding='utf-8') as file:
#         for line in lines:
#             if line.strip() == '"':
#                 file.write("\n")  # write a newline for lines that are just a quote
#             elif line.strip() == '",O':
#                 continue
#             else:
#                 corrected_line = line.replace(',', '\t')
#                 file.write(corrected_line)

# process_file('data/ner/tokens_IOB.csv', 'data/ner/tokens_IOB_correct.csv')

In [23]:
import random

def read_and_split_data(input_filename, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1):
    # Ensure the ratios sum to 1
    assert train_ratio + dev_ratio + test_ratio == 1
    
    # Read and process the file
    with open(input_filename, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Split content into sentences based on newlines
    sentences = content.strip().split('\n\n')

    # Shuffle the sentences randomly for a fair split
    random.shuffle(sentences)

    # Calculate split indices
    total = len(sentences)
    train_end = int(total * train_ratio)
    dev_end = train_end + int(total * dev_ratio)
    
    # Create splits
    train_sentences = sentences[:train_end]
    dev_sentences = sentences[train_end:dev_end]
    test_sentences = sentences[dev_end:]

    # Save each split to a different file
    with open('data/final/splits/train.csv', 'w', encoding='utf-8') as file:
        file.write('\n\n'.join(train_sentences))
    with open('data/final/splits/dev.csv', 'w', encoding='utf-8') as file:
        file.write('\n\n'.join(dev_sentences))
    with open('data/final/splits/test.csv', 'w', encoding='utf-8') as file:
        file.write('\n\n'.join(test_sentences))

read_and_split_data('data/final/data.tsv')

In [24]:
def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        data = file.read().strip().split('\n\n')  # Splitting data by double newlines

    formatted_data = []
    for record in data:
        text, entities = '', []
        lines = record.split('\n')
        start = 0  # Initialize start of the first token
        entity_start = None  # Initialize the start of an entity
        entity_type = None  # Initialize the entity type
        for line in lines:
            parts = line.split('\t')
            if len(parts) >= 2:  # Ensure there's at least a token and a tag
                token, tag = parts[0], parts[1]
                # Update text with token followed by space
                text += token + ' '
                end = start + len(token)  # End index of the current token
                if tag.startswith('B-') or tag.startswith('I-'):  # Check if the tag indicates an entity
                    if tag.startswith('B-'):
                        # If there was a previous entity, append it
                        if entity_start is not None:
                            entities.append((entity_start, prev_end, entity_type))
                        # Start new entity
                        entity_start = start
                        entity_type = tag.split('-')[1]
                    if tag.startswith('I-') and entity_start is not None:
                        # Continue the entity
                        entity_type = tag.split('-')[1]  # Update the type if needed (typically not necessary)
                else:
                    # If ending part of entity, append it
                    if entity_start is not None:
                        entities.append((entity_start, end, entity_type))
                        entity_start = None  # Reset for the next entity

                # Prepare for next token
                start = end + 1
                prev_end = end  # Remember the end of the last token

        # Check for any trailing entity at the end
        if entity_start is not None:
            entities.append((entity_start, prev_end, entity_type))

        # Append the parsed data if there are entities
        if entities:
            formatted_data.append((text.strip(), {'entities': entities}))

    return formatted_data

# Load the data
train_data = load_data('data/final/splits/train.csv')
dev_data = load_data('data/final/splits/dev.csv')
test_data = load_data('data/final/splits/test.csv')


In [25]:
train_data

[('Графіті № 586 ( табл .', {'entities': [(0, 9, 'inscription')]}),
 ('Висота від сучасної підлоги 54 см , відстань від лівого кута 51 см . Опис .',
  {'entities': [(69, 75, 'inscription')]}),
 ('Графіті № 503 ( табл . LХІІ , 3 ) Публікації .',
  {'entities': [(0, 9, 'inscription')]}),
 ('Датування .', {'entities': [(0, 11, 'dating_criteria')]}),
 ('Молитовний напис . Графіті № 501 ( табл .',
  {'entities': [(11, 18, 'inscription'), (19, 28, 'inscription')]}),
 ('Висота від сучасної підлоги 55 см , відстань від лівого кута 64 см . Опис .',
  {'entities': [(69, 75, 'inscription')]}),
 ('Датування .', {'entities': [(0, 11, 'dating_criteria')]}),
 ('На фресці прокреслений чотириконечний хрест , основа щогли якого спи- рається на відкриту донизу одноступінчасту Голгофу .',
  {'entities': [(10, 22, 'execution_technique'),
    (23, 45, 'decoration'),
    (46, 64, 'object_type')]}),
 ('Опис .', {'entities': [(0, 6, 'inscription')]}),
 ('Графіті № 543 ( табл .', {'entities': [(0, 9, 'inscripti

# Prepare SpaCy NER model

In [26]:
import spacy
from spacy.training import Example

# https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7

# Load a blank model or a pre-existing model for your language
nlp = spacy.blank('uk')  # Replace 'en' with your language code if different

# Create the NER component if it doesn't exist
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

# Add labels
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])


# Train a SpaCy NER model

In [27]:
from spacy.util import minibatch, compounding

optimizer = nlp.initialize()

# Use the training data
for epoch in range(25):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=compounding(4., 32., 1.001))
    for batch in batches:
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
    print(f"Losses at epoch {epoch}: {losses}")


Losses at epoch 0: {'ner': 1127.8061195828202}
Losses at epoch 1: {'ner': 442.0105688380212}
Losses at epoch 2: {'ner': 421.4218361339304}
Losses at epoch 3: {'ner': 435.41170710475456}
Losses at epoch 4: {'ner': 367.6602700794566}
Losses at epoch 5: {'ner': 327.9406950034582}
Losses at epoch 6: {'ner': 257.15945962447097}
Losses at epoch 7: {'ner': 353.95736190309486}
Losses at epoch 8: {'ner': 264.4722774302612}
Losses at epoch 9: {'ner': 269.674714384915}
Losses at epoch 10: {'ner': 215.7394853181311}
Losses at epoch 11: {'ner': 204.718010064536}
Losses at epoch 12: {'ner': 213.68417723186192}
Losses at epoch 13: {'ner': 177.83682211562513}
Losses at epoch 14: {'ner': 171.88516615676912}
Losses at epoch 15: {'ner': 165.11581794200958}
Losses at epoch 16: {'ner': 145.93355866586575}
Losses at epoch 17: {'ner': 177.99353892080586}
Losses at epoch 18: {'ner': 143.5537956684429}
Losses at epoch 19: {'ner': 162.1177910293382}
Losses at epoch 20: {'ner': 192.0279291069965}
Losses at epoch

# Evaluation on the test data

## Prepare test data predictions

In [28]:
def transform_to_iob(nlp_model, data):
    transformed_data = []
    for text, annot in data:
        doc = nlp_model(text)
        # Extract IOB tokens from the SpaCy doc
        iob_tokens = [f"{t.ent_iob_}-{t.ent_type_}" if t.ent_type_ else t.ent_iob_ for t in doc]
        transformed_data.append(iob_tokens)
    return transformed_data

predictions_iob = transform_to_iob(nlp, test_data)


In [29]:
predictions_iob[:3]

[['B-inscription',
  'I-inscription',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-object_type',
  'I-object_type',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-inscription',
  'I-inscription',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-inscription',
  'I-inscription',
  'B-inscription',
  'I-inscription',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-dating_criteria', 'I-dating_criteria']]

## Prepare the true test data (groundtruth)

In [30]:
def transform_annotations_to_iob(data, nlp):
    iob_annotations = []

    for text, annot in data:
        doc = nlp.make_doc(text)  # Create a doc from text to ensure correct offsets
        tags = ['O'] * len(doc)  # Initialize tags with 'O'

        entities = annot['entities']
        # Convert character offsets to token offsets
        for start_char, end_char, label in entities:
            start_token, end_token = None, None
            for token_index, token in enumerate(doc):
                # Check if we've found the start token
                if token.idx == start_char:
                    start_token = token_index
                # Check if the current token extends beyond the start of the end_char range
                if token.idx + len(token) == end_char:
                    end_token = token_index
                # If the token encompasses the end character
                if token.idx + len(token) > end_char and end_token is None:
                    end_token = token_index

            if start_token is not None and end_token is not None:
                tags[start_token] = f"B-{label}"
                for i in range(start_token + 1, end_token + 1):
                    tags[i] = f"I-{label}"

        iob_annotations.append(tags)
    return iob_annotations

references_iob = transform_annotations_to_iob(test_data, nlp)

## Results evaluation

In [31]:
# !pip install seqeval

In [32]:
from seqeval.metrics import classification_report, accuracy_score, f1_score

# Assuming predictions_iob and references_iob are ready
results = classification_report(references_iob, predictions_iob, digits=4)
accuracy = accuracy_score(references_iob, predictions_iob)
f1 = f1_score(references_iob, predictions_iob)

print("Classification Report:\n", results)
print("Accuracy Score:", accuracy)
print("F1 Score:", f1 * 100.0, "%")

Classification Report:
                      precision    recall  f1-score   support

    dating_criteria     1.0000    1.0000    1.0000        37
         decoration     1.0000    0.9000    0.9474        10
execution_technique     0.8000    0.8889    0.8421         9
        inscription     0.9901    0.9901    0.9901       101
           material     1.0000    1.0000    1.0000         1
           monument     0.0000    0.0000    0.0000         2
        object_type     0.8571    0.8571    0.8571         7
              other     1.0000    0.9722    0.9859        36
             symbol     0.0000    0.0000    0.0000         1

          micro avg     0.9800    0.9608    0.9703       204
          macro avg     0.7386    0.7343    0.7358       204
       weighted avg     0.9667    0.9608    0.9635       204

Accuracy Score: 0.9930209371884346
F1 Score: 97.02970297029701 %


  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
model_path = "model/first_model"
nlp.to_disk(model_path)

# Example - how to use it

In [15]:
import spacy
model_path = 'model/first_model'
nlp = spacy.load(model_path)

In [16]:
text = "мурування апсиди ХІІ ст. г давні ХІ ст. пройми арок галереї закла-."
doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('мурування апсиди', 'object_type')]


In [22]:
for text, _ in test_data[:5]:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc if 'inscription' in t.ent_type_])

Entities [('прокреслений малюнок шестикінечної', 'decoration'), ('Датування .', 'dating_criteria')]
Tokens []
Entities [('Розташування .', 'other')]
Tokens []
Entities [('Опис .', 'inscription')]
Tokens [('Опис', 'inscription', 3), ('.', 'inscription', 1)]
Entities [('Розташування .', 'other')]
Tokens []
Entities [('Розташування .', 'other')]
Tokens []
