In [1]:
# # Correcting the initial generated file


# def process_file(input_filename, output_filename):
#     with open(input_filename, 'r', encoding='utf-8') as file:
#         lines = file.readlines()

#     with open(output_filename, 'w', encoding='utf-8') as file:
#         for line in lines:
#             if line.strip() == '"':
#                 file.write("\n")  # write a newline for lines that are just a quote
#             elif line.strip() == '",O':
#                 continue
#             else:
#                 corrected_line = line.replace(',', '\t')
#                 file.write(corrected_line)

# process_file('data/ner/tokens_IOB.csv', 'data/ner/tokens_IOB_correct.csv')

In [2]:
import random

def read_and_split_data(input_filename, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1):
    # Ensure the ratios sum to 1
    assert train_ratio + dev_ratio + test_ratio == 1
    
    # Read and process the file
    with open(input_filename, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Split content into sentences based on newlines
    sentences = content.strip().split('\n\n')

    # Shuffle the sentences randomly for a fair split
    random.shuffle(sentences)

    # Calculate split indices
    total = len(sentences)
    train_end = int(total * train_ratio)
    dev_end = train_end + int(total * dev_ratio)
    
    # Create splits
    train_sentences = sentences[:train_end]
    dev_sentences = sentences[train_end:dev_end]
    test_sentences = sentences[dev_end:]

    # Save each split to a different file
    with open('data/final/splits/train.csv', 'w', encoding='utf-8') as file:
        file.write('\n\n'.join(train_sentences))
    with open('data/final/splits/dev.csv', 'w', encoding='utf-8') as file:
        file.write('\n\n'.join(dev_sentences))
    with open('data/final/splits/test.csv', 'w', encoding='utf-8') as file:
        file.write('\n\n'.join(test_sentences))

read_and_split_data('data/final/data.tsv')

In [3]:
def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        data = file.read().strip().split('\n\n')  # Splitting data by double newlines

    formatted_data = []
    for record in data:
        text, entities = '', []
        lines = record.split('\n')
        start = 0  # Initialize start of the first token
        entity_start = None  # Initialize the start of an entity
        entity_type = None  # Initialize the entity type
        for line in lines:
            parts = line.split('\t')
            if len(parts) >= 2:  # Ensure there's at least a token and a tag
                token, tag = parts[0], parts[1]
                # Update text with token followed by space
                text += token + ' '
                end = start + len(token)  # End index of the current token
                if tag.startswith('B-') or tag.startswith('I-'):  # Check if the tag indicates an entity
                    if tag.startswith('B-'):
                        # If there was a previous entity, append it
                        if entity_start is not None:
                            entities.append((entity_start, prev_end, entity_type))
                        # Start new entity
                        entity_start = start
                        entity_type = tag.split('-')[1]
                    if tag.startswith('I-') and entity_start is not None:
                        # Continue the entity
                        entity_type = tag.split('-')[1]  # Update the type if needed (typically not necessary)
                else:
                    # If ending part of entity, append it
                    if entity_start is not None:
                        entities.append((entity_start, end, entity_type))
                        entity_start = None  # Reset for the next entity

                # Prepare for next token
                start = end + 1
                prev_end = end  # Remember the end of the last token

        # Check for any trailing entity at the end
        if entity_start is not None:
            entities.append((entity_start, prev_end, entity_type))

        # Append the parsed data if there are entities
        if entities:
            formatted_data.append((text.strip(), {'entities': entities}))

    return formatted_data

# Load the data
train_data = load_data('data/final/splits/train.csv')
dev_data = load_data('data/final/splits/dev.csv')
test_data = load_data('data/final/splits/test.csv')


In [None]:
train_data

[('Висота від сучасної підлоги 133 см , відстань від лівого кута Опис .',
  {'entities': [(62, 68, 'inscription')]}),
 ('ХІІ , 2 ) Розташування .', {'entities': [(10, 24, 'other')]}),
 ("Пам'ятний напис . Графіті № 4298 ( табл . СІ , 3 ) Розташування .",
  {'entities': [(10, 17, 'inscription'),
    (18, 27, 'inscription'),
    (51, 65, 'other')]}),
 ('Висота від основи 107 см . Опис .',
  {'entities': [(27, 33, 'inscription')]}),
 ("Автор напису походив з родини Доманських ( герб Ларисса)196 , особа з таким ім'ям згадується наприкінці XVI ст.197 , що не суперечить датуванню графіті .",
  {'entities': [(144, 153, 'inscription')]}),
 ('Молитовний напис . Графіті № 224 ( табл . LХ , 1 ) Публікації .',
  {'entities': [(11, 18, 'inscription'), (19, 28, 'inscription')]}),
 ('Молитовний напис . Графіті № 588 ( табл . ХLII , 1 ) Розташування .',
  {'entities': [(11, 18, 'inscription'),
    (19, 28, 'inscription'),
    (53, 67, 'other')]}),
 ('Розташування .', {'entities': [(0, 14, 'other')]}),

# Prepare SpaCy NER model

In [5]:
import spacy
from spacy.training import Example

# https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7

# Load a blank model or a pre-existing model for your language
nlp = spacy.blank('uk')  # Replace 'en' with your language code if different

# Create the NER component if it doesn't exist
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

# Add labels
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])


# Train a SpaCy NER model

In [6]:
from spacy.util import minibatch, compounding

optimizer = nlp.initialize()

# Use the training data
for epoch in range(25):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=compounding(4., 32., 1.001))
    for batch in batches:
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
    print(f"Losses at epoch {epoch}: {losses}")


Losses at epoch 0: {'ner': 1122.94405920648}
Losses at epoch 1: {'ner': 527.1068041042239}
Losses at epoch 2: {'ner': 537.4914999522258}


KeyboardInterrupt: 

# Evaluation on the test data

## Prepare test data predictions

In [7]:
def transform_to_iob(nlp_model, data):
    transformed_data = []
    for text, annot in data:
        doc = nlp_model(text)
        # Extract IOB tokens from the SpaCy doc
        iob_tokens = [f"{t.ent_iob_}-{t.ent_type_}" if t.ent_type_ else t.ent_iob_ for t in doc]
        transformed_data.append(iob_tokens)
    return transformed_data

predictions_iob = transform_to_iob(nlp, test_data)


In [8]:
predictions_iob[:3]

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-dating_criteria',
  'I-dating_criteria'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-inscription',
  'I-inscription'],
 ['B-other', 'I-other']]

## Prepare the true test data (groundtruth)

In [9]:
def transform_annotations_to_iob(data, nlp):
    iob_annotations = []

    for text, annot in data:
        doc = nlp.make_doc(text)  # Create a doc from text to ensure correct offsets
        tags = ['O'] * len(doc)  # Initialize tags with 'O'

        entities = annot['entities']
        # Convert character offsets to token offsets
        for start_char, end_char, label in entities:
            start_token, end_token = None, None
            for token_index, token in enumerate(doc):
                # Check if we've found the start token
                if token.idx == start_char:
                    start_token = token_index
                # Check if the current token extends beyond the start of the end_char range
                if token.idx + len(token) == end_char:
                    end_token = token_index
                # If the token encompasses the end character
                if token.idx + len(token) > end_char and end_token is None:
                    end_token = token_index

            if start_token is not None and end_token is not None:
                tags[start_token] = f"B-{label}"
                for i in range(start_token + 1, end_token + 1):
                    tags[i] = f"I-{label}"

        iob_annotations.append(tags)
    return iob_annotations

references_iob = transform_annotations_to_iob(test_data, nlp)

## Results evaluation

In [10]:
# !pip install seqeval

In [11]:
from seqeval.metrics import classification_report, accuracy_score, f1_score

# Assuming predictions_iob and references_iob are ready
results = classification_report(references_iob, predictions_iob, digits=4)
accuracy = accuracy_score(references_iob, predictions_iob)
f1 = f1_score(references_iob, predictions_iob)

print("Classification Report:\n", results)
print("Accuracy Score:", accuracy)
print("F1 Score:", f1 * 100.0, "%")

Classification Report:
                       precision    recall  f1-score   support

     dating_criteria     1.0000    1.0000    1.0000        42
          decoration     0.8000    0.8000    0.8000         5
epigraphic_shorthand     0.0000    0.0000    0.0000         1
 execution_technique     0.8000    0.8000    0.8000         5
         inscription     1.0000    1.0000    1.0000        93
    inscription_type     0.0000    0.0000    0.0000         1
            monument     0.0000    0.0000    0.0000         1
         object_type     1.0000    1.0000    1.0000         6
               other     0.9762    0.9762    0.9762        42
  preservation_state     0.0000    0.0000    0.0000         1

           micro avg     0.9845    0.9645    0.9744       197
           macro avg     0.5576    0.5576    0.5576       197
        weighted avg     0.9645    0.9645    0.9645       197

Accuracy Score: 0.9915407854984895
F1 Score: 97.43589743589743 %


  _warn_prf(average, modifier, msg_start, len(result))


# Example - how to use it

In [12]:
text = "мурування апсиди ХІІ ст. г давні ХІ ст. пройми арок галереї закла-."
doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('мурування апсиди', 'object_type')]


In [13]:
for text, _ in test_data[:5]:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc if 'inscription' in t.ent_type_])

Entities [('Датування .', 'dating_criteria')]
Tokens []
Entities [('Опис .', 'inscription')]
Tokens [('Опис', 'inscription', 3), ('.', 'inscription', 1)]
Entities [('Розташування .', 'other')]
Tokens []
Entities [('прокреслений', 'execution_technique'), ('чотириконечний хрест з', 'decoration'), ('Датування .', 'dating_criteria')]
Tokens []
Entities [('Датування .', 'dating_criteria')]
Tokens []
