In [144]:
from pathlib import Path
import re
from seqeval.metrics import classification_report as seqeval_classification_report
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Start with the same Functions used to train the model. 
## The test data has to be pre processed the same way as the input Data used for Training the model

In [184]:
def read_conll(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            if line.strip() == "":  # Skip empty lines
                continue
            parts = line.split()
            if len(parts) >= 3:  # Ensure there are enough parts
                token = parts[0]  # First part is the token
                tag = parts[-1]  # Last part is the tag
                if tag.startswith("B-") or tag.startswith("I-") or tag == "O":  # Filter valid tags
                    tokens.append(token)
                    tags.append(tag)
        if tokens and tags:  # Ensure non-empty before appending
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

In [185]:
def encode_tags(tags, encodings):
    encoded_labels = []
    for doc_labels, doc_offset in zip(tags, encodings.offset_mapping):
        # Create an array to hold the labels for each token in the tokenized input
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        
        # Track the current position in the original labels list
        label_idx = 0
        for idx, offset in enumerate(doc_offset):
            # Check if this token is the start of a new original token
            if offset[0] == 0 and label_idx < len(doc_labels):
                # Assign the label of the original token to the first subtoken
                doc_enc_labels[idx] = tag2id[doc_labels[label_idx]]
                label_idx += 1
            # For subtokens that are not the start of a new original token, the label remains -100
        
        encoded_labels.append(doc_enc_labels.tolist())
    
    return encoded_labels

In [186]:
def evaluate_model(model, test_dataset):
    model.eval()
    true_labels = []
    predicted_labels = []

    for example in test_dataset:
        input_ids = example['input_ids'].unsqueeze(0)
        with torch.no_grad():
            outputs = model(input_ids)
        logits = outputs.logits
        predicted_ids = torch.argmax(logits, dim=2).squeeze().tolist()
        labels = example['labels'].tolist()

        # Filter out tokens with special value -100
        filtered_pred = [p for p, l in zip(predicted_ids, labels) if l != -100]
        filtered_label = [l for l in labels if l != -100]

        # Convert label IDs back to labels
        predicted_labels.extend([model.config.id2label[label_id] for label_id in filtered_pred])
        true_labels.extend([model.config.id2label[label_id] for label_id in filtered_label])

    # Print classification report
    print(seqeval_classification_report([true_labels], [predicted_labels]))

In [187]:
class custom_NER(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Loading the Model

In [218]:
model_path = '../Models/bftm'
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Evaluating with two different Datasets

# Evaluation First Dataset

In [219]:
test_texts, test_tags = read_conll('../Datasets/p5.conll')

# Construct unique tags set and tag-to-id mapping
unique_tags = set(tag for doc in test_tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}


test_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

# Encode tags for test dataset
test_labels = encode_tags(test_tags, test_encodings)

# Create test dataset
test_dataset = custom_NER(test_encodings, test_labels)

# Average length per Corpus

In [220]:
total_length = sum(len(text) for text in test_texts)
average_length = total_length / len(test_texts)
print("Average length:", average_length)


Average length: 891.3333333333334


# Evaluation Table

In [221]:
evaluate_model(model, test_dataset)

              precision    recall  f1-score   support

  ALLOCATION       0.01      0.02      0.01       143
      BUDGET       0.01      0.12      0.02        60
         LOC       0.02      0.23      0.03        53
         ORG       0.01      0.11      0.01        55
        YEAR       0.00      0.00      0.00        65

   micro avg       0.01      0.07      0.02       376
   macro avg       0.01      0.09      0.02       376
weighted avg       0.01      0.07      0.01       376



  _warn_prf(average, modifier, msg_start, len(result))


# Evaluation Second Dataset

In [222]:
test_texts, test_tags = read_conll('../Datasets/test.conll')

# Construct unique tags set and tag-to-id mapping
unique_tags = set(tag for doc in test_tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

# Tokenize test texts
test_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

# Encode tags for test dataset
test_labels = encode_tags(test_tags, test_encodings)

# Create test dataset
test_dataset = custom_NER(test_encodings, test_labels)

In [223]:
total_length = sum(len(text) for text in test_texts)
average_length = total_length / len(test_texts)
print("Average length:", average_length)


Average length: 720.0


In [153]:
# Evaluate the model on the test dataset
evaluate_model(model, test_dataset)


              precision    recall  f1-score   support

  ALLOCATION       0.01      0.23      0.01        26
      BUDGET       0.00      0.25      0.00         4
         LOC       0.00      0.39      0.01        33
         ORG       0.00      0.00      0.00       104
        YEAR       0.00      0.00      0.00        73

   micro avg       0.00      0.08      0.01       240
   macro avg       0.00      0.17      0.00       240
weighted avg       0.00      0.08      0.00       240

