In [4]:
import os
import sys
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from datasets import load_metric, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, DataCollatorForTokenClassification, Trainer
import evaluate
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from huggingface_hub import notebook_login, Repository, get_full_repo_name
import torch
from tqdm.auto import tqdm

In [12]:
def read_data_og(file_path):
    # Initialize lists to store data
    sentence_ids = []
    tokens = []
    # pos_tags = []
    # chunk_tags = []
    ner_tags = []

    # Initialize list to store sentences
    sentences = []

    # Open the file
    with open(file_path, 'r') as file:
        # Initialize sentence ID counter
        sentence_id = 0

        # Initialize lists to store sentence-level data
        sentence_tokens = []
        # sentence_pos_tags = []
        # sentence_chunk_tags = []
        sentence_ner_tags = []

        # Iterate through lines
        for line in file:
            line = line.strip()

            # Skip empty lines
            if not line:
                if sentence_tokens:  # If the sentence has tokens
                    # Append sentence data to lists
                    sentence_ids.append(sentence_id)
                    tokens.append(sentence_tokens)
                    # pos_tags.append(sentence_pos_tags)
                    # chunk_tags.append(sentence_chunk_tags)
                    ner_tags.append(sentence_ner_tags)
                    sentences.append(sentence_tokens)  # Add to sentences list

                    # Reset for the next sentence
                    sentence_tokens = []
                    # sentence_pos_tags = []
                    # sentence_chunk_tags = []
                    sentence_ner_tags = []
                sentence_id += 1  # Increment sentence ID
                continue

            # # Skip the initial -DOCSTART- line
            # if line.startswith('-DOCSTART-'):
            #     continue

            # Split line by whitespace
            parts = line.split()

            # Extract data
            token = parts[0]
            ner_tag = parts[3]

            # Append data to sentence-level lists
            sentence_tokens.append(token)
            # sentence_pos_tags.append(0)  # Append 0 for pos_tags
            # sentence_chunk_tags.append(0)  # Append 0 for chunk_tags
            sentence_ner_tags.append(ner_tag)

    # Create DataFrame
    data = {
        'sentence_id': sentence_ids,
        'tokens': tokens,
        # 'pos_tags': pos_tags,
        # 'chunk_tags': chunk_tags,
        'ner_tags': ner_tags
    }
    df = pd.DataFrame(data)

    return df, sentences

In [13]:
# Not necessary for now

# Define dataset paths
traindatapath = "data/train.txt"
devdatapath = "data/dev.txt"
testdatapath = "data/test.txt"

train_df, train_sentences = read_data_og(traindatapath)
val_df, val_sentences = read_data_og(devdatapath)
test_df, test_sentences = read_data_og(testdatapath)

In [4]:
# unique ner tags
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
label2id_int = {'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}


In [5]:
# Load the BERT tokenizer
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    previous_word_idx = None

    for word_id in word_ids:
        
        if word_id == None:
            current_word = word_id
            new_labels.append(-100) # -100 is ignored
        
        elif word_id != previous_word_idx:
            label = labels[word_id]
            new_labels.append(label)

        else:
            new_labels.append(-100)
        previous_word_idx = word_id

    return new_labels

In [7]:
def tokenize_and_align_labels(data):
    tokenized_inputs = tokenizer(data['tokens'], truncation=True, is_split_into_words=True)

    all_labels = data['ner_tags']
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [14]:
def map_ner_tags_to_ids(data):
    data['ner_tags'] = [[label2id_int[tag] for tag in tags] for tags in data['ner_tags']]
    return data

In [15]:
train_df, train_sentences = read_data_og(traindatapath)
val_df, val_sentences = read_data_og(devdatapath)
test_df, test_sentences = read_data_og(testdatapath)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
train_dataset = train_dataset.map(map_ner_tags_to_ids, batched=True)
test_dataset = test_dataset.map(map_ner_tags_to_ids, batched=True)
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

# Dataloader

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [15]:
# train_loader = DataLoader(tokenized_datasets["train"], shuffle = True, batch_size=8, collate_fn=data_collator)
# val_loader = DataLoader(tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator)
# test_loader = DataLoader(tokenized_datasets["test"], batch_size=8, collate_fn=data_collator)

# Model

In [18]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [19]:
model = AutoModelForTokenClassification.from_pretrained(model_name,id2label = id2label, label2id=label2id, num_labels=len(label2id))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Results

In [20]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Convert labels to a list of lists if it's a set
    if isinstance(labels, set):
        labels = [labels]

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Optimizer

In [21]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [22]:
print(model.parameters())
print(optimizer)

<generator object Module.parameters at 0x00000200390B9540>
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 2e-05
    maximize: False
    weight_decay: 0.01
)


# Accelerator & LR

In [23]:
# accelerator = Accelerator()

# model, optimizer, train_loader, val_loader = accelerator.prepare(
#     model, optimizer, train_loader, val_loader
# )

In [24]:
# num_train_epochs = 5
# num_update_steps_per_epoch = len(train_loader)
# num_training_steps = num_train_epochs * len(train_loader)

# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps
# )

# Huggingface

In [25]:
# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
# model_saved = "bert-finetuned-ner-Raivis"
# repo_name = get_full_repo_name(model_saved)
# repo_name

'raiviswastaken/bert-finetuned-ner-Raivis'

In [27]:
# os.environ["HF_HOME"] = "true"
# output_dir = "bert-finetuned-ner-Raivis"
# # repo = Repository(output_dir, clone_from=repo_name)

# Training

In [23]:
model_saved = "bert-finetuned-ner-Raivis"
args = TrainingArguments(
    model_saved,
    evaluation_strategy="epoch",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = load_metric("seqeval")

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics,
    optimizers = (optimizer, None)
)
trainer.train()

  metric = load_metric("seqeval")
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/4685 [00:00<?, ?it/s]

Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1587, 'learning_rate': 1.7865528281750267e-05, 'epoch': 0.53}


  0%|          | 0/231 [00:00<?, ?it/s]

{'eval_loss': 0.09059310704469681, 'eval_precision': 0.8983021179765447, 'eval_recall': 0.9086402266288952, 'eval_f1': 0.9034415984508405, 'eval_accuracy': 0.9817425963228046, 'eval_runtime': 19.0902, 'eval_samples_per_second': 192.978, 'eval_steps_per_second': 12.1, 'epoch': 1.0}


Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0477, 'learning_rate': 1.5731056563500536e-05, 'epoch': 1.07}


Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0272, 'learning_rate': 1.3596584845250803e-05, 'epoch': 1.6}


  0%|          | 0/231 [00:00<?, ?it/s]

{'eval_loss': 0.10664570331573486, 'eval_precision': 0.8982285515804098, 'eval_recall': 0.9157223796033994, 'eval_f1': 0.9068911099421356, 'eval_accuracy': 0.981978313975914, 'eval_runtime': 28.6825, 'eval_samples_per_second': 128.441, 'eval_steps_per_second': 8.054, 'epoch': 2.0}


Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.022, 'learning_rate': 1.146211312700107e-05, 'epoch': 2.13}


Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0139, 'learning_rate': 9.327641408751335e-06, 'epoch': 2.67}


  0%|          | 0/231 [00:00<?, ?it/s]

{'eval_loss': 0.12913185358047485, 'eval_precision': 0.8957936645317639, 'eval_recall': 0.9162535410764873, 'eval_f1': 0.9059080962800876, 'eval_accuracy': 0.9813783054043629, 'eval_runtime': 31.2197, 'eval_samples_per_second': 118.002, 'eval_steps_per_second': 7.399, 'epoch': 3.0}


Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0117, 'learning_rate': 7.193169690501601e-06, 'epoch': 3.2}


Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-3500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0078, 'learning_rate': 5.058697972251868e-06, 'epoch': 3.74}


  0%|          | 0/231 [00:00<?, ?it/s]

{'eval_loss': 0.12599104642868042, 'eval_precision': 0.9030398322851153, 'eval_recall': 0.9151912181303116, 'eval_f1': 0.9090749208582483, 'eval_accuracy': 0.982256889384134, 'eval_runtime': 30.1141, 'eval_samples_per_second': 122.335, 'eval_steps_per_second': 7.671, 'epoch': 4.0}


Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-4000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0069, 'learning_rate': 2.924226254002135e-06, 'epoch': 4.27}


Checkpoint destination directory bert-finetuned-ner-Raivis\checkpoint-4500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0045, 'learning_rate': 7.897545357524014e-07, 'epoch': 4.8}


  0%|          | 0/231 [00:00<?, ?it/s]

{'eval_loss': 0.13532161712646484, 'eval_precision': 0.9015966678236723, 'eval_recall': 0.9197946175637394, 'eval_f1': 0.9106047326906223, 'eval_accuracy': 0.9824068915270218, 'eval_runtime': 27.3286, 'eval_samples_per_second': 134.804, 'eval_steps_per_second': 8.453, 'epoch': 5.0}
{'train_runtime': 1956.1325, 'train_samples_per_second': 38.308, 'train_steps_per_second': 2.395, 'train_loss': 0.032211088943379665, 'epoch': 5.0}


TrainOutput(global_step=4685, training_loss=0.032211088943379665, metrics={'train_runtime': 1956.1325, 'train_samples_per_second': 38.308, 'train_steps_per_second': 2.395, 'train_loss': 0.032211088943379665, 'epoch': 5.0})

In [24]:
model.save_pretrained("models/bert-ner-Raivis")

In [25]:
predictions, labels, metrics = trainer.predict(tokenized_test_dataset)
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

  0%|          | 0/231 [00:00<?, ?it/s]

{'LOC': {'precision': 0.9306634787806336,
  'recall': 0.9334532374100719,
  'f1': 0.9320562705776714,
  'number': 1668},
 'MISC': {'precision': 0.7776280323450134,
  'recall': 0.8219373219373219,
  'f1': 0.799168975069252,
  'number': 702},
 'ORG': {'precision': 0.8704663212435233,
  'recall': 0.9102950030102348,
  'f1': 0.8899352560329605,
  'number': 1661},
 'PER': {'precision': 0.962111801242236,
  'recall': 0.9579468150896723,
  'f1': 0.9600247908273939,
  'number': 1617},
 'overall_precision': 0.9015966678236723,
 'overall_recall': 0.9197946175637394,
 'overall_f1': 0.9106047326906223,
 'overall_accuracy': 0.9824068915270218}

In [26]:
def convert_to_output(sentences, predictions):
    formatted_data = []
    for sentence, prediction in zip(sentences, predictions):
        for word, tag in zip(sentence, prediction):
            formatted_data.append([word, tag])
        # Add an empty line after each sentence
        formatted_data.append(["", ""])
    return formatted_data

formatted_output = convert_to_output(test_sentences, true_predictions)

# Save formatted data to output file in format word tag, sentences are separated by empty line
def save_to_output_file(formatted_data, output_file):
    with open(output_file, 'w') as file:
        for data in formatted_data:
            file.write(data[0] + '\t' + data[1] + '\n')
        file.write('\n')  # Add an empty line after each sentence

outputname = "bert_gold_22052024.txt"
save_to_output_file(formatted_output, outputname)

In [3]:
bert_InstanceScores = getInstanceScores("bert_gold_22052024.txt", "data/gold.txt")
print(bert_InstanceScores)

0.9143058243885273


# Noise injection

In [190]:
noise_rates = {
    'capitalization_swap': [0.1, 0.15, 0.2, 0.25, 0.3], 
    'character_swap': [0.1, 0.15, 0.2, 0.25, 0.3],
    'character_removal': [0.1, 0.15, 0.2, 0.25, 0.3],
    'character_replacement': [0.1, 0.15, 0.2, 0.25, 0.3]
}

In [200]:
def read_data_grouped(file_path):
    # Initialize lists to store data
    sentence_ids = []
    tokens = []
    pos_tags = []
    chunk_tags = []
    ner_tags = []

    # Initialize list to store sentences
    sentences = []

    # Open the file
    with open(file_path, 'r') as file:
        # Initialize sentence ID counter
        sentence_id = 0

        # Initialize lists to store sentence-level data
        sentence_tokens = []
        sentence_pos_tags = []
        sentence_chunk_tags = []
        sentence_ner_tags = []

        # Iterate through lines
        for line in file:
            line = line.strip()

            # Skip empty lines
            if not line:
                if sentence_tokens:  # If the sentence has tokens
                    # Append sentence data to lists
                    sentence_ids.append(sentence_id)
                    tokens.append(sentence_tokens)
                    pos_tags.append(sentence_pos_tags)
                    chunk_tags.append(sentence_chunk_tags)
                    ner_tags.append(sentence_ner_tags)
                    sentences.append(sentence_tokens)  # Add to sentences list

                    # Reset for the next sentence
                    sentence_tokens = []
                    sentence_pos_tags = []
                    sentence_chunk_tags = []
                    sentence_ner_tags = []
                sentence_id += 1  # Increment sentence ID
                continue

            # # Skip the initial -DOCSTART- line
            # if line.startswith('-DOCSTART-'):
            #     continue

            # Split line by whitespace
            parts = line.split()

            # Extract data
            token = parts[0]
            ner_tag = parts[1]

            # Append data to sentence-level lists
            sentence_tokens.append(token)
            sentence_pos_tags.append(0)  # Append 0 for pos_tags
            sentence_chunk_tags.append(0)  # Append 0 for chunk_tags
            sentence_ner_tags.append(ner_tag)

    # Create DataFrame
    data = {
        'sentence_id': sentence_ids,
        'tokens': tokens,
        'pos_tags': pos_tags,
        'chunk_tags': chunk_tags,
        'ner_tags': ner_tags
    }
    df = pd.DataFrame(data)

    return df, sentences

In [None]:
for noise_type in noise_rates.keys():
    for rate in noise_rates[noise_type]:

        path = f'data/altered/{noise_type}_rate_{rate}.txt'
        outpath = f'predictions/altered/bert/{noise_type}_rate_{rate}.txt'

        test_df , sentences = read_data_grouped(path)
        test_dataset = Dataset.from_pandas(test_df)
        test_dataset = test_dataset.map(map_ner_tags_to_ids, batched=True)
        tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

        predictions, labels, metrics = trainer.predict(tokenized_test_dataset)
        predictions = np.argmax(predictions, axis=2)

        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        output = convert_to_output(sentences, true_predictions)
        save_to_output_file(output,outpath)

In [2]:
# Rob

def readBIO(path):
    ents = []
    curEnts = []
    for line in open(path):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[1])
    return ents

def toSpans(tags):
    spans = set()
    for beg in range(len(tags)):
        if tags[beg][0] == 'B':
            end = beg
            for end in range(beg+1, len(tags)):
                if tags[end][0] != 'I':
                    break
            spans.add(str(beg) + '-' + str(end) + ':' + tags[beg][2:])
            #print(end-beg)
    return spans

def getInstanceScores(predPath, goldPath):
    goldEnts = readBIO(goldPath)
    predEnts = readBIO(predPath)
    entScores = []
    tp = 0
    fp = 0
    fn = 0
    for goldEnt, predEnt in zip(goldEnts, predEnts):
        goldSpans = toSpans(goldEnt)
        predSpans = toSpans(predEnt)
        overlap = len(goldSpans.intersection(predSpans))
        tp += overlap
        fp += len(predSpans) - overlap
        fn += len(goldSpans) - overlap
        
    prec = 0.0 if tp+fp == 0 else tp/(tp+fp)
    rec = 0.0 if tp+fn == 0 else tp/(tp+fn)
    f1 = 0.0 if prec+rec == 0.0 else 2 * (prec * rec) / (prec + rec)
    return f1

In [203]:
types = []
rates = []
f1_score = []

for noise_type in noise_rates.keys():
    for rate in noise_rates[noise_type]:

        score = getInstanceScores(f"predictions/altered/bert/{noise_type}_rate_{rate}.txt", "data/gold.txt")
        
        # Append data to lists
        types.append(noise_type)
        rates.append(rate)
        f1_score.append(score)

results = {'Type': types, 'Rate': rates, 'F1 Score': f1_score}
df = pd.DataFrame(results)
print(df)

df.to_csv('out/df_altered_bert.csv', index=False)

                     Type  Rate  F1 Score
0     capitalization_swap  0.10  0.836195
1     capitalization_swap  0.15  0.802807
2     capitalization_swap  0.20  0.773558
3     capitalization_swap  0.25  0.733510
4     capitalization_swap  0.30  0.706244
5          character_swap  0.10  0.900361
6          character_swap  0.15  0.788925
7          character_swap  0.20  0.638737
8          character_swap  0.25  0.569830
9          character_swap  0.30  0.546829
10      character_removal  0.10  0.901050
11      character_removal  0.15  0.781415
12      character_removal  0.20  0.622951
13      character_removal  0.25  0.549648
14      character_removal  0.30  0.515959
15  character_replacement  0.10  0.901374
16  character_replacement  0.15  0.792342
17  character_replacement  0.20  0.635154
18  character_replacement  0.25  0.579648
19  character_replacement  0.30  0.558111
