In [1]:
import os
import pandas as pd
from transformers import BertTokenizer

# Initialize tokenizer and label dictionary for conversion to BERT format
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
label2id = {'O': 0, 'B-[FILING_DATE]': 1, 'B-[COMPANY_NAME]': 2, 'I-[COMPANY_NAME]': 3, 'B-[COMPANY_NUMBER]': 4,
            'B-[RETURN_DATE]': 5, 'B-[SIC_CODES]': 6, 'B-[COMPANY_TYPE]': 7, 'B-[REGISTERED_ADDRESS]': 8,
            'I-[REGISTERED_ADDRESS]': 9}  # Add more labels as needed

# Initialize lists to store all data for BERT
input_ids = []
attention_masks = []
label_ids = []

# Define path to training data directory
train_path = os.path.join('datasetfiles', 'train')

# Process each pair of .txt and .csv files in the train folder
for filename in os.listdir(train_path):
    if filename.endswith('.txt'):
        # Load text data and corresponding labels file
        text_file_path = os.path.join(train_path, filename)
        label_file_path = os.path.join(train_path, filename.replace('.txt', '.csv'))

        with open(text_file_path, 'r') as file:
            text_data = file.read()

        labels_df = pd.read_csv(label_file_path)

        # Split the text data into sentences based on double newlines
        sentences = [sentence.strip() for sentence in text_data.split('\n\n') if sentence.strip()]

        # Clean up tokens in the labels DataFrame (remove extra quotes if any)
        labels_df['tokens'] = labels_df['tokens'].str.replace("'", "", regex=False)
        labels_df['labels'] = labels_df['labels'].str.replace("'", "", regex=False)

        # Initialize variables for organizing tokens and labels by sentence
        sentence_token_labels = []
        current_sentence_tokens = []
        current_sentence_labels = []
        sentence_index = 0
        sentence_length = len(sentences)

        # Iterate over each token-label pair in the labels dataframe
        for idx, row in labels_df.iterrows():
            token = row['tokens']
            label = row['labels']

            # Check if token belongs to the current sentence
            if sentence_index < sentence_length and token in sentences[sentence_index]:
                current_sentence_tokens.append(token)
                current_sentence_labels.append(label)
            else:
                # Finalize the current sentence grouping and move to the next
                sentence_token_labels.append((current_sentence_tokens, current_sentence_labels))
                current_sentence_tokens = [token]  # Start new sentence tokens
                current_sentence_labels = [label]
                sentence_index += 1

        # Capture any remaining tokens in the last sentence
        if current_sentence_tokens:
            sentence_token_labels.append((current_sentence_tokens, current_sentence_labels))

        # Convert sentence_token_labels into BERT-compatible format
        for tokens, labels in sentence_token_labels:
            # Tokenize and prepare inputs for BERT
            tokenized_input = tokenizer(tokens, is_split_into_words=True, padding='max_length', truncation=True, max_length=128)
            input_ids.append(tokenized_input['input_ids'])
            attention_masks.append(tokenized_input['attention_mask'])

            # Convert labels to label IDs
            label_id = [label2id.get(label, 0) for label in labels]
            # Pad labels to match max sequence length
            label_id += [0] * (128 - len(label_id))  # Pad to max_length if needed
            label_ids.append(label_id)

# The variables input_ids, attention_masks, and label_ids are now ready for training


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [2]:
import torch
from transformers import BertForTokenClassification, BertTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Define custom dataset for PyTorch
class NERDataset(Dataset):
    def __init__(self, input_ids, attention_masks, label_ids):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.label_ids = label_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.label_ids[idx], dtype=torch.long)
        }

# Initialize dataset and dataloader
train_dataset = NERDataset(input_ids, attention_masks, label_ids)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Load BERT for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label2id))
os.environ["WANDB_DISABLED"] = "true"
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset          # training dataset
)

# Train the model
trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,2.3746
20,2.2749
30,2.0162
40,1.6299
50,0.9768
60,0.3585
70,0.0898
80,0.0243
90,0.0092
100,0.0054


TrainOutput(global_step=1356, training_loss=0.07317735461333072, metrics={'train_runtime': 356.9444, 'train_samples_per_second': 30.332, 'train_steps_per_second': 3.799, 'total_flos': 707316151011840.0, 'train_loss': 0.07317735461333072, 'epoch': 3.0})

In [3]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json')

In [4]:
# Load and process test text and labels
with open('datasetfiles/test/doc_5.txt', 'r') as file:
    test_text_data = file.read()

test_labels_df = pd.read_csv('datasetfiles/test/doc_5.csv')

# Split test text data into sentences
test_sentences = [sentence.strip() for sentence in test_text_data.split('\n\n') if sentence.strip()]

# Clean up tokens in the labels DataFrame (remove extra quotes if any)
test_labels_df['tokens'] = test_labels_df['tokens'].str.replace("'", "", regex=False)
test_labels_df['labels'] = test_labels_df['labels'].str.replace("'", "", regex=False)

# Initialize variables for test token and label alignment
test_sentence_token_labels = []
current_sentence_tokens = []
current_sentence_labels = []
test_sentence_index = 0
test_sentence_length = len(test_sentences)

# Align test tokens with sentences
for idx, row in test_labels_df.iterrows():
    token = row['tokens']
    label = row['labels']

    if test_sentence_index < test_sentence_length and token in test_sentences[test_sentence_index]:
        current_sentence_tokens.append(token)
        current_sentence_labels.append(label)
    else:
        test_sentence_token_labels.append((current_sentence_tokens, current_sentence_labels))
        current_sentence_tokens = [token]
        current_sentence_labels = [label]
        test_sentence_index += 1

# Capture remaining tokens
if current_sentence_tokens:
    test_sentence_token_labels.append((current_sentence_tokens, current_sentence_labels))

# Tokenize and prepare test data for BERT
test_input_ids = []
test_attention_masks = []
test_label_ids = []

for tokens, labels in test_sentence_token_labels:
    tokenized_input = tokenizer(tokens, is_split_into_words=True, padding='max_length', truncation=True, max_length=128)
    test_input_ids.append(tokenized_input['input_ids'])
    test_attention_masks.append(tokenized_input['attention_mask'])

    label_id = [label2id.get(label, 0) for label in labels]
    label_id += [0] * (128 - len(label_id))
    test_label_ids.append(label_id)

# Define test dataset
test_dataset = NERDataset(test_input_ids, test_attention_masks, test_label_ids)

# Evaluate model on test dataset
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.0003189194540027529,
 'eval_runtime': 6.1107,
 'eval_samples_per_second': 147.447,
 'eval_steps_per_second': 18.492,
 'epoch': 3.0}

In [5]:
# Define id2label dictionary to map IDs back to label names
id2label = {v: k for k, v in label2id.items()}

# Get predictions using Trainer
predictions = trainer.predict(test_dataset)

# Extract predicted label IDs
pred_label_ids = predictions.predictions.argmax(-1)
true_label_ids = predictions.label_ids

# Convert label IDs to label names
pred_labels = [[id2label[label_id] for label_id in sentence] for sentence in pred_label_ids]
true_labels = [[id2label[label_id] for label_id in sentence] for sentence in true_label_ids]

# Display a few sample predictions with true labels for comparison
for i in range(3):  # Display first 3 examples
    print("Tokens: ", test_sentence_token_labels[i][0])
    print("True Labels: ", true_labels[i])
    print("Predicted Labels: ", pred_labels[i])
    print()

Tokens:  ['Companies', 'House', 'ARO', '1', '(ef)']
True Labels:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [6]:
# Iterate through each example and display only correct predictions
for i in range(len(test_sentence_token_labels)):
    tokens = test_sentence_token_labels[i][0]
    true_label_sequence = true_labels[i]
    pred_label_sequence = pred_labels[i]

    # Filter correct predictions (where true label matches predicted label)
    correct_predictions = [
        (token, true_label, pred_label)
        for token, true_label, pred_label in zip(tokens, true_label_sequence, pred_label_sequence)
        if true_label == pred_label
    ]

    # Display correct predictions for the current example
    if correct_predictions:
        print(f"Example {i + 1} - Correct Predictions:")
        for token, true_label, pred_label in correct_predictions:
            print(f"Token: {token}, True Label: {true_label}, Predicted Label: {pred_label}")
        print()

Example 1 - Correct Predictions:
Token: Companies, True Label: O, Predicted Label: O
Token: House, True Label: O, Predicted Label: O
Token: ARO, True Label: O, Predicted Label: O
Token: 1, True Label: O, Predicted Label: O
Token: (ef), True Label: O, Predicted Label: O

Example 2 - Correct Predictions:
Token: Annual, True Label: O, Predicted Label: O
Token: Return, True Label: O, Predicted Label: O

Example 3 - Correct Predictions:
Token: Received, True Label: O, Predicted Label: O
Token: for, True Label: O, Predicted Label: O
Token: filing, True Label: O, Predicted Label: O
Token: in, True Label: O, Predicted Label: O
Token: Electronic, True Label: O, Predicted Label: O
Token: Format, True Label: O, Predicted Label: O
Token: on, True Label: O, Predicted Label: O
Token: the:, True Label: O, Predicted Label: O
Token: 13/04/2022, True Label: B-[FILING_DATE], Predicted Label: B-[FILING_DATE]
Token: X4LE264B, True Label: O, Predicted Label: O
Token: Company, True Label: O, Predicted Label:

In [7]:
# Iterate through each example and display only correct predictions (excluding "O" labels)
for i in range(len(test_sentence_token_labels)):
    tokens = test_sentence_token_labels[i][0]
    true_label_sequence = true_labels[i]
    pred_label_sequence = pred_labels[i]

    # Filter correct predictions where true label matches predicted label and is not "O"
    correct_predictions = [
        (token, true_label, pred_label)
        for token, true_label, pred_label in zip(tokens, true_label_sequence, pred_label_sequence)
        if true_label == pred_label and true_label != "O"
    ]

    # Display correct predictions for the current example
    if correct_predictions:
        print(f"Example {i + 1} - Correct Predictions (Excluding 'O'):")
        for token, true_label, pred_label in correct_predictions:
            print(f"Token: {token}, True Label: {true_label}, Predicted Label: {pred_label}")
        print()

Example 3 - Correct Predictions (Excluding 'O'):
Token: 13/04/2022, True Label: B-[FILING_DATE], Predicted Label: B-[FILING_DATE]
Token: ALLIANCE, True Label: B-[COMPANY_NAME], Predicted Label: B-[COMPANY_NAME]
Token: PARTNERS, True Label: I-[COMPANY_NAME], Predicted Label: I-[COMPANY_NAME]
Token: LIMITED, True Label: I-[COMPANY_NAME], Predicted Label: I-[COMPANY_NAME]
Token: 71027, True Label: B-[SIC_CODES], Predicted Label: B-[SIC_CODES]
Token: LIMITED, True Label: B-[COMPANY_TYPE], Predicted Label: B-[COMPANY_TYPE]
Token: Victoria, True Label: I-[REGISTERED_ADDRESS], Predicted Label: I-[REGISTERED_ADDRESS]
Token: Street, True Label: I-[REGISTERED_ADDRESS], Predicted Label: I-[REGISTERED_ADDRESS]
Token: Brighton, True Label: I-[REGISTERED_ADDRESS], Predicted Label: I-[REGISTERED_ADDRESS]
Token: X34, True Label: I-[REGISTERED_ADDRESS], Predicted Label: I-[REGISTERED_ADDRESS]
Token: 8AT, True Label: I-[REGISTERED_ADDRESS], Predicted Label: I-[REGISTERED_ADDRESS]
Token: Kingdom, True La

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Flatten the lists to calculate metrics at the token level
flattened_true_labels = []
flattened_pred_labels = []

for true_seq, pred_seq in zip(true_labels, pred_labels):
    for true_label, pred_label in zip(true_seq, pred_seq):
        if true_label != "O":  # Exclude "O" for precision, recall, and F1
            flattened_true_labels.append(true_label)
            flattened_pred_labels.append(pred_label)

# Calculate token-level accuracy, including "O" labels
flattened_all_true_labels = [label for seq in true_labels for label in seq]
flattened_all_pred_labels = [label for seq in pred_labels for label in seq]
accuracy = accuracy_score(flattened_all_true_labels, flattened_all_pred_labels)

# Calculate precision, recall, and F1 score for entity labels only
precision = precision_score(flattened_true_labels, flattened_pred_labels, average="weighted")
recall = recall_score(flattened_true_labels, flattened_pred_labels, average="weighted")
f1 = f1_score(flattened_true_labels, flattened_pred_labels, average="weighted")

# Display the metrics
print(f"Accuracy (including 'O' labels): {accuracy:.4f}")
print(f"Precision (entities only): {precision:.4f}")
print(f"Recall (entities only): {recall:.4f}")
print(f"F1 Score (entities only): {f1:.4f}")


Accuracy (including 'O' labels): 0.9999
Precision (entities only): 0.8889
Recall (entities only): 0.7778
F1 Score (entities only): 0.8200


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
import torch
from transformers import BertForTokenClassification, BertTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from torch.utils.data import DataLoader

# Initialize base BERT model for token classification
base_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label2id))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the NERDataset class (reused from training code)
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_masks, label_ids):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.label_ids = label_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.label_ids[idx], dtype=torch.long)
        }

# Prepare the test dataset and dataloader
test_dataset = NERDataset(test_input_ids, test_attention_masks, test_label_ids)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Make predictions with the base model
base_model.eval()
pred_labels = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Get model outputs
        outputs = base_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get predicted labels
        predictions = torch.argmax(logits, dim=-1)

        # Flatten and filter out padding tokens for both predictions and labels
        for i in range(predictions.shape[0]):  # Iterate over each example in the batch
            active_labels = labels[i][attention_mask[i] == 1]  # Only non-padding positions
            active_preds = predictions[i][attention_mask[i] == 1]

            pred_labels.extend(active_preds.cpu().numpy())
            true_labels.extend(active_labels.cpu().numpy())

# Convert numeric labels back to names for calculating metrics
pred_label_names = [id2label[label] for label in pred_labels]
true_label_names = [id2label[label] for label in true_labels]

# Calculate accuracy (including "O" labels)
accuracy = accuracy_score(true_label_names, pred_label_names)

# Calculate precision, recall, and F1 score for entity labels only (excluding "O")
non_o_true_labels = [label for label in true_label_names if label != "O"]
non_o_pred_labels = [label for label, true in zip(pred_label_names, true_label_names) if true != "O"]

precision = precision_score(non_o_true_labels, non_o_pred_labels, average="weighted")
recall = recall_score(non_o_true_labels, non_o_pred_labels, average="weighted")
f1 = f1_score(non_o_true_labels, non_o_pred_labels, average="weighted")

# Display the metrics
print(f"Accuracy (including 'O' labels): {accuracy:.4f}")
print(f"Precision (entities only): {precision:.4f}")
print(f"Recall (entities only): {recall:.4f}")
print(f"F1 Score (entities only): {f1:.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy (including 'O' labels): 0.0476
Precision (entities only): 0.0000
Recall (entities only): 0.0000
F1 Score (entities only): 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
