In [2]:
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_metric, Dataset
import numpy as np

2024-10-06 11:39:40.689123: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-06 11:39:40.773704: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=3)
metric = load_metric("seqeval")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("seqeval")


In [4]:
def read_bio_file(file_path):
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, 'r') as f:
        for line in f:
            # Strip any extra whitespace from the line
            line = line.strip()

            # If we encounter an empty line, we finish the current sentence
            if not line:
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
            else:
                # Split the word and the label by space
                word, tag = line.split()
                sentence.append(word)
                label.append(tag)

        # Catch any final sentence that might not end with a newline
        if sentence:
            sentences.append(sentence)
            labels.append(label)

    return sentences, labels

from datasets import Dataset

# Load the sentences and labels from the BIO file
sentences, labels = read_bio_file('../src/output_bio.txt')

# Create a dataset dictionary with 'text' and 'labels'
dataset_dict = {
    'text': sentences,
    'labels': labels
}

# Convert the dictionary to Hugging Face Dataset format
dataset = Dataset.from_dict(dataset_dict)

print(dataset[0])

{'text': ['Rusitc', 'Two', 'Tone', 'Gathering', 'Table', 'and', 'Barstools', '-', 'Allwood', 'Furniture', 'Menu', 'Home', 'Catalog', 'All', 'Collections', 'Barstool', 'Collection', 'Dining', 'Collection', 'Pub', 'Collection', 'Amish', 'Collection', 'Sizing', 'Contact', 'Order', 'Form', 'Continue', 'Shopping', 'Your', 'Cart', 'is', 'Empty', 'SUBMIT', 'ORDERS', 'AND', 'PRODUCT', 'ISSUES', 'WITH', 'PICTURE', 'VIA', 'EMAIL:', 'support@allwoodfurn.com', 'Allwood', 'Furniture', 'Home', 'Catalog', '▾', 'All', 'Collections', 'Barstool', 'Collection', 'Dining', 'Collection', 'Pub', 'Collection', 'Amish', 'Collection', 'Sizing', 'Contact', 'Order', 'Form', 'Allwood', 'Furniture', 'Home', '/', 'Products', '/', 'Group', '#119', 'Group', '#119', '5600/5601', 'Rustic', 'Two', 'Tone', 'Gathering', 'Table', 'and', 'Barstools', 'Table:', '5600', '*', '42”W', 'x', '60/78”D', 'x', '36”H', '*', 'Rustic', 'Two-Tone', 'Rectangular', 'Gathering', 'Table', 'w/Self-Storing', 'Butterfly', 'Leaf', 'Barstool:', '

In [5]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    # Tokenize the input text
    tokenized_inputs = tokenizer(
        examples['text'],  # Tokenizing the 'text' field
        padding='max_length',  # Pad to max length
        truncation=True,  # Truncate sequences if they exceed max length
        max_length=128,  # Set a max length for the sequences
        is_split_into_words=True,  # Use this because we are dealing with word-level inputs
    )

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word IDs for the tokenized inputs
        aligned_labels = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:  # Special tokens like [CLS] or [SEP]
                aligned_labels.append(-100)  # -100 will be ignored in the loss function
            elif word_idx != previous_word_idx:  # First token of a word
                aligned_labels.append(label[word_idx])
            else:  # Subword token
                aligned_labels.append(label[word_idx] if label_all_tokens else -100)

            previous_word_idx = word_idx

        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels  # Add aligned labels to the tokenized inputs
    return tokenized_inputs



from datasets import Dataset

# Create label-to-ID mapping
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT']  # Add all possible labels here
label_to_id = {label: idx for idx, label in enumerate(label_list)}

# Function to encode string labels into integers
def encode_labels(labels, label_to_id):
    return [[label_to_id[label] for label in sentence_labels] for sentence_labels in labels]

from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (80% train, 20% val)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    dataset['text'], dataset['labels'], test_size=0.2, random_state=42
)

# Create the train_data and val_data dictionaries
train_data = {
    'text': train_texts,
    'labels': encode_labels(train_labels, label_to_id)
}

val_data = {
    'text': val_texts,
    'labels': encode_labels(val_labels, label_to_id)
}

# Convert the data to Dataset format (train and validation sets)
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
train_dataset = train_dataset.map(lambda examples: tokenize_and_align_labels(examples, label_all_tokens=True), batched=True)
val_dataset = val_dataset.map(lambda examples: tokenize_and_align_labels(examples, label_all_tokens=True), batched=True)


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [6]:
# Set up the Trainer
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.411011
2,No log,0.413168
3,No log,0.403532


TrainOutput(global_step=15, training_loss=0.3847005526224772, metrics={'train_runtime': 89.3423, 'train_samples_per_second': 2.686, 'train_steps_per_second': 0.168, 'total_flos': 15677947146240.0, 'train_loss': 0.3847005526224772, 'epoch': 3.0})

In [7]:
from sklearn.metrics import classification_report

# Load the trained model and evaluate
predictions, labels, _ = trainer.predict(val_dataset)

# Get the predicted class index for each token
predictions = np.argmax(predictions, axis=2)

# Flatten the true and predicted labels and remove the ignored index (-100)
true_labels = []
pred_labels = []

for i in range(len(labels)):
    true_label = [label_list[l] for l in labels[i] if l != -100]  # True labels without ignored tokens
    pred_label = [label_list[p] for j, p in enumerate(predictions[i]) if labels[i][j] != -100]  # Predictions without ignored tokens
    true_labels.extend(true_label)  # Extend instead of append to flatten
    pred_labels.extend(pred_label)  # Extend instead of append to flatten

# Now print the classification report
print(classification_report(true_labels, pred_labels, zero_division=0))

              precision    recall  f1-score   support

   B-PRODUCT       0.00      0.00      0.00        52
   I-PRODUCT       0.00      0.00      0.00       188
           O       0.90      1.00      0.95      2198

    accuracy                           0.90      2438
   macro avg       0.30      0.33      0.32      2438
weighted avg       0.81      0.90      0.85      2438



In [9]:
sentence = "Ada Sofa is our new product."

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt")
outputs = model(**inputs).logits

# Get the predicted token labels
predicted_token_class_ids = np.argmax(outputs.detach().numpy(), axis=-1)
predicted_labels = [model.config.id2label[t] for t in predicted_token_class_ids[0]]

# Display the predictions
print(predicted_labels)

['LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0', 'LABEL_0']
