# Overview of the stuff I tried here

Here's an overview of what I've tried. Some of these things worked well, some of them didn't. I only left the code that's more or less useful.

Stuff I tried:
- using spaCy's matcher to define patterns for detecting specific entities (e.g., dimensions, prices, SKUs).
- enhancing the entity detection by combining matcher results with spaCy’s default entity recognition.
- deduplicating the entities to ensure each unique entity appears only once.
- creating a custom dataset for fine-tuning a NER model, including sentences and entity annotations. Implemented a pipeline for fine-tuning a BERT model for NER tasks using the Hugging Face Transformers library. Defined the label list for various entities such as PRODUCT, BRAND, ORG, MATERIAL, and COLOR.
- tried BERT, DistilBERT, Google Electra, Deberta, ALBERT. BERT and Google Electra gave me the best results. However, the solution is still far from perfect.
- used Optuna to optimize hyperparameters of BERT

# DistilBERT

In [None]:
import re
import json
from transformers import pipeline
import torch

# Advanced text preprocessing function to remove irrelevant text and keep product-like sections
def preprocess_text(text):
    text = re.sub(r'(\b(Subscribe|Sign Up|Newsletter|Privacy|Terms|Customer Service)\b.*)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(Magazine|Gift|Sweepstakes|Watch Now|Learn More|Sponsored)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove excess whitespace
    return text

# Set up the device for inference (GPU/CPU)
device = 0 if torch.cuda.is_available() else -1

# Use a pre-trained NER model
model_name = "elastic/distilbert-base-cased-finetuned-conll03-english"
ner_model = pipeline("ner", model=model_name, aggregation_strategy="simple", device=device)

# Load the text file containing the scraped data
with open('/content/merged_scraped_data.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Preprocess the text to clean up redundant content
cleaned_text = preprocess_text(raw_text)

# Function to process text and extract product entities
def process_text(text):
    sections = re.split(r'URL:\s+(https?://[^\s]+)', text)
    structured_data = []

    for i in range(1, len(sections), 2):
        url = sections[i].strip()
        content = sections[i + 1].strip()

        # Apply the NER model to extract entities from the content
        entities = ner_model(content)

        # Filter product-like entities (keep "MISC", "ORG", "PER" for broader detection)
        product_entities = [
            {
                "word": entity['word'],
                "entity_group": entity['entity_group'],
                "confidence": float(entity['score'])  # Convert to float
            }
            for entity in entities
            if entity['entity_group'] in ['MISC', 'ORG', 'PER'] and entity['score'] > 0.6
        ]

        if product_entities:  # Only append if we found entities for this section
            structured_data.append({
                "URL": url,
                "products": product_entities
            })
        else:
            print(f"No entities found for URL: {url}")

    return structured_data

# Process the cleaned text and extract product-related information
structured_output = process_text(cleaned_text)

# Save the structured output to a JSON file
output_file = '/content/processed_final_data.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(structured_output, f, ensure_ascii=False, indent=4)

# Print the structured output for verification
print(json.dumps(structured_output, ensure_ascii=False, indent=4))

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
                "confidence": 0.844476580619812
            }
        ]
    },
    {
        "URL": "https://www.amazon.com/HOUSE-Stackable-Organizer-Multifunctional-Combinations/dp/B0B62K55RV?tag=people-onsite-backup-20",
        "products": [
            {
                "word": "Black Nightstand",
                "entity_group": "ORG",
                "confidence": 0.9057246446609497
            },
            {
                "word": "Glass Shade",
                "entity_group": "ORG",
                "confidence": 0.6364215612411499
            },
            {
                "word": "Des",
                "entity_group": "ORG",
                "confidence": 0.6165320873260498
            },
            {
                "word": "##k Lamps",
                "entity_group": "ORG",
                "confidence": 0.6469679474830627
            },
            {
                "word": "##pps & Games A

# Testing the model results on some sample sentences

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizer and Model Setup
model_name = "elastic/distilbert-base-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# Initialize the NER pipeline using the loaded model and tokenizer
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)

# Function to extract entities from input text
def extract_entities(text):
    entities = ner_pipeline(text)

    extracted_entities = []
    for entity in entities:
        extracted_entities.append({
            'word': entity['word'],
            'entity_group': entity['entity_group'],
            'confidence': entity['score']
        })
    return extracted_entities

# Test input text directly (replace this with your actual test data)
user_input = """
Novogratz Brittany Sofa Futon - Premium Linen Upholstery and Wooden Legs - Navy Blue

This 32-inch Philips Smart Roku TV has over 4,900 perfect ratings and is on sale at Walmart.
"""

# Extract entities and display results
results = extract_entities(user_input)

# Print the extracted entities
print("Extracted Entities:")
if results:
    for result in results:
        print(f"Word: {result['word']}, Entity Group: {result['entity_group']}, Confidence: {result['confidence']:.4f}")
else:
    print("No entities found.")


Extracted Entities:
Word: Nov, Entity Group: ORG, Confidence: 0.9981
Word: ##og, Entity Group: ORG, Confidence: 0.9978
Word: ##rat, Entity Group: ORG, Confidence: 0.9921
Word: ##z Brittany, Entity Group: ORG, Confidence: 0.7639
Word: Sofa Futon, Entity Group: MISC, Confidence: 0.7157
Word: Line, Entity Group: ORG, Confidence: 0.5919
Word: ##n Upholstery, Entity Group: ORG, Confidence: 0.9297
Word: Wood, Entity Group: ORG, Confidence: 0.9050
Word: ##en, Entity Group: ORG, Confidence: 0.9386
Word: Legs, Entity Group: PER, Confidence: 0.7280
Word: Navy Blue, Entity Group: ORG, Confidence: 0.6799
Word: Philips Smart Roku TV, Entity Group: MISC, Confidence: 0.8245
Word: W, Entity Group: ORG, Confidence: 0.9975
Word: ##al, Entity Group: ORG, Confidence: 0.9973
Word: ##mart, Entity Group: ORG, Confidence: 0.9960


# Google Electra

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizer and Model Setup
model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

# Initialize the NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)

# Function to post-process and combine WordPiece tokens
def combine_wordpieces(entities):
    combined_entities = []
    current_entity = {}

    for entity in entities:
        if entity['word'].startswith("##"):  # Continuation of a word
            current_entity['word'] += entity['word'][2:]
            current_entity['score'] = max(current_entity['score'], entity['score'])  # Keep highest confidence score
        else:
            if current_entity:  # Append previous entity if exists
                combined_entities.append(current_entity)
            current_entity = {
                'word': entity['word'],
                'entity_group': entity['entity_group'],
                'score': entity['score']
            }
    if current_entity:  # Add the last entity
        combined_entities.append(current_entity)

    return combined_entities

# Custom function to improve PRODUCT detection
def enhance_product_detection(entities):
    product_keywords = ['sofa', 'chair', 'table', 'futon', 'bed', 'tv', 'lamp', 'rug', 'ottoman', 'stool']

    for entity in entities:
        entity_lower = entity['word'].lower()
        if any(keyword in entity_lower for keyword in product_keywords):
            entity['entity_group'] = 'PRODUCT'  # Reclassify as PRODUCT if matching keywords

    return entities

# Full pipeline: Extract, combine tokens, enhance PRODUCT detection
def extract_entities(text):
    entities = ner_pipeline(text)
    combined_entities = combine_wordpieces(entities)
    enhanced_entities = enhance_product_detection(combined_entities)

    return enhanced_entities

# Example input text
user_input = """
We like the Branch Verve because it’s one of the most comfortable office chairs you can buy, plus it’s attractive and attainably priced.
Fitted with an adjustable seat, armrest and lumbar support, it comes with plenty of features to tailor it
to your comfort preferences and workspace. (We especially recommend the Branch Verve for
shorter people since the seat can be lowered to 16.5 inches.)
Those looking for an office chair with a headrest needn’t fret:
You can purchase one separately and in a color that matches your chair.
Another plus for the Branch Verve: It has a soft-knit back and is available in five sleek colors, making it a practical and beautiful addition to your office space.
"""

# Extract entities and display results
results = extract_entities(user_input)

# Print the extracted entities
print("Extracted Entities:")
if results:
    for result in results:
        print(f"Word: {result['word']}, Entity Group: {result['entity_group']}, Confidence: {result['score']:.4f}")
else:
    print("No entities found.")


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Extracted Entities:
Word: We like the, Entity Group: LABEL_1, Confidence: 0.6533
Word: Branch, Entity Group: LABEL_0, Confidence: 0.5050
Word: Verve because, Entity Group: LABEL_1, Confidence: 0.5484
Word: it’s one of the most, Entity Group: LABEL_0, Confidence: 0.7570
Word: comfortable, Entity Group: PRODUCT, Confidence: 0.6757
Word: office chairs you can buy, plus it’s, Entity Group: PRODUCT, Confidence: 0.6191
Word: attractive and attain, Entity Group: LABEL_1, Confidence: 0.6011
Word: ably priced. Fitted with an adjustable seat,, Entity Group: PRODUCT, Confidence: 0.6974
Word: armrest, Entity Group: LABEL_1, Confidence: 0.5313
Word: and lumbar support,, Entity Group: LABEL_0, Confidence: 0.6082
Word: it, Entity Group: LABEL_1, Confidence: 0.6365
Word: comes with plenty of, Entity Group: LABEL_0, Confidence: 0.7404
Word: features to tailor it, Entity Group: LABEL_1, Confidence: 0.5922
Word: to your, Entity Group: LABEL_0, Confidence: 0.5676
Word: comfort preferences, Entity Group: L

# BERT

In [None]:
import torch
from transformers import pipeline

# Load the pre-trained BERT-based NER model
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

# Check if GPU is available and set device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Example sentences with furniture-related descriptions
furniture_sentences = [
    "The Lincoln is a smart, Howard style sofa with a tight sprung back and curved arms.",
    "The Novogratz Brittany Sofa Futon comes with wooden legs and a sleek design, available in navy blue.",
    "This Phillips Smart Roku TV is on sale at Walmart.",
    "The sleek Barcelona collection includes leather chairs and elegant sofas with steel frames."
]

# Run NER on furniture sentences
for sentence in furniture_sentences:
    print(f"Sentence: {sentence}")
    entities = ner_pipeline(sentence)

    # Print each entity extracted by the model
    for entity in entities:
        print(f"Word: {entity['word']}, Entity: {entity['entity_group']}, Confidence: {entity['score']:.4f}")
    print("\n")  # Line break between sentences


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sentence: The Lincoln is a smart, Howard style sofa with a tight sprung back and curved arms.
Word: Lincoln, Entity: MISC, Confidence: 0.7826
Word: Howard, Entity: MISC, Confidence: 0.9721


Sentence: The Novogratz Brittany Sofa Futon comes with wooden legs and a sleek design, available in navy blue.
Word: Novogratz Brittany Sofa Futon, Entity: MISC, Confidence: 0.9687


Sentence: This Phillips Smart Roku TV is on sale at Walmart.
Word: Phillips Smart Roku TV, Entity: ORG, Confidence: 0.9775
Word: Walmart, Entity: ORG, Confidence: 0.9847


Sentence: The sleek Barcelona collection includes leather chairs and elegant sofas with steel frames.
Word: Barcelona, Entity: LOC, Confidence: 0.9981




# Post-processing (might help to refine the outputs of NER models)

In [1]:
import re
from transformers import pipeline

# Load pre-trained NER pipeline
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple", device=0)

# Define a list of furniture-related keywords for post-processing
furniture_keywords = [
    "sofa", "futon", "table", "chair", "bed", "couch", "ottoman", "dresser",
    "desk", "cabinet", "shelf", "recliner", "bench", "armchair", "nightstand", "wardrobe"
]

# Function to refine NER results with keyword matching
def refine_ner_results(text, entities):
    refined_entities = []
    # Extract NER results from the pipeline
    for entity in entities:
        refined_entities.append({
            'word': entity['word'],
            'entity': entity['entity_group'],
            'confidence': entity['score']
        })

    # Keyword-based post-processing: mark product-related keywords as 'PRODUCT'
    for keyword in furniture_keywords:
        if re.search(rf"\b{keyword}\b", text, re.IGNORECASE):
            refined_entities.append({
                'word': keyword,
                'entity': 'PRODUCT',
                'confidence': 1.0  # High confidence for manual rule
            })

    return refined_entities

# Test sentences
furniture_sentences = [
    "The Lincoln is a smart, Howard style sofa with a tight sprung back and curved arms.",
    "The Novogratz Brittany Sofa Futon comes with wooden legs and a sleek design, available in navy blue.",
    "Floating Shelves for Wall, Wall Mounted Rustic Wood Shelves for Bathroom, Bedroom, Living Room, Kitchen, Hanging Shelf for Books/Storage/Room Decor with 22lbs Capacity (Black, Set of 3, 16in)",
    "That’s why we recommend the Sayl chair from Herman Miller for remote workers with back pain. Built with a unique seat back made from flexible elastomer, the Sayl excels at supporting spine alignment. Plus you can add adjustable lumbar support (for an extra cost).."
]

# Run NER and post-process results
for sentence in furniture_sentences:
    print(f"Sentence: {sentence}")
    entities = ner_pipeline(sentence)
    refined_entities = refine_ner_results(sentence, entities)

    # Print results
    for entity in refined_entities:
        print(f"Word: {entity['word']}, Entity: {entity['entity']}, Confidence: {entity['confidence']:.4f}")
    print("\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Sentence: The Lincoln is a smart, Howard style sofa with a tight sprung back and curved arms.
Word: Lincoln, Entity: MISC, Confidence: 0.7826
Word: Howard, Entity: MISC, Confidence: 0.9721
Word: sofa, Entity: PRODUCT, Confidence: 1.0000


Sentence: The Novogratz Brittany Sofa Futon comes with wooden legs and a sleek design, available in navy blue.
Word: Novogratz Brittany Sofa Futon, Entity: MISC, Confidence: 0.9687
Word: sofa, Entity: PRODUCT, Confidence: 1.0000
Word: futon, Entity: PRODUCT, Confidence: 1.0000


Sentence: Floating Shelves for Wall, Wall Mounted Rustic Wood Shelves for Bathroom, Bedroom, Living Room, Kitchen, Hanging Shelf for Books/Storage/Room Decor with 22lbs Capacity (Black, Set of 3, 16in)
Word: She, Entity: ORG, Confidence: 0.6082
Word: Wall, Entity: LOC, Confidence: 0.9378
Word: Wall, Entity: LOC, Confidence: 0.3936
Word: Mounted Rustic, Entity: MISC, Confidence: 0.7980
Word: Bathroom, Entity: LOC, Confidence: 0.9069
Word: Bedroom, Entity: LOC, Confidence: 0.906

# Finetuning BERT with a small custom furniture dataset

In [None]:
from datasets import load_dataset, DatasetDict

# Load your dataset from the CSV file
dataset = load_dataset("csv", data_files="/content/furniture_ner_dataset.csv")

# Split dataset into train and validation sets (e.g., 80% train, 20% validation)
train_test_split = dataset['train'].train_test_split(test_size=0.2)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'],
})

In [None]:
import ast
import torch
import pandas as pd
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification

# Load custom dataset
dataset = load_dataset("csv", data_files="/content/furniture_ner_dataset.csv")

# Split dataset into train and validation sets
train_test_split = dataset['train'].train_test_split(test_size=0.2)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'],
})

# Define label list
label_list = ["O", "PRODUCT", "BRAND", "ORG", "MATERIAL", "COLOR"]

# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

# Function to parse entity string into a structured format
def parse_entities(entities):
    try:
        # The input format seems to be a string representation of a list, we need to parse it
        entities = ast.literal_eval(entities)
        return [(entity['start'], entity['end'], entity['label']) for entity in entities]
    except (ValueError, SyntaxError, KeyError):
        return []  # Return an empty list if parsing fails

# Tokenization and label alignment function
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["sentence"], truncation=True, is_split_into_words=False, padding=True)

    labels = []
    for i, entity_data in enumerate(examples["entities"]):
        entities = parse_entities(entity_data)
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100] * len(tokenized_inputs["input_ids"][i])  # Initialize labels as -100 for special tokens

        # Assign labels for each word based on entity spans
        for start, end, label in entities:
            for word_idx in range(len(word_ids)):
                # Check if token position is in entity range
                if word_ids[word_idx] is not None and word_ids[word_idx] >= start and word_ids[word_idx] < end:
                    label_ids[word_idx] = label_list.index(label)

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and alignment
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    logging_dir="./logs"
)

# Trainer setup
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,1.566775
2,No log,1.382329
3,No log,1.30204


TrainOutput(global_step=6, training_loss=1.4264589945475261, metrics={'train_runtime': 34.3747, 'train_samples_per_second': 1.484, 'train_steps_per_second': 0.175, 'total_flos': 598656601332.0, 'train_loss': 1.4264589945475261, 'epoch': 3.0})

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load your dataset
# Make sure your dataset has a 'tokens' and 'ner_tags' field
dataset = load_dataset('your_dataset')  # Replace with your dataset path

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

# Tokenization and alignment function
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Tokenize and align your dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    save_steps=10_000
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model("./fine-tuned-bert-product-ner")


In [None]:
# Example furniture sentences after fine-tuning
furniture_sentences = [
    "The Lincoln is a smart, Howard style sofa with a tight sprung back and curved arms.",
    "The Novogratz Brittany Sofa Futon comes with wooden legs and a sleek design, available in navy blue.",
    "This Phillips Smart Roku TV is on sale at Walmart.",
    "The sleek Barcelona collection includes leather chairs and elegant sofas with steel frames."
]

# Run NER on the sentences
for sentence in furniture_sentences:
    print(f"Sentence: {sentence}")
    entities = ner_pipeline(sentence)
    for entity in entities:
        print(f"Word: {entity['word']}, Entity: {entity['entity_group']}, Confidence: {entity['score']:.4f}")
    print("\n")
