In [1]:
from torch.utils.data import Dataset
import torch
from tqdm import tqdm

import json
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from transformers import BertConfig

from torch import nn
from transformers import BertModel, BertPreTrainedModel
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the annotated NER examples. The encoding is given explicitly so the
# file is decoded the same way regardless of the platform's locale default.
with open("nerdataset00.txt", "r", encoding="utf-8") as f:
    dataset_json = json.load(f)


In [3]:
# Tag inventory: a tag's position in the tuple below is its integer class id.
label2id = {
    tag: class_id
    for class_id, tag in enumerate((
        "O",
        "B-SUBSCRIPTION",
        "I-SUBSCRIPTION",
        "B-DATE",
        "I-DATE",
        "B-PRICE",
        "I-PRICE",
    ))
}
# Reverse lookup: class id -> tag string.
id2label = {class_id: tag for tag, class_id in label2id.items()}


In [4]:
 

class NERDataset(Dataset):
    """Dataset that aligns word-level tag strings with sub-word tokens.

    Each element of ``data`` is read as a mapping with a ``"tokens"`` list
    (pre-split words) and a ``"labels"`` list of tag strings indexed by word
    position. Tag strings must be keys of ``label2id``.

    Args:
        data: Indexable collection of examples as described above.
        tokenizer: Callable tokenizer whose result exposes ``word_ids()``.
        label2id: Mapping from tag string to integer class id.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, data, tokenizer, label2id, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict of tensors.

        The result holds every field produced by the tokenizer plus a
        ``"labels"`` entry with one class id per token position. Positions
        that have no word (``word_ids()`` gives ``None``) or whose word has no
        entry in the example's label list get ``-100``.
        """
        item = self.data[idx]
        tokens = item["tokens"]
        labels = item["labels"]

        # The offset mapping is not requested: alignment only needs word_ids().
        encoding = self.tokenizer(tokens,
                                  is_split_into_words=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_length)

        label_ids = []
        prev_word_idx = None
        for word_idx in encoding.word_ids():
            if word_idx is None or word_idx >= len(labels):
                # No word behind this position, or the word is unlabeled.
                label_ids.append(-100)
            else:
                label = labels[word_idx]
                if word_idx == prev_word_idx and label.startswith("B-"):
                    # Continuation piece of a word: B-X becomes I-X. Only the
                    # two-character prefix is swapped, so a "B-" occurring
                    # later inside the tag name is left untouched.
                    label = "I-" + label[2:]
                label_ids.append(self.label2id[label])
            prev_word_idx = word_idx

        encoding["labels"] = label_ids
        return {key: torch.tensor(val) for key, val in encoding.items()}
    



 


In [5]:
# Fast tokenizer: NERDataset relies on word_ids() for label alignment.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
dataset = NERDataset(data=dataset_json, tokenizer=tokenizer, label2id=label2id)
# Shuffled mini-batches of eight examples.
dataloader = DataLoader(dataset, shuffle=True, batch_size=8)

In [6]:
 
class StrongNERModel(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear per-token classifier.

    Args:
        config: BERT configuration used to build the encoder.
        num_labels: Number of output classes of the classifier.
    """

    def __init__(self, config, num_labels):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """Compute per-token logits and, when labels are given, the loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional class ids; positions equal to -100 are ignored
                by the loss.
            token_type_ids: Optional segment ids forwarded to the encoder.
                Accepted so that a batch produced by the tokenizer (which
                includes this field) can be passed with ``model(**batch)``.

        Returns:
            ``logits`` when ``labels`` is None, otherwise ``(loss, logits)``.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)

        if labels is None:
            return logits

        loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        # Flatten to (batch_size * seq_len, num_labels) vs (batch_size * seq_len,).
        # reshape also accepts non-contiguous label tensors, unlike view.
        loss = loss_fct(logits.reshape(-1, self.classifier.out_features),
                        labels.reshape(-1))
        return loss, logits


In [7]:
from torch.optim import AdamW

# Use the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The architecture comes from the bert-base-uncased config; the weights are
# then replaced by the checkpoint stored in nermodel1.pth.
model_config = BertConfig.from_pretrained("bert-base-uncased", num_labels=len(label2id))
model = StrongNERModel(model_config, num_labels=len(label2id))
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs, and avoids executing arbitrary pickled
# code from the checkpoint file.
model.load_state_dict(
    torch.load("nermodel1.pth", map_location=device, weights_only=True)
)

optimizer = AdamW(model.parameters(), lr=0.00001)
# Five times the number of batches in `dataloader`; nothing in this cell
# consumes it.
total_steps = len(dataloader) * 5

  model.load_state_dict(torch.load("nermodel1.pth", map_location=device))


In [8]:
 

def predict_entities(text, model, tokenizer, id2label, max_length=128):
    """Tag each whitespace-separated word of ``text`` with a predicted label.

    The model is moved to the module-level ``device`` and switched to eval
    mode. Each word receives the label predicted for its first sub-word
    token; later pieces of the same word are skipped. Words that fall beyond
    ``max_length`` tokens are truncated away and do not appear in the result.

    Args:
        text: Input string; split on whitespace into words.
        model: Callable taking ``input_ids`` and ``attention_mask`` and
            returning logits whose last dimension indexes the classes.
        tokenizer: Tokenizer whose result exposes ``word_ids()``.
        id2label: Mapping from integer class id to label string.
        max_length: Length the input is padded/truncated to.

    Returns:
        List of ``(word, label)`` tuples in input order.
    """
    model.to(device)
    model.eval()
    tokens = text.split()  # or use your own word splitting logic
    if not tokens:
        # Nothing to tag; skip the tokenizer and the forward pass.
        return []

    encoding = tokenizer(tokens,
                         is_split_into_words=True,
                         return_tensors="pt",
                         truncation=True,
                         padding='max_length',
                         max_length=max_length)

    with torch.no_grad():
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)
        # No labels are passed, so the model output is used as logits directly.
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(logits, dim=2)

    # One transfer to Python ints instead of an .item() call per word.
    preds = predictions[0].tolist()

    # Map back to word-level predictions.
    results = []
    prev_word_idx = None
    for idx, word_idx in enumerate(encoding.word_ids()):
        if word_idx is None or word_idx == prev_word_idx:
            continue
        results.append((tokens[word_idx], id2label[preds[idx]]))
        prev_word_idx = word_idx

    return results


In [17]:
# Sample sentence used to try out the loaded model.
text = "Dear Ege, your Microsoft Azure monthly payment is due for renewal on January 15, 2024. The renewal fee is $15."
entities = predict_entities(text, model, tokenizer, id2label)

# One line per word: the word left-aligned in a 20-character column, then its label.
for word, label in entities:
    print(f"{word:20} --> {label}")


Dear                 --> O
Ege,                 --> O
your                 --> O
Microsoft            --> B-SUBSCRIPTION
Azure                --> I-SUBSCRIPTION
monthly              --> I-SUBSCRIPTION
payment              --> I-SUBSCRIPTION
is                   --> O
due                  --> O
for                  --> O
renewal              --> O
on                   --> O
January              --> B-DATE
15,                  --> I-DATE
2024.                --> I-DATE
The                  --> O
renewal              --> O
fee                  --> O
is                   --> O
$15.                 --> B-PRICE
