In [None]:
from torch.utils.data import Dataset
import torch
from tqdm import tqdm

import json
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from transformers import BertConfig

from torch import nn
from transformers import BertModel, BertPreTrainedModel
from sklearn.metrics import classification_report

In [None]:
# BIO tag inventory for the NER task: a tag's position in the tuple is its
# integer class id ("O" = outside any entity, "B-"/"I-" = begin/inside).
label2id = {
    tag: index
    for index, tag in enumerate((
        "O",
        "B-SUBSCRIPTION",
        "I-SUBSCRIPTION",
        "B-DATE",
        "I-DATE",
        "B-PRICE",
        "I-PRICE",
    ))
}
# Reverse lookup: integer class id -> tag string.
id2label = dict(zip(label2id.values(), label2id.keys()))


In [None]:
 
# Load the annotated NER examples; the file is parsed as JSON despite its
# .txt extension. The encoding is given explicitly because open() otherwise
# falls back to the platform's locale encoding, which can mis-decode
# non-ASCII text on some systems.
with open("nerdataset00.txt", "r", encoding="utf-8") as f:
    dataset_json = json.load(f)

In [None]:
# Load the fast BERT tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
 

class NERDataset(Dataset):
    """Map-style dataset that turns word-level NER records into padded tensors.

    Each record in ``data`` must provide ``"tokens"`` (a list of words) and
    ``"labels"`` (one tag string per word). Words are tokenized into sub-word
    pieces and the word-level tags are projected onto those pieces.

    Args:
        data: Indexable collection of records (must support ``len`` and ``[]``).
        tokenizer: Callable tokenizer whose result exposes ``word_ids()``.
        label2id: Mapping from tag string to integer class id.
        max_length: Length every example is padded / truncated to.
            Defaults to 128.
    """

    # Label value for positions that carry no tag (special tokens, padding,
    # words without a label). -100 is the default ``ignore_index`` of
    # ``nn.CrossEntropyLoss``.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, label2id, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def _label_id(self, labels, word_idx, continuation):
        """Return the class id for one sub-word piece, or ``IGNORE_INDEX``.

        ``continuation`` is True when the piece belongs to the same word as
        the previous piece; a ``B-`` tag is then downgraded to the matching
        ``I-`` tag so only the first piece of a word opens an entity.
        """
        if word_idx is None or word_idx >= len(labels):
            return self.IGNORE_INDEX
        label = labels[word_idx]
        if continuation and label.startswith("B-"):
            # Swap only the prefix; str.replace would also rewrite any later
            # "B-" occurring inside the entity name.
            label = "I-" + label[2:]
        return self.label2id[label]

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return a dict of tensors.

        The dict holds every field produced by the tokenizer plus ``"labels"``,
        a per-piece list of class ids aligned with the tokenizer output.

        Raises:
            KeyError: If a tag (or its derived ``I-`` form) is not in
                ``label2id``.
        """
        item = self.data[idx]
        tokens = item["tokens"]
        labels = item["labels"]

        # Offsets are not requested: alignment relies on word_ids() only.
        encoding = self.tokenizer(tokens,
                                  is_split_into_words=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_length)

        label_ids = []
        prev_word_idx = None
        for word_idx in encoding.word_ids():
            label_ids.append(
                self._label_id(labels, word_idx, continuation=(word_idx == prev_word_idx))
            )
            prev_word_idx = word_idx
        encoding["labels"] = label_ids

        return {key: torch.tensor(val) for key, val in encoding.items()}
    



 


In [None]:
# Wrap the loaded records in the dataset and batch them for training.
# Reuses the `tokenizer` loaded in the earlier cell rather than loading the
# same checkpoint a second time.
dataset = NERDataset(dataset_json, tokenizer, label2id)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
 
class StrongNERModel(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear per-token classifier.

    Args:
        config: Model configuration handed to ``BertModel``;
            ``config.hidden_size`` sizes the classifier input.
        num_labels: Number of output classes per token. Optional: when
            omitted it falls back to ``config.num_labels``, so the model can
            be built from a config alone.
    """

    def __init__(self, config, num_labels=None):
        super().__init__(config)
        if num_labels is None:
            num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute per-token logits and, when labels are given, the loss.

        Args:
            input_ids: Token ids passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.
            labels: Optional target class ids; positions equal to -100 are
                excluded from the loss.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise
            ``logits`` alone.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)

        if labels is None:
            return logits

        loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        # Flatten to (batch_size * seq_len, num_labels) vs. (batch_size * seq_len,)
        loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
        return loss, logits


In [None]:
#from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

# Training hyper-parameters, named once so the loop and the step count agree.
NUM_EPOCHS = 45
# NOTE(review): 1e-7 is far below the 2e-5..5e-5 range usually used to
# fine-tune BERT; the value is kept unchanged here -- confirm it is intentional.
LEARNING_RATE = 0.0000001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_config = BertConfig.from_pretrained("bert-base-uncased", num_labels=len(label2id))
model = StrongNERModel.from_pretrained("bert-base-uncased", config=model_config, num_labels=len(label2id))
model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# Optimizer steps over the whole run. Derived from NUM_EPOCHS so that a
# schedule built from it (see the disabled scheduler below) spans all epochs.
total_steps = len(dataloader) * NUM_EPOCHS
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1 * total_steps, num_training_steps=total_steps)

# Training Loop
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients from the previous step before the new backward pass.
        model.zero_grad()
        loss, logits = model(input_ids, attention_mask, labels=labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} average loss: {total_loss / len(dataloader):.4f}")




In [None]:
# Save only the model's state_dict (not the module object) to 'nermodel1.pth'.
torch.save(model.state_dict(), 'nermodel1.pth')