In [None]:
!pip install transformers

In [None]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import transformers
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the NER corpus and forward-fill every missing cell with the value from
# the row above (presumably 'Sentence #' is only set on the first token of
# each sentence -- verify against the CSV).
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in recent pandas.
df = pd.read_csv("https://raw.githubusercontent.com/gauravsingh7897/NER-Bert/main/ner_dataset.csv", encoding='latin1').ffill()

In [None]:
# Print the column/dtype summary and show the row count. df.info() prints and
# returns None, so the displayed tuple is (None, <number of rows>).
df.info(), len(df)

In [None]:
# Learn the tag vocabulary first, keep the original tag strings for later
# decoding, then overwrite the column with the integer class ids.
enc_tag = LabelEncoder().fit(df['Tag'])
tag_unique = enc_tag.classes_
df['Tag'] = enc_tag.transform(df['Tag'])

In [None]:
# Show the original tag strings; the string at position i is the tag that was
# encoded as integer i.
tag_unique

In [None]:
def preprocess(data):
    """Collapse a token-per-row frame into per-sentence lists.

    Rows are grouped by the 'Sentence #' column. Returns a pair of object
    arrays ``(sentences, tags)`` where element ``k`` of each array is the list
    of 'Word' values / 'Tag' values of the k-th group, in group-key order.
    """
    by_sentence = data.groupby('Sentence #')
    word_lists = by_sentence['Word'].apply(list)
    tag_lists = by_sentence['Tag'].apply(list)
    return word_lists.values, tag_lists.values
# Build the per-sentence word lists and the matching per-sentence tag-id lists.
sentences, tags = preprocess(df)

In [None]:
# Hold out 15% of the sentences for evaluation; the fixed seed keeps the
# split identical across runs.
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences,
    tags,
    random_state=42,
    test_size=0.15,
)

In [None]:
class NERDataset(torch.utils.data.Dataset):
    """Word-level NER sentences converted to fixed-length BERT inputs.

    Every item is a dict of four ``torch.long`` tensors, each of length
    ``max_len``:

    * ``"ids"``            -- word-piece ids wrapped in [CLS] ... [SEP], then padded
    * ``"masks"``          -- 1 for real tokens (incl. [CLS]/[SEP]), 0 for padding
    * ``"token_type_ids"`` -- all zeros (single-segment input)
    * ``"tags"``           -- the word's tag id repeated for each of its word
      pieces; ``IGNORE_LABEL`` on [CLS], [SEP] and padding

    Args:
        sentences: indexable collection of sentences, each a list of words.
        tags: indexable collection of per-word tag-id lists, aligned with
            ``sentences``.
        max_len: length every returned tensor is truncated/padded to.
    """

    # Ids of [CLS], [SEP] and [PAD] in the bert-base-uncased vocabulary.
    CLS_ID = 101
    SEP_ID = 102
    PAD_ID = 0
    # Default ignore_index of torch.nn.CrossEntropyLoss: positions carrying this
    # label do not contribute to the loss. Using a real class id (0) here would
    # train the model to predict that class on special tokens and padding.
    IGNORE_LABEL = -100

    def __init__(self, sentences, tags, max_len=256):
        super(NERDataset, self).__init__()
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

    def __getitem__(self, idx):
        """Return the encoded sentence at position ``idx`` (see class docstring)."""
        words = self.sentences[idx]
        word_tags = self.tags[idx]

        ids = []
        tags = []
        for word, word_tag in zip(words, word_tags):
            pieces = self.tokenizer.encode(word, add_special_tokens=False, max_length=self.max_len, truncation=True)
            ids.extend(pieces)
            # A word split into several word pieces shares its tag with all of them.
            tags.extend([word_tag] * len(pieces))

        # Reserve two positions for [CLS]/[SEP] so that every item has exactly
        # max_len entries; otherwise long sentences yield longer tensors and
        # cannot be stacked into a batch.
        budget = self.max_len - 2
        ids = ids[:budget]
        tags = tags[:budget]

        ids = [self.CLS_ID] + ids + [self.SEP_ID]
        tags = [self.IGNORE_LABEL] + tags + [self.IGNORE_LABEL]
        masks = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids = ids + [self.PAD_ID] * pad_len
            masks = masks + [0] * pad_len
            token_type_ids = token_type_ids + [0] * pad_len
            tags = tags + [self.IGNORE_LABEL] * pad_len

        return {
            "ids" : torch.tensor(ids, dtype=torch.long),
            "masks" : torch.tensor(masks, dtype=torch.long),
            "token_type_ids" : torch.tensor(token_type_ids, dtype=torch.long),
            "tags" : torch.tensor(tags, dtype=torch.long)
        }

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

In [None]:
# Wrap each split in a dataset that tokenizes sentences lazily, item by item.
train_dataset = NERDataset(sentences=train_sentences, tags=train_tags)
test_dataset = NERDataset(sentences=test_sentences, tags=test_tags)

In [None]:
# Reshuffle the training sentences at the start of every epoch so the model
# does not see the exact same batches in the exact same order each time.
train_data_loader = torch.utils.data.DataLoader(train_dataset, num_workers=0, batch_size=16, shuffle=True)
# Evaluation does not depend on ordering, so the test loader stays sequential.
test_data_loader  = torch.utils.data.DataLoader(test_dataset, num_workers=0, batch_size=8)

In [None]:
# Pre-trained BERT encoder with a token-classification head that has one
# output per distinct tag.
model = transformers.BertForTokenClassification.from_pretrained("bert-base-uncased",num_labels=len(tag_unique))
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Assigning the result keeps the cell from echoing the full module repr.
model = model.to(device)

In [None]:
# Parameters whose name contains one of these substrings (biases and
# LayerNorm weights) are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# transformers.AdamW is deprecated and missing from recent transformers
# releases; torch.optim.AdamW is the supported replacement. Weight decay is
# set explicitly per group, so the optimizer-level default is never used.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

In [None]:
# The scheduler is stepped once per training batch, so the schedule length is
# (batches per epoch) x (epochs). Taking the batch count from the loader keeps
# it correct for any batch size and for a partial last batch.
# The factor 10 must match the number of epochs run by the training loop.
total_steps = len(train_data_loader) * 10

# Linear decay of the learning rate to zero over total_steps, with no warmup.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
def train(model, data_loader, optimizer, schedular, device):
    """Run one training pass over ``data_loader`` and report the mean batch loss.

    Args:
        model: module called as ``model(ids, attention_mask=..., token_type_ids=...,
            labels=...)`` whose first output element is the scalar loss.
        data_loader: sized iterable of batches; each batch is a dict with the
            tensors ``'ids'``, ``'masks'``, ``'token_type_ids'`` and ``'tags'``.
        optimizer: optimizer stepped once per batch.
        schedular: learning-rate scheduler stepped once per batch, right after
            the optimizer. (The parameter name keeps its original spelling so
            existing keyword callers continue to work.)
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        The mean loss over all batches (``nan`` when the loader is empty).
    """
    model.train()

    total_train_loss = 0.0
    for data in tqdm(data_loader, total=len(data_loader)):
        ids = data['ids'].to(device)
        masks = data['masks'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        tags = data['tags'].to(device)

        optimizer.zero_grad()

        outputs = model(ids, attention_mask=masks, token_type_ids=token_type_ids, labels=tags)

        loss = outputs[0]
        total_train_loss += loss.item()

        loss.backward()

        optimizer.step()
        # Step the scheduler that was passed in, not whatever global happens
        # to be named `scheduler`.
        schedular.step()

    num_batches = len(data_loader)
    avg_train_loss = total_train_loss / num_batches if num_batches else float('nan')
    print(f"Average Train Loss : {avg_train_loss}")
    return avg_train_loss

In [None]:
def test(model, data_loader, device):
    model.eval()

    total_test_loss = 0
    logits = []
    labels = []

    for data in tqdm(data_loader, total=len(data_loader)):
        ids            = data['ids']
        masks          = data['masks']
        token_type_ids = data['token_type_ids']
        tags           = data['tags']

        ids = ids.to(device)
        masks = masks.to(device)
        token_type_ids = token_type_ids.to(device)
        tags = tags.to(device)

        with torch.no_grad():
            outputs = model(ids, attention_mask=masks, token_type_ids=token_type_ids, labels=tags)

        loss   = outputs[0]

        logits.extend(np.argmax(outputs[1].cpu().detach().numpy(), axis=-1).flatten())
        labels.extend(tags.cpu().detach().numpy().flatten())
        total_test_loss += loss.item()

    avg_test_loss = total_test_loss / len(data_loader)
    print(f"Average Test Loss : {avg_test_loss}")

In [None]:
# Release cached GPU memory left over from earlier cells, then alternate one
# training pass and one evaluation pass for each of the 10 epochs.
torch.cuda.empty_cache()
for epoch in range(10):
    print(f"Epoch : {epoch + 1}")
    train(model, train_data_loader, optimizer, scheduler, device)
    test(model, test_data_loader, device)

In [None]:
# Run the fine-tuned model on the first held-out sentence.
test_data = test_dataset[0]
# Put the model in eval mode explicitly so dropout is off no matter which
# cell ran last.
model.eval()
# Inference only: skip building the autograd graph.
with torch.no_grad():
    test_out = model(
        test_data['ids'].unsqueeze(0).to(device),
        attention_mask=test_data['masks'].unsqueeze(0).to(device),
        token_type_ids=test_data['token_type_ids'].unsqueeze(0).to(device),
    )
# No labels were passed, so the first output element is taken to be the
# per-token logits; despite its name, `logits` holds the argmax class ids.
logits = np.argmax(test_out[0].cpu().detach().numpy(), axis=-1).flatten()
labels = test_data['tags'].cpu().detach().numpy()

In [None]:
# Show the predicted class ids (argmax over the model output) next to the
# gold tag ids of the sample sentence.
logits, labels

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
# NOTE(review): this import only resolves inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Persist only the model weights (state_dict), not the whole module object.
# NOTE(review): the path is relative, so it presumably points into the mounted
# Drive only when the working directory is /content -- confirm.
torch.save(model.state_dict(), "drive/MyDrive/saved_model.bin")