<a href="https://colab.research.google.com/github/ebarkhordar/Clustering-of-Bank-Customers/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch transformers datasets tqdm



In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
%cd /content/gdrive/MyDrive/TurkishTweetNER

/content/gdrive/MyDrive/TurkishTweetNER


In [11]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from transformers import BertTokenizer, BertForTokenClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

In [5]:
class NERDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.sentences = []
        self.labels = []
        self.label_map = {"B-LOC": 0, "I-LOC": 1, "B-ORG": 2, "I-ORG": 3, "B-PER": 4, "I-PER": 5, "O": 6}

        with open(file_path, "r", encoding="utf-8") as file:
            sentence, label = [], []
            for line in file:
                if line == "\n":
                    if sentence and label:
                        self.sentences.append(sentence)
                        self.labels.append(label)
                        sentence, label = [], []
                else:
                    word, tag = line.strip().split()
                    sentence.append(word)
                    label.append(self.label_map[tag])

        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = self.sentences[idx]
        labels = self.labels[idx]

        encoding = self.tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding="max_length",
                                  truncation=True, max_length=self.max_length)
        encoding["labels"] = torch.tensor(labels[:self.max_length], dtype=torch.long).unsqueeze(0)
        return encoding

tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-uncased')
train_dataset = NERDataset("datasets/wiki-ann/train.txt", tokenizer, 128)
val_dataset = NERDataset("datasets/wiki-ann/dev.txt", tokenizer, 128)

In [6]:
model = BertForTokenClassification.from_pretrained('dbmdz/bert-base-turkish-uncased', num_labels=7)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
class PaddingCollateFunction:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        max_len = max([len(item['input_ids'][0]) for item in batch])
        padded_input_ids = []
        padded_attention_masks = []
        padded_labels = []
        for item in batch:
            padded_input_ids.append(
                torch.cat([item['input_ids'][0], torch.tensor([self.tokenizer.pad_token_id] * (max_len - len(item['input_ids'][0])))])
            )
            padded_attention_masks.append(
                torch.cat([item['attention_mask'][0], torch.tensor([0] * (max_len - len(item['attention_mask'][0])))])
            )
            padded_labels.append(
                torch.cat([item['labels'][0], torch.tensor([-100] * (max_len - len(item['labels'][0])))])
            )
        return {
            'input_ids': torch.stack(padded_input_ids).to(torch.long),
            'attention_mask': torch.stack(padded_attention_masks).to(torch.long),
            'labels': torch.stack(padded_labels).to(torch.long)
        }


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)
collate_fn = PaddingCollateFunction(tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

label_map = {i: label for label, i in train_dataset.label_map.items()}

def evaluate(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()

            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.extend(label_ids)

    TP, FP, FN = defaultdict(int), defaultdict(int), defaultdict(int)
    for preds, trues in zip(predictions, true_labels):
        for p, t in zip(preds, trues):
            if t != -100:  # Add this line to check for -100 label
                if label_map[t] != "PAD":
                    if p == t:
                        TP[label_map[p]] += 1
                    else:
                        FP[label_map[p]] += 1
                        FN[label_map[t]] += 1

    precision = {tag: TP[tag] / (TP[tag] + FP[tag]) for tag in label_map.values() if tag != "PAD"}
    recall = {tag: TP[tag] / (TP[tag] + FN[tag]) for tag in label_map.values() if tag != "PAD"}
    f1 = {tag: 2 * precision[tag] * recall[tag] / (precision[tag] + recall[tag]) for tag in label_map.values() if tag != "PAD"}

    return sum(precision.values())/len(precision), sum(recall.values())/len(recall), sum(f1.values())/len(f1)


for epoch in range(3):  # Number of epochs
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):  # adding tqdm for progress bar
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze().to(device)
        attention_mask = batch['attention_mask'].squeeze().to(device)
        labels = batch['labels'].squeeze().to(device)

        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {total_loss/len(train_loader)}")

    precision, recall, f1 = evaluate(model, val_loader, device)
    print(f"Validation Precision (avg): {precision}")
    print(f"Validation Recall (avg): {recall}")
    print(f"Validation F1-Score (avg): {f1}")

Epoch 1 Training: 100%|██████████| 1250/1250 [02:33<00:00,  8.15it/s]


Epoch 1, Training Loss: 0.127932095111534


Evaluating: 100%|██████████| 625/625 [00:28<00:00, 21.61it/s]


Validation Precision (avg): 0.9005072675492147
Validation Recall (avg): 0.8859856502166981
Validation F1-Score (avg): 0.8930515689141314


Epoch 2 Training: 100%|██████████| 1250/1250 [02:33<00:00,  8.16it/s]


Epoch 2, Training Loss: 0.10225712111387401


Evaluating: 100%|██████████| 625/625 [00:28<00:00, 21.80it/s]


Validation Precision (avg): 0.9046865942764566
Validation Recall (avg): 0.8832257825674784
Validation F1-Score (avg): 0.8936494411176197


Epoch 3 Training: 100%|██████████| 1250/1250 [02:33<00:00,  8.14it/s]


Epoch 3, Training Loss: 0.08382467718310654


Evaluating: 100%|██████████| 625/625 [00:28<00:00, 21.61it/s]


Validation Precision (avg): 0.89160891991311
Validation Recall (avg): 0.8770127094561583
Validation F1-Score (avg): 0.8835110130937204
