In [None]:
import os
import math
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from torch.optim import AdamW
from tqdm import tqdm
from pathlib import Path 


from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup


In [None]:
# Configuration: filesystem locations used throughout the notebook.
DATA_PATH = "./tig_dataset"                          # root folder scanned for training text
TOKENIZER_PATH = "./tokenizers/Tig_unigram_16000"    # saved tokenizer files
SAVE_PATH = "./saved_models/bert"                    # per-epoch checkpoints are written here

# Create the checkpoint directory (and any missing parents) up front.
Path(SAVE_PATH).mkdir(parents=True, exist_ok=True)

In [None]:
from config import BERT_CONFIG

In [None]:

# Hyperparameters
SEED = 42                    # fixed seed so weight init and batch shuffling repeat across runs
BATCH_SIZE = 8               # samples per forward/backward pass
BLOCK_SIZE = 128             # max tokens per input
NUM_EPOCHS = 3
LR = 3e-5
WEIGHT_DECAY = 0.01
GRAD_ACCUM_STEPS = 2         # batches accumulated per optimizer update
LOG_INTERVAL = 50            # batches between progress-bar loss refreshes
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed PyTorch's global RNG before any model is built or any loader is iterated;
# without this every "Restart & Run All" starts from different random weights
# and a different shuffle order.
torch.manual_seed(SEED)

In [None]:

# 3. Tokenizer & Model
# ================================================
# Alternative kept for reference (stock English vocabulary instead of the local one):
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load the tokenizer files stored under TOKENIZER_PATH.
# NOTE(review): the path name says "unigram" while BertTokenizer is the WordPiece
# tokenizer class — confirm the saved files are in a format BertTokenizer can load.
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)
# Mutates the imported config object in place; must happen before the model is built.
BERT_CONFIG.num_labels = 2  # modify for your classification task

# The model is constructed directly from the config (not via from_pretrained), so no
# checkpoint is read here.
# Assumes BERT_CONFIG.vocab_size covers every id the tokenizer can emit — TODO confirm.
model = BertForSequenceClassification(BERT_CONFIG).to(DEVICE)
print("Tokenizer and model loaded ✅")

In [None]:
# 4. Dataset
# ================================================
class TextDataset(Dataset):
    """Map-style dataset holding one pre-tokenized sample per ``.txt`` file.

    Every ``*.txt`` file found recursively under ``folder_path`` is read as
    UTF-8, stripped, and tokenized once in ``__init__``; files that are empty
    after stripping are skipped. Only the first ``block_size`` tokens of each
    file are kept (the tokenizer is called with ``truncation=True``).

    Args:
        folder_path: Directory searched recursively for ``*.txt`` files.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=block_size, padding="max_length")``; its result must
            support ``["input_ids"]`` and ``["attention_mask"]``.
        block_size: Length every sample is truncated/padded to.
        label_fn: Optional callable mapping a file's ``Path`` to its integer
            class label (e.g. derived from the parent folder name). When
            omitted, every sample gets the placeholder label ``0``, which is
            not a usable classification target.
    """

    def __init__(self, folder_path, tokenizer, block_size=128, label_fn=None):
        # Sorted so sample order does not depend on filesystem traversal order.
        self.file_paths = sorted(Path(folder_path).rglob("*.txt"))
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.samples = []
        # Pre-tokenize all files (optional for small datasets)
        for file_path in self.file_paths:
            text = file_path.read_text(encoding="utf-8").strip()
            if not text:
                continue
            tokens = self.tokenizer(text, truncation=True, max_length=block_size, padding="max_length")
            label = 0 if label_fn is None else int(label_fn(file_path))
            self.samples.append((tokens['input_ids'], tokens['attention_mask'], label))

    def __len__(self):
        """Return the number of non-empty files that were tokenized."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of ``torch.long`` tensors."""
        input_ids, attention_mask, label = self.samples[idx]
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Tokenize everything under DATA_PATH once, then serve it in shuffled mini-batches.
train_dataset = TextDataset(
    folder_path=DATA_PATH,
    tokenizer=tokenizer,
    block_size=BLOCK_SIZE,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# 5. Optimizer & Scheduler
# ================================================
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# The scheduler advances once per optimizer update, and an update happens only
# every GRAD_ACCUM_STEPS batches, so the schedule length must be counted in
# updates rather than batches. Counting batches would stretch warm-up and stop
# the linear decay long before it reaches zero.
# Rounded up so a trailing partial accumulation window is never under-counted;
# over-counting by at most one step per epoch only leaves the final learning
# rate marginally above zero.
updates_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS)
total_steps = updates_per_epoch * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),   # first 10% of updates ramp the LR up
    num_training_steps=total_steps,
)


In [None]:
# 6. Training Loop
# ================================================
def _apply_update(model, optimizer, scheduler, max_grad_norm=1.0):
    """Clip the accumulated gradients, take one optimizer + scheduler step, then clear them."""
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()


def train_model(model, dataloader, optimizer, scheduler, num_epochs, device,
                save_dir=None, grad_accum_steps=None, log_interval=None):
    """Train ``model`` with gradient accumulation and save a checkpoint per epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a scalar ``.loss``.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: LR scheduler; stepped once per optimizer update.
        num_epochs: Number of passes over ``dataloader``.
        device: Device the batch tensors are moved to.
        save_dir: Directory for ``epoch_<n>.pt`` state-dict checkpoints.
            Defaults to the notebook-level ``SAVE_PATH``.
        grad_accum_steps: Batches accumulated per optimizer update.
            Defaults to the notebook-level ``GRAD_ACCUM_STEPS``.
        log_interval: Batches between progress-bar refreshes.
            Defaults to the notebook-level ``LOG_INTERVAL``.

    Returns:
        None. Side effects: updates ``model``/``optimizer``/``scheduler`` in
        place and writes one checkpoint file per epoch.
    """
    save_dir = SAVE_PATH if save_dir is None else save_dir
    grad_accum_steps = GRAD_ACCUM_STEPS if grad_accum_steps is None else grad_accum_steps
    log_interval = LOG_INTERVAL if log_interval is None else log_interval
    os.makedirs(save_dir, exist_ok=True)

    model.train()
    # Start from clean gradients even if this cell is re-run after an interrupted run.
    optimizer.zero_grad()
    for epoch in range(num_epochs):
        running_loss = 0.0
        pending = 0  # batches back-propagated since the last optimizer update
        pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for step, batch in enumerate(pbar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Scale so the accumulated gradient is the mean over the window.
            loss = outputs.loss / grad_accum_steps
            loss.backward()
            pending += 1

            if pending == grad_accum_steps:
                _apply_update(model, optimizer, scheduler)
                pending = 0

            running_loss += outputs.loss.item()
            # (step + 1) so every reported average covers exactly log_interval batches.
            if (step + 1) % log_interval == 0:
                avg_loss = running_loss / log_interval
                pbar.set_postfix(loss=f"{avg_loss:.4f}", ppl=f"{math.exp(avg_loss):.2f}")
                running_loss = 0.0

        # Apply a trailing partial window instead of letting its gradients leak
        # into the next epoch (or be dropped after the last one). Its gradient is
        # still scaled by 1/grad_accum_steps, so this update is slightly smaller.
        if pending:
            _apply_update(model, optimizer, scheduler)

        # Save checkpoint after each epoch
        ckpt_path = os.path.join(save_dir, f"epoch_{epoch+1}.pt")
        torch.save(model.state_dict(), ckpt_path)
        print(f"✅ Saved checkpoint: {ckpt_path}")

# Kick off training with the objects built in the cells above.
train_model(
    model=model,
    dataloader=train_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=NUM_EPOCHS,
    device=DEVICE,
)

In [None]:
# 7. Evaluation
# ================================================
from sklearn.metrics import accuracy_score, f1_score, classification_report

def evaluate_model(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and report classification metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits`` with the class scores on the
            last dimension.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (weighted F1) so callers
        can log or compare runs; the same values, plus the per-class report,
        are also printed.

    Note:
        Switches ``model`` to eval mode and leaves it there.
    """
    model.eval()
    preds, labels_list = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred = torch.argmax(outputs.logits, dim=-1)

            preds.extend(pred.cpu().tolist())
            # Labels are only compared on the host, so skip the device round-trip.
            labels_list.extend(batch['labels'].cpu().tolist())

    acc = accuracy_score(labels_list, preds)
    f1 = f1_score(labels_list, preds, average="weighted")
    print("=== Evaluation Results ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(classification_report(labels_list, preds))
    return {"accuracy": float(acc), "f1": float(f1)}

# This reuses train_loader, i.e. it scores the model on its own training data;
# pass a separate validation DataLoader here for a held-out estimate.
evaluate_model(
    model=model,
    dataloader=train_loader,
    device=DEVICE,
)
