In [None]:
# Mount into drive

from google.colab import drive

drive.mount("/content/drive")

%cd '/content/drive/MyDrive/final_project/'

!pip install -r requirements.txt

Mounted at /content/drive
/content/drive/MyDrive/final_project
Collecting torch==2.1.0 (from -r requirements.txt (line 1))
  Downloading torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision==0.16.0 (from -r requirements.txt (line 2))
  Downloading torchvision-0.16.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.1.0 (from -r requirements.txt (line 3))
  Downloading torchaudio-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (5.7 kB)
Collecting torchtext==0.16.0 (from -r requirements.txt (line 4))
  Downloading torchtext-0.16.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.5 kB)
Collecting spacy==3.7.2 (from -r requirements.txt (line 6))
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting jupyter (from -r requirements.txt (line 10))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1.0->-r 

In [None]:
import os
from pathlib import Path
import re
import random
import transformers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.optim import Adam
from torchtext.vocab import vocab
import itertools
import math
import numpy as np
import pandas as pd
from collections import Counter
import typing
import time
from datetime import datetime
from sklearn.model_selection import train_test_split

# **Preprocess data (split data)**

In [None]:
# Step 1: Read data from train and test directories and save as text files
def process_and_save_txt_files(path_train, path_test, output_dir='./data01', chunk_size=10000):
    """
    Reads text data from train and test directories and saves it as text files for tokenizer training.

    Every ``.txt`` file found recursively under ``path_train`` and then ``path_test`` is read
    as UTF-8 and stripped of leading/trailing whitespace. The documents are written to
    ``{output_dir}/text_{n}.txt`` in chunks of at most ``chunk_size`` documents, the documents
    of one chunk being joined by newlines.

    Args:
        path_train (str): Path to the 'train' directory containing .txt files.
        path_test (str): Path to the 'test' directory containing .txt files.
        output_dir (str): Directory to save the text chunks (created if missing).
        chunk_size (int): Maximum number of samples per text file chunk.

    Returns:
        list: List of all text data collected from the .txt files (one string per file),
            train files first, each directory in sorted path order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    os.makedirs(output_dir, exist_ok=True)

    def _write_chunk(documents, index):
        # One chunk file per call; documents are separated by a single newline.
        with open(f'{output_dir}/text_{index}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(documents))

    all_texts = []   # every document read; this is what the caller gets back
    chunk = []       # documents waiting to be flushed to the next chunk file
    file_count = 0

    # Gather all text files from train and test directories
    for path in [path_train, path_test]:
        # rglob() yields files in filesystem order; sort so the returned list (and any
        # seeded split performed on it) is reproducible across machines and runs.
        for txt_file in sorted(Path(path).rglob('*.txt')):
            with open(txt_file, "r", encoding="utf-8") as f:
                content = f.read().strip()  # Read and strip extra whitespace

            all_texts.append(content)
            chunk.append(content)

            # Flush a full chunk. Only the chunk buffer is reset: the returned list
            # must keep every document, not just the ones after the last flush.
            if len(chunk) == chunk_size:
                _write_chunk(chunk, file_count)
                chunk = []
                file_count += 1

    # Save remaining data
    if chunk:
        _write_chunk(chunk, file_count)

    print(f"Text data saved to {output_dir}")

    return all_texts

# Example paths for train and test directories
path_train = 'data/final/train/ects'
path_test = 'data/final/test/ects'

# Working directories: plain-text corpus chunks used to train the tokenizer, and the
# folder the trained WordPiece vocabulary is written to.
corpus_dir = './data01'
tokenizer_dir = './bert-it-1'

# Save the data from train and test directories as text files
text_data = process_and_save_txt_files(path_train, path_test, output_dir=corpus_dir)

# Step 2: Train WordPiece tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train(
    files=[str(x) for x in Path(corpus_dir).glob('**/*.txt')],
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    # '[PAD]' is listed first so that it receives id 0; the dataset, the embedding
    # (padding_idx=0), the attention mask (x > 0) and the loss (ignore_index=0) all
    # rely on that.
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
)

# Save the tokenizer model (the vocabulary ends up in '<tokenizer_dir>/bert-it-vocab.txt')
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer.save_model(tokenizer_dir, 'bert-it')

# Load the tokenizer for further use.
# The tokenizer is built directly from the vocabulary file: calling
# BertTokenizer.from_pretrained() with the path of a single file is deprecated.
# The normalisation options are spelled out so they match the settings the vocabulary
# was trained with above (with the default strip_accents=None, a lower-casing
# BertTokenizer would also strip accents).
tokenizer = BertTokenizer(
    vocab_file=os.path.join(tokenizer_dir, 'bert-it-vocab.txt'),
    do_lower_case=True,
    tokenize_chinese_chars=False,
    strip_accents=False,
)

# Step 3: Prepare train and test data for model training
# Split the text data into train and test sets
train_texts, test_texts = train_test_split(text_data, test_size=0.2, random_state=42)


# Save the train and test data as CSV files
train_df = pd.DataFrame({'text': train_texts})
test_df = pd.DataFrame({'text': test_texts})

os.makedirs('data', exist_ok=True)  # to_csv does not create missing directories
train_df.to_csv('data/train_data.csv', index=False, encoding='utf-8')
test_df.to_csv('data/test_data.csv', index=False, encoding='utf-8')

print("Training and test data saved as CSV files.")

Text data saved to ./data01




Training and test data saved as CSV files.


# **Preprocess data (for MLM and NSP)**

In [None]:
class BERTDataset(Dataset):
    """
    Dataset producing BERT pre-training samples (masked LM + next-sentence prediction).

    Each sample is built from a sentence pair ``(t1, t2)``: with probability 0.5 the real
    ``t2`` is kept (``is_next = 1``), otherwise it is replaced by the second sentence of a
    random other pair (``is_next = 0``). Both sentences are randomly masked, wrapped as
    ``[CLS] t1 [SEP] t2 [SEP]``, then truncated / padded to exactly ``seq_len`` ids.

    The code assumes that the '[PAD]' token has id 0: label 0 means "nothing to predict at
    this position" and segment id 0 marks padding.
    """

    def __init__(self, data_pair, tokenizer, seq_len=64):
        """
        :param data_pair: Sequence of ``(sentence_a, sentence_b)`` pairs. Items that are plain
            strings are treated as whole documents and expanded into pairs of consecutive
            sentences (indexing a string with [0]/[1] would otherwise yield its first two
            characters instead of two sentences).
        :param tokenizer: Object exposing a ``vocab`` mapping with the BERT special tokens and
            an ``encode(text, add_special_tokens=False)`` method returning a list of ids.
        :param seq_len: Fixed length of every returned tensor.
        """
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.lines = self._as_sentence_pairs(data_pair)

    @staticmethod
    def _split_sentences(document):
        """Split a document into non-empty sentences (simple punctuation/newline heuristic)."""
        parts = re.split(r'(?<=[.!?])\s+|\n+', document)
        return [part.strip() for part in parts if part.strip()]

    @classmethod
    def _as_sentence_pairs(cls, data_pair):
        """Return ``data_pair`` with every raw-string item replaced by its consecutive sentence pairs."""
        if not isinstance(data_pair, (list, tuple)) or not any(isinstance(x, str) for x in data_pair):
            # Already a collection of pairs (or an opaque indexable): keep it untouched.
            return data_pair

        pairs = []
        for item in data_pair:
            if isinstance(item, str):
                sentences = cls._split_sentences(item)
                # (s0, s1), (s1, s2), ... ; documents with fewer than two sentences yield nothing.
                pairs.extend(zip(sentences, sentences[1:]))
            else:
                pairs.append(item)
        return pairs

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, item):
        # Get random sentence pair and next sentence label
        t1, t2, is_next_label = self._get_sentences(item)

        # Randomly mask some words in the sentences
        t1_random, t1_label = self._random_word(t1)
        t2_random, t2_label = self._random_word(t2)

        vocab = self.tokenizer.vocab
        cls_id, sep_id, pad_id = vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']

        # Add [CLS] and [SEP] tokens; special tokens are never prediction targets
        t1 = [cls_id] + t1_random + [sep_id]
        t2 = t2_random + [sep_id]
        t1_label = [pad_id] + t1_label + [pad_id]
        t2_label = t2_label + [pad_id]

        # Combine sentences and cut all three sequences to seq_len. Without the cut,
        # over-long pairs would produce tensors of different lengths that cannot be
        # batched and that do not line up with the (already truncated) segment labels.
        segment_label = ([1] * len(t1) + [2] * len(t2))[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]

        # Pad up to seq_len
        n_pad = self.seq_len - len(bert_input)
        bert_input.extend([pad_id] * n_pad)
        bert_label.extend([pad_id] * n_pad)
        segment_label.extend([0] * n_pad)

        # Return as tensor
        return {
            "bert_input": torch.tensor(bert_input),
            "bert_label": torch.tensor(bert_label),
            "segment_label": torch.tensor(segment_label),
            "is_next": torch.tensor(is_next_label)
        }

    def _random_word(self, sentence):
        """
        Apply BERT-style masking to a whitespace-tokenised sentence.

        Every word is selected with probability 0.15; a selected word has all of its
        sub-token ids replaced by '[MASK]' (80%), by random vocabulary ids (10%) or kept
        unchanged (10%).

        :return: ``(ids, labels)`` of equal length; ``labels`` holds the original id at
            selected positions and 0 everywhere else.
        """
        mask_id = self.tokenizer.vocab['[MASK]']
        vocab_size = len(self.tokenizer.vocab)
        output, output_label = [], []

        for token in sentence.split():
            prob = random.random()
            token_id = self.tokenizer.encode(token, add_special_tokens=False)

            if prob < 0.15:
                prob /= 0.15  # rescale to [0, 1) to pick the replacement strategy

                # 80% chance change token to mask token
                if prob < 0.8:
                    output.extend([mask_id] * len(token_id))

                # 10% chance change token to random token
                elif prob < 0.9:
                    output.extend(random.randrange(vocab_size) for _ in token_id)

                # 10% chance change token to current token
                else:
                    output.extend(token_id)

                output_label.extend(token_id)  # store the original token id for label

            else:
                output.extend(token_id)
                output_label.extend([0] * len(token_id))  # No replacement, label as 0

        assert len(output) == len(output_label)
        return output, output_label

    def _get_sentences(self, index):
        """Return ``(t1, t2, is_next)`` for the pair at ``index`` (50% positive / 50% negative)."""
        t1, t2 = self.lines[index][0], self.lines[index][1]
        # 50% chance for positive or negative pair
        if random.random() > 0.5:
            return t1, t2, 1
        else:
            # Exclude the pair itself so the true next sentence is not labelled "not next".
            return t1, self._get_random_line(exclude=index), 0

    def _get_random_line(self, exclude=None):
        """
        Return the second sentence of a randomly chosen pair.

        :param exclude: Optional index that must not be chosen (ignored when the dataset
            holds fewer than two pairs).
        """
        n_lines = len(self.lines)
        if exclude is None or n_lines < 2:
            return self.lines[random.randrange(n_lines)][1]

        # Draw from the n-1 remaining indices and skip over the excluded one.
        idx = random.randrange(n_lines - 1)
        if idx >= exclude:
            idx += 1
        return self.lines[idx][1]


# **Implement model (Embedding)**

In [None]:
class PositionalEmbedding(torch.nn.Module):
    """
    Fixed (non-trainable) sinusoidal positional encoding.

    For position ``pos`` and dimension pair ``(2k, 2k + 1)``::

        pe[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
        pe[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

    i.e. each sine/cosine pair shares one frequency. The table is stored as a buffer of
    shape ``(1, max_len, d_model)`` so it moves with the module and is saved in its
    state dict.
    """

    def __init__(self, d_model, max_len=128):
        """
        :param d_model: Size of the embedding vector
        :param max_len: Number of positions to pre-compute
        """
        super().__init__()

        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)   # (max_len, 1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float)         # the "2k" values
        # 1 / 10000 ** (2k / d_model), computed in log space
        inv_freq = torch.exp(even_dims * (-math.log(10000.0) / d_model))
        angles = position * inv_freq                                       # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model).float()
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add batch dimension; a buffer is not a parameter, so it receives no gradient
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """
        :param x: Tensor whose second dimension is the sequence length
        :return: Positional encodings of shape ``(1, x.size(1), d_model)`` on ``x``'s device
        :raises ValueError: If the sequence is longer than the pre-computed table
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum positional length {self.pe.size(1)}"
            )
        # Ensure positional embeddings are on the same device as input
        return self.pe[:, :seq_len, :].to(x.device)

class BERTEmbedding(nn.Module):
    """
    Input embedding of the BERT model.

    The output is the element-wise sum of three embeddings, followed by dropout:
    1. Token embedding: a learned lookup table over the vocabulary (index 0 is padding).
    2. Positional embedding: position information produced by ``PositionalEmbedding``.
    3. Segment embedding: a learned lookup table with three rows
       (0 = padding, 1 = sentence A, 2 = sentence B).
    """

    def __init__(self, vocab_size, embed_size, seq_len=64, dropout=0.1):
        """
        :param vocab_size: Number of rows of the token embedding table
        :param embed_size: Dimension of every embedding vector
        :param seq_len: Maximum sequence length handed to the positional embedding
        :param dropout: Dropout probability applied to the summed embedding
        """
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.segment_embedding = nn.Embedding(3, embed_size, padding_idx=0)
        self.position_embedding = PositionalEmbedding(d_model=embed_size, max_len=seq_len)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, sequence, segment_label):
        # Inputs are moved to wherever this module's weights live.
        target_device = next(self.parameters()).device
        token_ids = sequence.to(target_device)
        segment_ids = segment_label.to(target_device)

        summed = (
            self.token_embedding(token_ids)
            + self.position_embedding(token_ids)
            + self.segment_embedding(segment_ids)
        )
        return self.dropout(summed)

# **Implement model (Attention mechanism)**

In [None]:
class MultiHeadedAttention(nn.Module):
    """
    Multi-head scaled dot-product attention.

    ``d_model`` is split evenly across ``heads`` heads of size ``d_k``. Positions where
    ``mask == 0`` receive a score of -1e9 before the softmax.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()

        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(dropout)

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.output_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """(batch, length, d_model) -> (batch, heads, length, d_k)."""
        batch_size = projected.shape[0]
        return projected.view(batch_size, -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask):
        # Linear projections, then one slice of size d_k per head
        q = self._split_heads(self.query(query))
        k = self._split_heads(self.key(key))
        v = self._split_heads(self.value(value))

        # Scaled dot-product scores, with masked positions pushed to -1e9
        scale = math.sqrt(q.size(-1))
        scores = torch.matmul(q, k.transpose(-2, -1)) / scale
        scores = scores.masked_fill(mask == 0, -1e9)

        # Normalise over the key axis and apply dropout to the attention weights
        attention = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values, heads merged back into d_model
        per_head = torch.matmul(attention, v)
        batch_size = per_head.shape[0]
        merged = per_head.transpose(1, 2).contiguous().view(batch_size, -1, self.heads * self.d_k)

        return self.output_linear(merged)


class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear."""

    def __init__(self, d_model, middle_dim=2048, dropout=0.1):
        """
        :param d_model: Input and output feature size
        :param middle_dim: Size of the hidden layer
        :param dropout: Dropout probability applied after the activation
        """
        super().__init__()
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        hidden = self.fc1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)


class EncoderLayer(nn.Module):
    """
    One transformer encoder block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalisation.
    """

    def __init__(self, d_model=768, heads=12, feed_forward_hidden=768 * 4, dropout=0.1):
        """
        :param d_model: Hidden size of the block
        :param heads: Number of attention heads
        :param feed_forward_hidden: Hidden size of the feed-forward network
        :param dropout: Dropout rate, used for the residual branches and forwarded to the
            attention and feed-forward sub-modules
        """
        super().__init__()

        # NOTE(review): a single LayerNorm instance (one set of weights) normalises both
        # sub-layers. Reference transformer implementations use one LayerNorm per sub-layer;
        # kept as is because splitting it would change the state-dict keys of existing
        # checkpoints.
        self.layernorm = nn.LayerNorm(d_model)
        # Forward the configured dropout rate: otherwise the sub-modules silently fall back
        # to their own default of 0.1 whatever value is passed to this layer.
        self.self_multihead = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = FeedForward(d_model, middle_dim=feed_forward_hidden, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        """
        :param embeddings: Input hidden states of the block
        :param mask: Attention mask handed to the self-attention (0 = position is ignored)
        :return: Output hidden states, same shape as ``embeddings``
        """
        # Self-attention
        attended = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        attended = self.layernorm(attended + embeddings)

        # Feed-forward network
        ff_out = self.dropout(self.feed_forward(attended))
        return self.layernorm(ff_out + attended)

# **Implement model (MLM and NSP)**

In [None]:
class BERT(nn.Module):
    """
    BERT model: Bidirectional Encoder Representations from Transformers.

    An embedding layer followed by a stack of ``n_layers`` encoder blocks. Token id 0 is
    treated as padding and is masked out in the attention.
    """

    def __init__(self, vocab_size, d_model=768, n_layers=12, heads=12, dropout=0.1, seq_len=64):
        """
        Initializes the BERT model.

        :param vocab_size: Total vocabulary size
        :param d_model: Hidden size of the BERT model
        :param n_layers: Number of transformer layers
        :param heads: Number of attention heads
        :param dropout: Dropout rate (used by the embedding and by every encoder block)
        :param seq_len: Maximum sequence length supported by the positional embedding
        """
        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers

        # Feed-forward network hidden size is 4 times the model's hidden size
        self.feed_forward_hidden = d_model * 4

        # Embedding layer: sum of token, segment, and positional embeddings.
        # dropout and seq_len are passed through so that the embedding follows the model
        # configuration instead of its own defaults.
        self.embedding = BERTEmbedding(
            vocab_size=vocab_size, embed_size=d_model, seq_len=seq_len, dropout=dropout
        )

        # Multi-layer transformer blocks
        self.encoder_blocks = nn.ModuleList(
            [EncoderLayer(d_model, heads, self.feed_forward_hidden, dropout) for _ in range(n_layers)]
        )

    def forward(self, x, segment_info):
        """
        :param x: Token ids, indexed as (batch, position); id 0 marks padding
        :param segment_info: Segment ids aligned with ``x``
        :return: Hidden states produced by the last encoder block
        """
        # Attention mask for padded tokens: (batch, 1, length, length), True where the
        # attended-to (key) position holds a real token
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # Embed the input sequence
        x = self.embedding(x, segment_info)

        # Pass through multiple transformer layers
        for encoder in self.encoder_blocks:
            x = encoder(x, mask)
        return x


class NextSentencePrediction(nn.Module):
    """
    Binary classifier for the next-sentence task: given the encoder output, it returns
    log-probabilities over two classes computed from the first position of each sequence.
    """

    def __init__(self, hidden_size):
        """
        Builds the classification head.

        :param hidden_size: Feature size of the encoder output
        """
        super().__init__()
        self.linear = nn.Linear(hidden_size, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # Only the first token (the [CLS] slot) of every sequence feeds the classifier.
        cls_state = x[:, 0]
        logits = self.linear(cls_state)
        return self.softmax(logits)


class MaskedLanguageModel(nn.Module):
    """
    Token-level classifier for the masked-language-model task: for every position of the
    encoder output it returns log-probabilities over the whole vocabulary.
    """

    def __init__(self, hidden_size, vocab_size):
        """
        Builds the prediction head.

        :param hidden_size: Feature size of the encoder output
        :param vocab_size: Number of classes, i.e. the vocabulary size
        """
        super().__init__()
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        vocab_logits = self.linear(x)
        return self.softmax(vocab_logits)


class BERTLM(nn.Module):
    """
    BERT with its two pre-training heads: Next Sentence Prediction (NSP) and
    Masked Language Modeling (MLM), both fed by the same encoder output.
    """

    def __init__(self, bert: BERT, vocab_size):
        """
        Attaches the NSP and MLM heads to an encoder.

        :param bert: The BERT encoder whose output feeds both heads
        :param vocab_size: Number of classes predicted by the masked language model head
        """
        super().__init__()
        self.bert = bert
        hidden_size = self.bert.d_model
        self.next_sentence = NextSentencePrediction(hidden_size)
        self.mask_lm = MaskedLanguageModel(hidden_size, vocab_size)

    def forward(self, x, segment_label):
        # Encode once, then run both heads on the shared hidden states.
        hidden_states = self.bert(x, segment_label)

        nsp_log_probs = self.next_sentence(hidden_states)
        mlm_log_probs = self.mask_lm(hidden_states)
        return nsp_log_probs, mlm_log_probs

# **Train BERT model (warm-up)**

# **Train BERT model (use checkpoint to store best model/ weights)**

In [None]:
class ScheduledOptim():
    """
    Thin optimizer wrapper that sets the learning rate before every step.

    The learning rate at step ``n`` (counted from 1) is
    ``d_model ** -0.5 * min(n ** -0.5, n * n_warmup_steps ** -1.5)``:
    it grows linearly for ``n_warmup_steps`` steps and then decays with the inverse
    square root of the step number.
    """

    def __init__(self, optimizer, d_model, n_warmup_steps):
        """
        Sets up the schedule.

        :param optimizer: Optimizer whose parameter groups get their 'lr' overwritten
        :param d_model: Model dimension; the base learning rate is d_model ** -0.5
        :param n_warmup_steps: Length of the linear warm-up phase, in steps
        """
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        """Advance the schedule by one step, then let the wrapped optimizer step."""
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        """Reset the gradients held by the wrapped optimizer."""
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        """Return the schedule factor for the current step (smaller of decay and warm-up)."""
        step = self.n_current_steps
        decay_term = np.power(step, -0.5)
        warmup_term = np.power(self.n_warmup_steps, -1.5) * step
        return np.min([decay_term, warmup_term])

    def _update_learning_rate(self):
        """Increment the step counter and write the new learning rate to every parameter group."""
        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr

# **Train BERT model (save a checkpoint of the model/optimizer weights after every epoch)**

In [None]:
class BERTTrainer:
    """
    Training / evaluation driver for a BERT language model with NSP and MLM heads.

    The total loss is the sum of the next-sentence-prediction loss and the
    masked-language-model loss. Accuracy is reported for next-sentence prediction only.
    """

    def __init__(self, model, train_dataloader, test_dataloader=None, lr=1e-4, weight_decay=0.01,
                 betas=(0.9, 0.999), warmup_steps=10000, log_freq=10, checkpoint_dir=None, log_dir=None, device='cuda'):
        """
        :param model: Module returning ``(nsp_log_probs, mlm_log_probs)`` when called with
            ``(bert_input, segment_label)`` and exposing ``model.bert.d_model``
        :param train_dataloader: Loader yielding dicts with the keys 'bert_input',
            'bert_label', 'segment_label' and 'is_next'
        :param test_dataloader: Optional loader of the same kind used by ``test()``
        :param lr: Initial Adam learning rate. The warm-up schedule overwrites the optimizer's
            learning rate before every optimizer step, so this value never takes effect.
        :param weight_decay: Adam weight decay
        :param betas: Adam betas
        :param warmup_steps: Number of warm-up steps of the learning-rate schedule
        :param log_freq: Log every ``log_freq`` iterations (only when ``log_dir`` is set)
        :param checkpoint_dir: If set, a checkpoint is written there after every training epoch
        :param log_dir: If set, scalars are written there with a TensorBoard SummaryWriter
        :param device: Device the model and the batches are moved to
        """
        self.device = device
        self.model = model.to(device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader
        self.checkpoint_dir = Path(checkpoint_dir) if checkpoint_dir else None
        self.log_dir = Path(log_dir) if log_dir else None
        self.writer = SummaryWriter(log_dir=str(self.log_dir)) if self.log_dir else None

        # Set up Adam optimizer with hyperparameters
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.model.bert.d_model, warmup_steps)

        # Masked-LM loss: label 0 marks positions without a prediction target, so it is ignored.
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        # Next-sentence loss: 0 ("not next") is a real class here and must NOT be ignored,
        # otherwise the classifier is only ever trained on positive pairs.
        self.nsp_criterion = torch.nn.NLLLoss()
        self.log_freq = log_freq

        print(f"Total Parameters: {sum(p.nelement() for p in self.model.parameters())}")

    def train(self, epoch):
        """Run one training epoch over the training loader."""
        self._iteration(epoch, self.train_data, train=True)

    def test(self, epoch):
        """Run one evaluation pass over the test loader (no weight updates)."""
        self._iteration(epoch, self.test_data, train=False)

    def _iteration(self, epoch, data_loader, train=True):
        """
        Run one pass over ``data_loader``.

        :param epoch: Epoch index, used for progress output, logging steps and checkpoint names
        :param data_loader: Loader to iterate; ``None`` skips the pass
        :param train: True to update the weights, False to only evaluate
        """
        mode = "train" if train else "test"
        if data_loader is None:
            print(f"EP{epoch}, {mode}: no data loader provided, skipping.")
            return

        avg_loss = 0.0
        total_correct = 0
        total_elements = 0
        n_batches = 0
        tag = "Train" if train else "Test"

        # Dropout must be active while training and disabled while evaluating.
        self.model.train(train)

        # Set up progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader), desc=f"EP_{mode}:{epoch}", total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        # No autograd graph is needed (or wanted) during evaluation.
        with torch.set_grad_enabled(train):
            for i, data in data_iter:
                # Move batch data to the specified device (GPU or CPU)
                data = {key: value.to(self.device) for key, value in data.items()}

                # Forward pass for next sentence prediction and masked language model
                next_sent_output, mask_lm_output = self.model(data["bert_input"], data["segment_label"])

                # Calculate losses
                next_loss = self.nsp_criterion(next_sent_output, data["is_next"])
                if (data["bert_label"] != 0).any():
                    mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
                else:
                    # No masked token in this batch: the mean over zero targets would be NaN.
                    mask_loss = next_loss.new_zeros(())

                # Total loss: sum of next sentence prediction and masked language model losses
                loss = next_loss + mask_loss

                # Backward pass and optimization (only in training)
                if train:
                    self.optim_schedule.zero_grad()
                    loss.backward()
                    self.optim_schedule.step_and_update_lr()

                # Accuracy for next sentence prediction
                correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
                avg_loss += loss.item()
                total_correct += correct
                total_elements += data["is_next"].nelement()
                n_batches += 1

                # Log progress
                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "avg_loss": avg_loss / (i + 1),
                    "avg_acc": total_correct / total_elements * 100,
                    "loss": loss.item()
                }

                if self.writer and i % self.log_freq == 0:
                    global_step = epoch * len(data_loader) + i
                    data_iter.write(str(post_fix))
                    # Separate tags so evaluation curves are not mixed into the training ones.
                    self.writer.add_scalar(f"Loss/{tag}", loss.item(), global_step)
                    self.writer.add_scalar(f"Accuracy/{tag}", total_correct / total_elements * 100, global_step)

        if n_batches == 0:
            print(f"EP{epoch}, {mode}: data loader is empty, nothing to report.")
            return

        # Print summary for the epoch
        print(f"EP{epoch}, {mode}: avg_loss={avg_loss / n_batches}, total_acc={total_correct * 100.0 / total_elements}")

        # Save a checkpoint after every training epoch; an evaluation pass does not change
        # the weights, so saving again after it would only duplicate the file.
        if train and self.checkpoint_dir:
            self.save_checkpoint(epoch)

    def save_checkpoint(self, epoch):
        """
        Write model, optimizer and scheduler state to ``checkpoint_dir``.

        Does nothing when no checkpoint directory was configured.

        :param epoch: Epoch index stored in the checkpoint and used in the file name
        """
        if not self.checkpoint_dir:
            return

        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        # time.time() is the POSIX timestamp; a naive datetime.utcnow().timestamp() is
        # interpreted as local time and is therefore off on non-UTC machines.
        checkpoint_filename = f"bert_epoch{epoch}_{time.time():.0f}.pt"
        checkpoint_path = self.checkpoint_dir / checkpoint_filename

        torch.save({
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optim.state_dict(),
            # Needed to resume the learning-rate schedule where it stopped.
            'scheduler_step': self.optim_schedule.n_current_steps,
        }, checkpoint_path)

        print(f"Checkpoint saved: {checkpoint_path}")

    def load_checkpoint(self, checkpoint_path):
        """
        Restore model and optimizer (and, when present, scheduler) state from a checkpoint.

        :param checkpoint_path: File written by ``save_checkpoint`` or with the same keys
        :return: The epoch stored in the checkpoint, or None if it has none
        """
        print(f"Restoring model from {checkpoint_path}")
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optim.load_state_dict(checkpoint['optimizer_state_dict'])
        # Older checkpoints carry no scheduler step; leave the schedule untouched then.
        if 'scheduler_step' in checkpoint:
            self.optim_schedule.n_current_steps = checkpoint['scheduler_step']
        print("Model restored from checkpoint.")
        return checkpoint.get('epoch')

# **The training records for BERT-Large model**

In [None]:
# Configuration
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    # For interactive environments like Jupyter
    BASE_DIR = Path(os.getcwd())

CHECKPOINT_DIR = BASE_DIR.joinpath('data/bert_checkpoints')
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# POSIX timestamp of this run; time.time() avoids the deprecated datetime.utcnow(),
# whose naive result is misread as local time by .timestamp().
timestamp = time.time()
LOG_DIR = BASE_DIR.joinpath(f'data/logs/bert_experiment_{timestamp}')
LOG_DIR.mkdir(parents=True, exist_ok=True)

SEED = 42
MAX_LEN = 64
EPOCHS = 25
BATCH_SIZE = 32
# NOTE: BERTTrainer wraps the optimizer in ScheduledOptim, which overwrites the learning
# rate before every step (d_model ** -0.5 scaled by the warm-up factor), so this value is
# only the optimizer's initial setting and does not drive training.
LEARNING_RATE = 0.00001
WEIGHT_DECAY = 0.0001
BETAS = (0.9, 0.999)
WARMUP_STEPS = 4000

# Seed every random source used below (masking / negative sampling use `random`,
# weight initialisation, dropout and shuffling use torch).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.cuda.empty_cache()

if __name__ == '__main__':
    print("Preparing dataset...")

    train_data = BERTDataset(train_texts, seq_len=MAX_LEN, tokenizer=tokenizer)
    test_data = BERTDataset(test_texts, seq_len=MAX_LEN, tokenizer=tokenizer)

    # Pinned host memory only helps host-to-GPU copies; skip it on CPU-only machines.
    use_pinned_memory = torch.cuda.is_available()
    train_loader = DataLoader(
        train_data, batch_size=BATCH_SIZE, shuffle=True, pin_memory=use_pinned_memory
    )
    test_loader = DataLoader(
        test_data, batch_size=BATCH_SIZE, shuffle=False, pin_memory=use_pinned_memory
    )


    print(f"Dataset loaded(train): {len(train_data)} samples.")
    print(f"Dataset loaded(test): {len(test_data)} samples.")

    print("Initializing model...")
    bert_model = BERT(
        vocab_size=len(tokenizer.vocab),
        d_model=1024,
        n_layers=24,
        heads=16,
        dropout=0.1
    )

    # Wrap the BERT model with language modeling components
    bert_lm = BERTLM(bert_model, len(tokenizer.vocab))
    bert_lm.to(device)

    # Initialize the trainer.
    # log_dir is passed so the LOG_DIR created above actually receives the TensorBoard
    # scalars. checkpoint_dir is deliberately left unset: checkpoints are written by the
    # loop below, once per epoch.
    bert_trainer = BERTTrainer(
        model=bert_lm,
        train_dataloader=train_loader,
        test_dataloader=test_loader,
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        betas=BETAS,
        warmup_steps=WARMUP_STEPS,
        log_freq=10,
        log_dir=LOG_DIR,
        device=device
    )

    print("Starting training...")
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")

        # Train
        bert_trainer.train(epoch)

        # Test
        print("Evaluating on test data...")
        bert_trainer.test(epoch)

        # Save checkpoint after each epoch
        checkpoint_path = CHECKPOINT_DIR.joinpath(f"bert_checkpoint_epoch_{epoch + 1}.pt")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': bert_lm.state_dict(),
            'optimizer_state_dict': bert_trainer.optim.state_dict(),
        }, checkpoint_path)
        print(f"Checkpoint saved: {checkpoint_path}")

    print("Training complete.")


Preparing dataset...
Dataset loaded(train): 1740 samples.
Dataset loaded(test): 436 samples.
Initializing model...
Total Parameters: 351476179
Starting training...
Epoch 1/25


EP_train:0: 100%|| 55/55 [01:00<00:00,  1.11s/it]


EP0, train: avg_loss=8.00240068435669, total_acc=48.67816091954023
Evaluating on test data...


EP_test:0: 100%|| 14/14 [00:05<00:00,  2.53it/s]


EP0, test: avg_loss=6.012485810688564, total_acc=48.1651376146789
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_1.pt
Epoch 2/25


EP_train:1: 100%|| 55/55 [01:02<00:00,  1.14s/it]


EP1, train: avg_loss=5.145532950488004, total_acc=50.804597701149426
Evaluating on test data...


EP_test:1: 100%|| 14/14 [00:05<00:00,  2.37it/s]


EP1, test: avg_loss=4.4224024670464654, total_acc=50.91743119266055
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_2.pt
Epoch 3/25


EP_train:2: 100%|| 55/55 [01:04<00:00,  1.18s/it]


EP2, train: avg_loss=4.081280127438632, total_acc=47.35632183908046
Evaluating on test data...


EP_test:2: 100%|| 14/14 [00:06<00:00,  2.30it/s]


EP2, test: avg_loss=3.8755704164505005, total_acc=49.31192660550459
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_3.pt
Epoch 4/25


EP_train:3: 100%|| 55/55 [01:05<00:00,  1.19s/it]


EP3, train: avg_loss=3.5977952350269664, total_acc=51.724137931034484
Evaluating on test data...


EP_test:3: 100%|| 14/14 [00:06<00:00,  2.18it/s]


EP3, test: avg_loss=3.018637844494411, total_acc=47.018348623853214
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_4.pt
Epoch 5/25


EP_train:4: 100%|| 55/55 [01:06<00:00,  1.21s/it]


EP4, train: avg_loss=nan, total_acc=49.08045977011494
Evaluating on test data...


EP_test:4: 100%|| 14/14 [00:06<00:00,  2.23it/s]


EP4, test: avg_loss=2.7191762157848904, total_acc=52.293577981651374
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_5.pt
Epoch 6/25


EP_train:5: 100%|| 55/55 [01:06<00:00,  1.21s/it]


EP5, train: avg_loss=2.7386115529320456, total_acc=51.55172413793103
Evaluating on test data...


EP_test:5: 100%|| 14/14 [00:06<00:00,  2.26it/s]


EP5, test: avg_loss=2.75054863521031, total_acc=47.018348623853214
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_6.pt
Epoch 7/25


EP_train:6: 100%|| 55/55 [01:06<00:00,  1.21s/it]


EP6, train: avg_loss=2.7702351223338733, total_acc=48.160919540229884
Evaluating on test data...


EP_test:6: 100%|| 14/14 [00:06<00:00,  2.26it/s]


EP6, test: avg_loss=2.918459517615182, total_acc=51.60550458715596
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_7.pt
Epoch 8/25


EP_train:7: 100%|| 55/55 [01:05<00:00,  1.20s/it]


EP7, train: avg_loss=2.6796715172854335, total_acc=53.333333333333336
Evaluating on test data...


EP_test:7: 100%|| 14/14 [00:06<00:00,  2.23it/s]


EP7, test: avg_loss=2.48905086517334, total_acc=48.85321100917431
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_8.pt
Epoch 9/25


EP_train:8: 100%|| 55/55 [01:06<00:00,  1.20s/it]


EP8, train: avg_loss=2.5668770616704766, total_acc=50.86206896551724
Evaluating on test data...


EP_test:8: 100%|| 14/14 [00:06<00:00,  2.28it/s]


EP8, test: avg_loss=2.7404660752841403, total_acc=51.60550458715596
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_9.pt
Epoch 10/25


EP_train:9: 100%|| 55/55 [01:06<00:00,  1.20s/it]


EP9, train: avg_loss=2.6264140367507935, total_acc=49.94252873563219
Evaluating on test data...


EP_test:9: 100%|| 14/14 [00:06<00:00,  2.27it/s]


EP9, test: avg_loss=2.5108004212379456, total_acc=49.31192660550459
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_10.pt
Epoch 11/25


EP_train:10: 100%|| 55/55 [01:06<00:00,  1.20s/it]


EP10, train: avg_loss=2.592329257184809, total_acc=48.04597701149425
Evaluating on test data...


EP_test:10: 100%|| 14/14 [00:06<00:00,  2.28it/s]


EP10, test: avg_loss=2.56490319115775, total_acc=46.330275229357795
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_11.pt
Epoch 12/25


EP_train:11: 100%|| 55/55 [01:06<00:00,  1.20s/it]


EP11, train: avg_loss=2.6177465850656683, total_acc=51.89655172413793
Evaluating on test data...


EP_test:11: 100%|| 14/14 [00:06<00:00,  2.28it/s]


EP11, test: avg_loss=2.4651074154036388, total_acc=47.018348623853214
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_12.pt
Epoch 13/25


EP_train:12: 100%|| 55/55 [01:06<00:00,  1.21s/it]


EP12, train: avg_loss=2.6072197372263126, total_acc=50.3448275862069
Evaluating on test data...


EP_test:12: 100%|| 14/14 [00:06<00:00,  2.28it/s]


EP12, test: avg_loss=2.487783508641379, total_acc=52.293577981651374
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_13.pt
Epoch 14/25


EP_train:13: 100%|| 55/55 [01:06<00:00,  1.20s/it]


EP13, train: avg_loss=3.0318803136998955, total_acc=52.58620689655172
Evaluating on test data...


EP_test:13: 100%|| 14/14 [00:06<00:00,  2.23it/s]


EP13, test: avg_loss=3.1116199663707187, total_acc=49.54128440366973
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_14.pt
Epoch 15/25


EP_train:14: 100%|| 55/55 [01:05<00:00,  1.20s/it]


EP14, train: avg_loss=2.9767525326121937, total_acc=50.229885057471265
Evaluating on test data...


EP_test:14: 100%|| 14/14 [00:06<00:00,  2.28it/s]


EP14, test: avg_loss=3.028103862489973, total_acc=53.669724770642205
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_15.pt
Epoch 16/25


EP_train:15: 100%|| 55/55 [01:05<00:00,  1.20s/it]


EP15, train: avg_loss=3.0352279099551116, total_acc=48.5632183908046
Evaluating on test data...


EP_test:15: 100%|| 14/14 [00:06<00:00,  2.29it/s]


EP15, test: avg_loss=2.944752642086574, total_acc=46.330275229357795
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_16.pt
Epoch 17/25


EP_train:16: 100%|| 55/55 [01:06<00:00,  1.20s/it]


EP16, train: avg_loss=2.943457026915117, total_acc=51.03448275862069
Evaluating on test data...


EP_test:16: 100%|| 14/14 [00:06<00:00,  2.27it/s]


EP16, test: avg_loss=2.938149298940386, total_acc=45.642201834862384
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_17.pt
Epoch 18/25


EP_train:17: 100%|| 55/55 [01:05<00:00,  1.20s/it]


EP17, train: avg_loss=2.9085024096749046, total_acc=48.04597701149425
Evaluating on test data...


EP_test:17: 100%|| 14/14 [00:06<00:00,  2.29it/s]


EP17, test: avg_loss=2.978936570031302, total_acc=48.85321100917431
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_18.pt
Epoch 19/25


EP_train:18: 100%|| 55/55 [01:05<00:00,  1.20s/it]


EP18, train: avg_loss=3.0207162727009167, total_acc=49.02298850574713
Evaluating on test data...


EP_test:18: 100%|| 14/14 [00:06<00:00,  2.30it/s]


EP18, test: avg_loss=2.849725638117109, total_acc=50.45871559633027
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_19.pt
Epoch 20/25


EP_train:19: 100%|| 55/55 [01:05<00:00,  1.19s/it]


EP19, train: avg_loss=2.943289635398171, total_acc=50.632183908045974
Evaluating on test data...


EP_test:19: 100%|| 14/14 [00:06<00:00,  2.23it/s]


EP19, test: avg_loss=3.094244803701128, total_acc=50.91743119266055
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_20.pt
Epoch 21/25


EP_train:20: 100%|| 55/55 [01:06<00:00,  1.21s/it]


EP20, train: avg_loss=2.898504801229997, total_acc=48.793103448275865
Evaluating on test data...


EP_test:20: 100%|| 14/14 [00:06<00:00,  2.20it/s]


EP20, test: avg_loss=2.9679926804133823, total_acc=46.10091743119266
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_21.pt
Epoch 22/25


EP_train:21: 100%|| 55/55 [01:07<00:00,  1.22s/it]


EP21, train: avg_loss=3.003139517524026, total_acc=48.67816091954023
Evaluating on test data...


EP_test:21: 100%|| 14/14 [00:06<00:00,  2.09it/s]


EP21, test: avg_loss=2.8486708062035695, total_acc=49.31192660550459
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_22.pt
Epoch 23/25


EP_train:22: 100%|| 55/55 [01:07<00:00,  1.22s/it]


EP22, train: avg_loss=2.992994512211193, total_acc=49.42528735632184
Evaluating on test data...


EP_test:22: 100%|| 14/14 [00:06<00:00,  2.25it/s]


EP22, test: avg_loss=2.893943565232413, total_acc=50.0
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_23.pt
Epoch 24/25


EP_train:23: 100%|| 55/55 [01:07<00:00,  1.23s/it]


EP23, train: avg_loss=2.9158073035153476, total_acc=49.59770114942529
Evaluating on test data...


EP_test:23: 100%|| 14/14 [00:06<00:00,  2.22it/s]


EP23, test: avg_loss=3.053294931139265, total_acc=54.58715596330275
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_24.pt
Epoch 25/25


EP_train:24: 100%|| 55/55 [01:07<00:00,  1.22s/it]


EP24, train: avg_loss=nan, total_acc=49.71264367816092
Evaluating on test data...


EP_test:24: 100%|| 14/14 [00:06<00:00,  2.27it/s]


EP24, test: avg_loss=2.9461527381624495, total_acc=44.95412844036697
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_25.pt
Training complete.


# **Training records for the BERT-Base model**

In [None]:
# Configuration and training driver for the BERT language-model run.
#
# Relies on names defined in earlier cells: `df`, `tokenizer`, `BERTDataset`,
# `BERT`, `BERTLM` and `BERTTrainer`.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    # For interactive environments like Jupyter, where __file__ is undefined
    BASE_DIR = Path(os.getcwd())

CHECKPOINT_DIR = BASE_DIR.joinpath('data/bert_checkpoints')
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Epoch seconds used to give every run its own log directory.
# `datetime.utcnow().timestamp()` is wrong on any host whose local timezone is
# not UTC: utcnow() returns a *naive* datetime and timestamp() interprets naive
# datetimes as local time, shifting the value by the UTC offset. A naive local
# `now()` round-trips to the correct epoch value.
timestamp = datetime.now().timestamp()
LOG_DIR = BASE_DIR.joinpath(f'data/logs/bert_experiment_{timestamp}')
LOG_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): LOG_DIR is created here but not handed to anything in this
# cell — confirm whether the trainer is expected to write logs into it.

MAX_LEN = 64
EPOCHS = 20
BATCH_SIZE = 32
LEARNING_RATE = 0.0001
WEIGHT_DECAY = 0.01
BETAS = (0.9, 0.999)
# NOTE(review): if BERTTrainer advances its warmup schedule once per batch,
# a run of EPOCHS * len(train_loader) steps may finish before warmup ends —
# verify against the scheduler implementation and dataset size.
WARMUP_STEPS = 10000

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.cuda.empty_cache()

if __name__ == '__main__':
    print("Preparing dataset...")

    train_data = BERTDataset(df, seq_len=MAX_LEN, tokenizer=tokenizer)
    # NOTE(review): built from the same `df` as train_data and not used in this
    # cell; kept so any later cell that reads `test_data` still finds it.
    test_data = BERTDataset(df, seq_len=MAX_LEN, tokenizer=tokenizer)
    train_loader = DataLoader(
        train_data,
        batch_size=BATCH_SIZE,
        shuffle=True,
        # Page-locked host memory only helps host-to-GPU copies; requesting it
        # on a CPU-only runtime is useless, so enable it only for CUDA.
        pin_memory=(device.type == "cuda"),
    )

    print(f"Dataset loaded: {len(train_data)} samples.")

    print("Initializing model...")
    bert_model = BERT(
        vocab_size=len(tokenizer.vocab),
        d_model=768,
        n_layers=2,
        heads=12,
        dropout=0.2
    )

    # Wrap the BERT model with language modeling components
    bert_lm = BERTLM(bert_model, len(tokenizer.vocab))
    bert_lm.to(device)

    # Initialize the trainer
    bert_trainer = BERTTrainer(
        model=bert_lm,
        train_dataloader=train_loader,
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        betas=BETAS,
        warmup_steps=WARMUP_STEPS,
        log_freq=10,
        device=device
    )

    print("Starting training...")
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        bert_trainer.train(epoch)

        # Save checkpoint after each epoch (file names are 1-based, while the
        # epoch index passed to the trainer is 0-based)
        checkpoint_path = CHECKPOINT_DIR.joinpath(f"bert_checkpoint_epoch_{epoch + 1}.pt")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': bert_lm.state_dict(),
            'optimizer_state_dict': bert_trainer.optim.state_dict(),
        }, checkpoint_path)
        print(f"Checkpoint saved: {checkpoint_path}")

    print("Training complete.")



Preparing dataset...
Dataset loaded: 2176 samples.
Initializing model...
Total Parameters: 14273345
Starting training...
Epoch 1/20


EP_train:0: 100%|| 68/68 [00:04<00:00, 14.72it/s]


EP0, train: avg_loss=4.75022161357543, total_acc=49.26470588235294
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_1.pt
Epoch 2/20


EP_train:1: 100%|| 68/68 [00:04<00:00, 16.00it/s]


EP1, train: avg_loss=3.8224641259978798, total_acc=51.0110294117647
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_2.pt
Epoch 3/20


EP_train:2: 100%|| 68/68 [00:04<00:00, 16.94it/s]


EP2, train: avg_loss=3.1147766674266144, total_acc=48.713235294117645
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_3.pt
Epoch 4/20


EP_train:3: 100%|| 68/68 [00:05<00:00, 12.30it/s]


EP3, train: avg_loss=2.867560814408695, total_acc=49.4485294117647
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_4.pt
Epoch 5/20


EP_train:4: 100%|| 68/68 [00:04<00:00, 13.90it/s]


EP4, train: avg_loss=2.6595516695695767, total_acc=47.748161764705884
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_5.pt
Epoch 6/20


EP_train:5: 100%|| 68/68 [00:04<00:00, 15.86it/s]


EP5, train: avg_loss=2.591632083934896, total_acc=51.424632352941174
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_6.pt
Epoch 7/20


EP_train:6: 100%|| 68/68 [00:04<00:00, 14.89it/s]


EP6, train: avg_loss=2.5894522316315594, total_acc=51.88419117647059
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_7.pt
Epoch 8/20


EP_train:7: 100%|| 68/68 [00:04<00:00, 15.74it/s]


EP7, train: avg_loss=2.408834546804428, total_acc=50.32169117647059
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_8.pt
Epoch 9/20


EP_train:8: 100%|| 68/68 [00:04<00:00, 15.82it/s]


EP8, train: avg_loss=2.3865403515451096, total_acc=48.943014705882355
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_9.pt
Epoch 10/20


EP_train:9: 100%|| 68/68 [00:04<00:00, 14.22it/s]


EP9, train: avg_loss=2.478575473322588, total_acc=50.275735294117645
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_10.pt
Epoch 11/20


EP_train:10: 100%|| 68/68 [00:04<00:00, 15.82it/s]


EP10, train: avg_loss=2.398171454668045, total_acc=48.2077205882353
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_11.pt
Epoch 12/20


EP_train:11: 100%|| 68/68 [00:04<00:00, 15.98it/s]


EP11, train: avg_loss=2.320494572905933, total_acc=49.54044117647059
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_12.pt
Epoch 13/20


EP_train:12: 100%|| 68/68 [00:05<00:00, 13.38it/s]


EP12, train: avg_loss=2.393186472794589, total_acc=49.67830882352941
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_13.pt
Epoch 14/20


EP_train:13: 100%|| 68/68 [00:04<00:00, 16.26it/s]


EP13, train: avg_loss=2.311805404284421, total_acc=48.713235294117645
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_14.pt
Epoch 15/20


EP_train:14: 100%|| 68/68 [00:04<00:00, 15.96it/s]


EP14, train: avg_loss=2.3287406441043403, total_acc=49.356617647058826
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_15.pt
Epoch 16/20


EP_train:15: 100%|| 68/68 [00:04<00:00, 13.81it/s]


EP15, train: avg_loss=2.387318514725741, total_acc=50.82720588235294
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_16.pt
Epoch 17/20


EP_train:16: 100%|| 68/68 [00:04<00:00, 16.18it/s]


EP16, train: avg_loss=2.399014848120072, total_acc=49.17279411764706
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_17.pt
Epoch 18/20


EP_train:17: 100%|| 68/68 [00:04<00:00, 15.99it/s]


EP17, train: avg_loss=2.3506765768808475, total_acc=50.18382352941177
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_18.pt
Epoch 19/20


EP_train:18: 100%|| 68/68 [00:04<00:00, 13.73it/s]


EP18, train: avg_loss=2.3088366634705486, total_acc=50.78125
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_19.pt
Epoch 20/20


EP_train:19: 100%|| 68/68 [00:04<00:00, 15.88it/s]


EP19, train: avg_loss=2.3829231490107143, total_acc=50.82720588235294
Checkpoint saved: /content/drive/MyDrive/final_project/data/bert_checkpoints/bert_checkpoint_epoch_20.pt
Training complete.
