In [None]:
# Mount Google Drive so that everything written under /content/drive is stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

# Create the project folder layout on Drive; `-p` makes each command a no-op
# when the folder already exists, so this cell can be re-run safely.
!mkdir -p /content/drive/MyDrive/email_bert
!mkdir -p /content/drive/MyDrive/email_bert/datasets
!mkdir -p /content/drive/MyDrive/email_bert/data
!mkdir -p /content/drive/MyDrive/email_bert/tokenizer
!mkdir -p /content/drive/MyDrive/email_bert/checkpoints

# Install third-party packages used by the later cells.
# NOTE(review): `wandb` is imported further down but is not in this list —
# presumably it is preinstalled in the runtime; confirm.
!pip install transformers datasets tokenizers pandas
# No separate `email-parser` package is needed: messages are parsed with Python's built-in `email` module.


Mounted at /content/drive


In [None]:

# Copy the Enron tarball from Drive (where it is kept) to the local Colab disk,
# then unpack it under /content/.
!cp  /content/drive/MyDrive/email_bert/datasets/enron_mail_20150507.tar.gz /content/enron_mail_20150507.tar.gz
!tar -xzf /content/enron_mail_20150507.tar.gz -C /content/

In [None]:

# Root folder that the corpus scan below walks (one sub-folder per user).
# NOTE(review): presumably this is where the tarball extracted above puts the mail folders — confirm.
enron_dir = '/content/maildir/'


# Data processing: Process email data instead of movie dialogues
import os
import email
import pandas as pd
from pathlib import Path
import torch
import re
import random
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import tqdm
from torch.utils.data import Dataset, DataLoader
import itertools
import math
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam
from email.parser import Parser
import json

def extract_email_content(email_path):
    """Read one raw email file and return its ``(subject, body)`` as cleaned text.

    The file is read as Latin-1 text (every byte maps to a character, so reading
    never fails on unusual bytes) and parsed with the standard ``email`` package.
    For multipart messages the body is taken from the first ``text/plain`` part
    that is not an attachment; otherwise it is the message payload itself.
    The payload is decoded with the charset declared by the part and falls back
    to Latin-1 when no (or an unusable) charset is declared. Every run of
    whitespace in both fields, newlines included, is collapsed to one space.

    Args:
        email_path: Path of the raw message file.

    Returns:
        tuple[str, str]: ``(subject, body)``. ``("", "")`` is returned when the
        file cannot be read or parsed, so a single bad file never aborts a scan
        over the whole corpus.
    """

    def decode_text(part):
        """Return the part's payload as text, honouring its declared charset."""
        raw = part.get_payload(decode=True)
        if not raw:
            return ""
        charset = part.get_content_charset() or 'latin1'
        try:
            return raw.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown charset name, or bytes that do not match the declaration:
            # Latin-1 can decode any byte sequence.
            return raw.decode('latin1', errors='ignore')

    try:
        with open(email_path, 'r', encoding='latin1') as f:
            msg = email.message_from_string(f.read())

        # str() guards against header objects that are not plain strings.
        subject = str(msg.get('Subject', '') or '')

        body = ""
        if msg.is_multipart():
            for part in msg.walk():
                disposition = str(part.get('Content-Disposition'))
                is_plain_text = part.get_content_type() == 'text/plain'
                # Skip attachments; stop at the first inline plain-text part.
                if is_plain_text and 'attachment' not in disposition:
                    body = decode_text(part)
                    break
        else:
            body = decode_text(msg)

        # Clean text - collapse all whitespace runs to single spaces
        body = re.sub(r'\s+', ' ', body).strip()
        subject = re.sub(r'\s+', ' ', subject).strip()

        return subject, body
    except Exception:
        # Deliberate best effort: an unreadable or malformed file yields an
        # empty result, which callers treat as "skip this email".
        return "", ""

# Process Enron emails
print("Processing Enron emails...")
MAX_LEN = 128  # maximum number of whitespace-separated words kept per segment
# Alternative corpus location when the archive is extracted inside Drive:
#enron_dir = '/content/drive/MyDrive/email_bert/datasets/enron_mail_20150507/maildir/'
email_pairs = []

# Sampling limits that keep the scan small.
MAX_USERS = 30
MAX_FOLDERS_PER_USER = 5
MAX_FILES_PER_FOLDER = 50


def _sorted_entries(directory, limit):
    """Return the first ``limit`` entry names of ``directory`` in sorted order.

    ``os.listdir`` returns names in arbitrary order, so sorting before slicing
    is what makes the sampled subset identical from run to run.
    """
    return sorted(os.listdir(directory))[:limit]


def _split_into_segments(body):
    """Split an email body into paragraphs or, failing that, into sentences.

    NOTE: ``extract_email_content`` collapses every whitespace run (newlines
    included) to one space, so ``body`` normally contains no ``'\\n'`` and the
    sentence branch is the one that runs.
    """
    paragraphs = [p.strip() for p in body.split('\n') if p.strip()]
    if len(paragraphs) >= 2:
        return paragraphs

    # Split after sentence-final punctuation but keep it attached to the
    # sentence, so '.', '!' and '?' stay in the text the tokenizer is trained
    # on and tokens such as "a.m" or e-mail addresses are not cut apart.
    sentences = re.split(r'(?<=[.!?])\s+', body)
    return [s.strip() for s in sentences if len(s.strip()) > 20]


def _consecutive_pairs(segments, max_words):
    """Build ``[first, second]`` pairs from neighbouring segments.

    A pair is kept only when both segments have more than three words; each
    segment is truncated to ``max_words`` words.
    """
    pairs = []
    for first, second in zip(segments, segments[1:]):
        first_words = first.split()
        second_words = second.split()
        if len(first_words) > 3 and len(second_words) > 3:
            pairs.append([
                ' '.join(first_words[:max_words]),
                ' '.join(second_words[:max_words])
            ])
    return pairs


# Limit to a sample to avoid processing too many emails
users = _sorted_entries(enron_dir, MAX_USERS)

for user in users:
    user_dir = os.path.join(enron_dir, user)
    if not os.path.isdir(user_dir):
        continue

    for folder in _sorted_entries(user_dir, MAX_FOLDERS_PER_USER):
        folder_path = os.path.join(user_dir, folder)
        if not os.path.isdir(folder_path):
            continue

        for file in _sorted_entries(folder_path, MAX_FILES_PER_FOLDER):
            file_path = os.path.join(folder_path, file)
            if not os.path.isfile(file_path):
                continue

            subject, body = extract_email_content(file_path)

            # Skip empty emails
            if not body:
                continue

            # Positive NSP examples: consecutive segments of the same email
            email_pairs.extend(_consecutive_pairs(_split_into_segments(body), MAX_LEN))

print(f"Created {len(email_pairs)} email pairs")
# Sample one pair to verify (guarded so an empty corpus does not raise IndexError)
if email_pairs:
    print("Sample pair:", email_pairs[20] if len(email_pairs) > 20 else email_pairs[0])
else:
    print("Sample pair: <none - no usable emails were found>")

# Save pairs to Drive for reuse
import pickle
with open('/content/drive/MyDrive/email_bert/datasets/email_pairs.pkl', 'wb') as f:
    pickle.dump(email_pairs, f)

# Save data as txt files for tokenizer training, 10K segments per file
TEXT_DIR = Path('/content/drive/MyDrive/email_bert/data')
SAMPLES_PER_FILE = 10000
TEXT_DIR.mkdir(parents=True, exist_ok=True)

all_segments = [x[0] for x in email_pairs] + [x[1] for x in email_pairs]

# Only the files written in THIS run are handed to the tokenizer. Globbing the
# folder would also pick up text_*.txt shards left behind by earlier runs that
# produced more data, silently mixing stale text into tokenizer training.
paths = []
chunk_starts = range(0, len(all_segments), SAMPLES_PER_FILE)
for file_count, start in enumerate(tqdm.tqdm(chunk_starts)):
    chunk_path = TEXT_DIR / f'text_{file_count}.txt'
    with open(chunk_path, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(all_segments[start:start + SAMPLES_PER_FILE]))
    paths.append(str(chunk_path))

print(f"Created {len(paths)} text files for tokenizer training")

# Train a WordPiece tokenizer on the text shards listed in `paths`
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=2,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    # '[PAD]' comes first so it receives id 0, which the model and the losses
    # below treat as "padding / ignore".
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
)

# Save tokenizer to Drive (writes bert-email-vocab.txt into the folder)
tokenizer.save_model('/content/drive/MyDrive/email_bert/tokenizer', 'bert-email')

# Reload the vocabulary through the transformers API
from transformers import BertTokenizerFast

# The normalisation options must mirror the ones used for training above.
# BertTokenizerFast names them `do_lower_case` / `tokenize_chinese_chars`;
# `lowercase=` is not one of its arguments.
tokenizer = BertTokenizerFast(
    vocab_file="/content/drive/MyDrive/email_bert/tokenizer/bert-email-vocab.txt",
    do_lower_case=True,
    tokenize_chinese_chars=False,
    strip_accents=False
)

# Test tokenizer
sample_text = "This is an email about the project deadline tomorrow."
token_ids = tokenizer(sample_text)['input_ids']
print("Sample tokenization:", tokenizer.convert_ids_to_tokens(token_ids))


import os
import math
import random
import itertools
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import tqdm
import wandb

# =====================================
# 1) BERTDataset (same interface as before)
# =====================================

class BERTDataset(Dataset):
    """Dataset producing BERT pre-training examples (MLM + NSP) from text pairs.

    Each element of ``data_pair`` is a ``[first, second]`` pair of consecutive
    text segments. An item is a dict of tensors:

    * ``bert_input``    - ``[CLS] first [SEP] second [SEP]`` plus padding, length ``seq_len``
    * ``bert_label``    - original token id at positions chosen for MLM, 0 elsewhere
    * ``segment_label`` - 1 for the first segment, 2 for the second, 0 for padding
    * ``is_next``       - 1 if ``second`` really follows ``first``, else 0

    Args:
        data_pair: Sequence of ``[first, second]`` string pairs.
        tokenizer: Callable returning ``{'input_ids': [CLS, ..., SEP]}`` for a
            string, and exposing a ``vocab`` mapping with the special tokens.
        seq_len: Fixed length of every returned sequence.
    """

    # Label meaning "nothing to predict here"; the MLM loss is created with
    # ignore_index=0, and the model treats token id 0 as padding.
    IGNORE_LABEL = 0

    def __init__(self, data_pair, tokenizer, seq_len=256):
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.corpus_lines = len(data_pair)
        self.lines = data_pair

        # Read the vocabulary once. On Hugging Face fast tokenizers `.vocab`
        # builds a fresh dict on every access, so looking it up per item or
        # per masked word dominates data-loading time.
        vocab = tokenizer.vocab
        self.cls_id = vocab['[CLS]']
        self.sep_id = vocab['[SEP]']
        self.pad_id = vocab['[PAD]']
        self.mask_id = vocab['[MASK]']
        self.vocab_size = len(vocab)

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):
        # Step 1: get email pair, either negative or positive (for NSP)
        t1, t2, is_next_label = self.get_sent(item)

        # Step 2: replace random words with mask / random words (for MLM)
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # Step 3: shorten the pair so both segments and all three special
        # tokens fit; blind truncation of the joined sequence could drop the
        # final [SEP] or the whole second segment.
        self._truncate_pair(t1_random, t1_label, t2_random, t2_label)

        # Step 4: add special tokens (CLS, SEP); they are never prediction targets
        ignore = self.IGNORE_LABEL
        bert_input = [self.cls_id] + t1_random + [self.sep_id] + t2_random + [self.sep_id]
        bert_label = [ignore] + t1_label + [ignore] + t2_label + [ignore]
        segment_label = [1] * (len(t1_random) + 2) + [2] * (len(t2_random) + 1)

        # Only relevant for seq_len < 3, where even the special tokens overflow
        bert_input = bert_input[:self.seq_len]
        bert_label = bert_label[:self.seq_len]
        segment_label = segment_label[:self.seq_len]

        # Step 5: pad to the fixed length
        padding_len = self.seq_len - len(bert_input)
        if padding_len > 0:
            bert_input.extend([self.pad_id] * padding_len)
            bert_label.extend([ignore] * padding_len)
            segment_label.extend([0] * padding_len)

        output = {
            "bert_input": bert_input,
            "bert_label": bert_label,
            "segment_label": segment_label,
            "is_next": is_next_label
        }

        return {key: torch.tensor(value) for key, value in output.items()}

    def _truncate_pair(self, t1, t1_label, t2, t2_label):
        """Trim the longer segment (tokens and labels together, in place) until
        both segments fit into ``seq_len`` alongside [CLS] and two [SEP]."""
        budget = max(self.seq_len - 3, 0)
        while len(t1) + len(t2) > budget:
            if len(t1) > len(t2):
                t1.pop()
                t1_label.pop()
            else:
                t2.pop()
                t2_label.pop()

    def random_word(self, sentence):
        """Tokenize ``sentence`` word by word and apply BERT's masking scheme.

        - 15% of words are selected for MLM
        - of those: 80% -> [MASK], 10% -> random token id, 10% -> unchanged
        All word pieces of a selected word are treated alike.

        Returns:
            tuple[list[int], list[int]]: ``(token_ids, labels)`` of equal
            length; a label is the original token id where a prediction is
            required and 0 elsewhere.
        """
        output = []
        output_label = []

        for token in sentence.split():
            prob = random.random()

            token_ids = self.tokenizer(token)['input_ids']
            # token_ids: [CLS, ..., SEP]; just [CLS, SEP] means nothing usable
            if len(token_ids) <= 2:
                continue

            token_ids = token_ids[1:-1]  # remove special tokens

            if prob < 0.15:
                prob /= 0.15

                # 80%: replace with [MASK]
                if prob < 0.8:
                    output.extend([self.mask_id] * len(token_ids))
                # 10%: random token
                elif prob < 0.9:
                    for _ in range(len(token_ids)):
                        output.append(random.randrange(self.vocab_size))
                # 10%: keep original
                else:
                    output.extend(token_ids)

                output_label.extend(token_ids)
            else:
                # no prediction for this token
                output.extend(token_ids)
                output_label.extend([self.IGNORE_LABEL] * len(token_ids))

        return output, output_label

    def get_sent(self, index):
        """Return email segment pair - either positive or negative example for NSP."""
        t1, t2 = self.get_corpus_line(index)

        # 50% chance for positive / negative
        if random.random() > 0.5:
            return t1, t2, 1  # Positive example
        else:
            return t1, self.get_random_line(), 0  # Negative example

    def get_corpus_line(self, item):
        """Return segment pair from corpus."""
        return self.lines[item][0], self.lines[item][1]

    def get_random_line(self):
        """Return a random segment (either side of a random pair)."""
        i = random.randrange(len(self.lines))
        return self.lines[i][random.choice([0, 1])]


# ==========================
# 2) Embeddings & Encoder
# ==========================

class PositionalEmbedding(torch.nn.Module):
    """Fixed (non-trainable) sinusoidal position encoding.

    For position ``pos`` and pair index ``k`` (``0 <= 2k < d_model``)::

        pe[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
        pe[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

    so the sine and cosine of one pair share a frequency. The earlier loop
    used ``2 * i`` as exponent numerator while ``i`` already advanced in steps
    of two, which doubled the exponent and gave the cosine a different
    frequency than its sine partner.

    The table is registered as the buffer ``pe`` and is therefore part of the
    state dict: loading a checkpoint restores the table that checkpoint was
    trained with.

    Args:
        d_model: Embedding width.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=128):
        super().__init__()
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)   # (max_len, 1)
        # 1 / 10000 ** (2k / d_model), computed in log space for stability
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        )
        angles = position * inv_freq                                       # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model).float()
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # shape: (1, max_len, d_model); a buffer is never a trainable parameter
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return the encodings for the first ``x.size(1)`` positions.

        Only the second dimension of ``x`` is used. The result has shape
        ``(1, x.size(1), d_model)`` and broadcasts over the batch dimension.
        """
        seq_len = x.size(1)
        return self.pe[:, :seq_len, :]


class BERTEmbedding(torch.nn.Module):
    """Input embedding of the model: the sum of three embeddings plus dropout.

    * ``token``    - learned lookup over the vocabulary (index 0 is padding)
    * ``segment``  - learned lookup with three entries (index 0 is padding)
    * ``position`` - ``PositionalEmbedding`` covering ``seq_len`` positions
    """

    def __init__(self, vocab_size, embed_size, seq_len=256, dropout=0.1):
        super().__init__()
        nn = torch.nn
        self.embed_size = embed_size
        self.token = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.segment = nn.Embedding(3, embed_size, padding_idx=0)
        self.position = PositionalEmbedding(d_model=embed_size, max_len=seq_len)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, sequence, segment_label):
        """Embed ``sequence`` and ``segment_label`` (both indexed ``(batch, seq)``)."""
        # The positional term has a leading dimension of 1 and broadcasts
        # across the batch when added to the token embeddings.
        position_vectors = self.position(sequence)
        token_vectors = self.token(sequence)
        segment_vectors = self.segment(segment_label)

        combined = token_vectors + position_vectors
        combined = combined + segment_vectors
        return self.dropout(combined)


class MultiHeadedAttention(torch.nn.Module):
    """Multi-head scaled dot-product attention.

    ``d_model`` must be divisible by ``heads``; every head works on
    ``d_k = d_model // heads`` features. Positions where ``mask == 0`` receive
    a score of -1e9 before the softmax, i.e. practically no attention weight.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = torch.nn.Dropout(dropout)

        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)
        self.output_linear = torch.nn.Linear(d_model, d_model)

    def _to_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        batch_size = projected.size(0)
        return projected.view(batch_size, -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask):
        """Attend from ``query`` over ``key``/``value``; returns (batch, seq, d_model)."""
        q = self._to_heads(self.query(query))
        k = self._to_heads(self.key(key))
        v = self._to_heads(self.value(value))

        # (batch, heads, seq_q, seq_k) similarity scores, scaled by sqrt(d_k)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        scores = scores.masked_fill(mask == 0, -1e9)

        attention = self.dropout(F.softmax(scores, dim=-1))

        # Weighted values, then merge the heads back into one feature axis
        merged = torch.matmul(attention, v).transpose(1, 2).contiguous()
        merged = merged.view(merged.size(0), -1, self.heads * self.d_k)

        return self.output_linear(merged)


class FeedForward(torch.nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear."""

    def __init__(self, d_model, middle_dim=2048, dropout=0.1):
        super().__init__()
        nn = torch.nn
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        """Expand to ``middle_dim``, apply GELU and dropout, project back to ``d_model``."""
        hidden = self.fc1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)


class EncoderLayer(torch.nn.Module):
    """One Transformer encoder block (post-norm).

    Self-attention and the feed-forward network are each followed by dropout,
    a residual connection and layer normalisation.
    """

    def __init__(
        self,
        d_model=768,
        heads=12,
        feed_forward_hidden=768 * 4,
        dropout=0.1
    ):
        super().__init__()
        nn = torch.nn
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = FeedForward(d_model, middle_dim=feed_forward_hidden, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        """Apply the block to ``embeddings``; ``mask`` is passed on to the attention."""
        # Sub-layer 1: self-attention + residual + norm
        attention_out = self.self_multihead(embeddings, embeddings, embeddings, mask)
        attention_out = self.dropout(attention_out)
        normed_attention = self.layernorm1(attention_out + embeddings)

        # Sub-layer 2: feed-forward + residual + norm
        ff_out = self.dropout(self.feed_forward(normed_attention))
        return self.layernorm2(ff_out + normed_attention)


# ==========================
# 3) BERT backbone
# ==========================

class BERT(torch.nn.Module):
    """Encoder-only BERT backbone: input embedding followed by ``n_layers`` encoder blocks.

    The feed-forward width of every block is ``3 * d_model``.
    """

    def __init__(self, vocab_size, d_model=768, n_layers=10, heads=12, dropout=0.1, seq_len=64):
        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads

        # Slightly smaller FFN than the usual 4 * d_model
        self.feed_forward_hidden = d_model * 3

        self.embedding = BERTEmbedding(
            vocab_size=vocab_size,
            embed_size=d_model,
            seq_len=seq_len,
            dropout=dropout,
        )

        blocks = torch.nn.ModuleList()
        for _ in range(n_layers):
            blocks.append(
                EncoderLayer(
                    d_model=d_model,
                    heads=heads,
                    feed_forward_hidden=self.feed_forward_hidden,
                    dropout=dropout,
                )
            )
        self.encoder_blocks = blocks

    def forward(self, x, segment_info):
        """Encode token ids ``x`` with segment ids ``segment_info`` (both ``(batch, seq)``)."""
        # Token id 0 is treated as padding; the two inserted axes let the mask
        # broadcast over heads and query positions: (batch, 1, 1, seq).
        attention_mask = (x > 0)[:, None, None, :]

        hidden = self.embedding(x, segment_info)
        for block in self.encoder_blocks:
            hidden = block(hidden, attention_mask)

        return hidden


# ==========================
# 4) Heads: NSP + MLM + BERTLM
# ==========================

class NextSentencePrediction(torch.nn.Module):
    """Two-class head that reads the first position of the encoder output."""

    def __init__(self, hidden):
        super().__init__()
        self.linear = torch.nn.Linear(hidden, 2)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return log-probabilities over the two classes, one row per batch item."""
        first_position = x[:, 0]   # the [CLS] slot
        logits = self.linear(first_position)
        return self.softmax(logits)


class MaskedLanguageModel(torch.nn.Module):
    """Vocabulary-sized head applied to every position of the encoder output."""

    def __init__(self, hidden, vocab_size):
        super().__init__()
        self.linear = torch.nn.Linear(hidden, vocab_size)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Map ``(batch, seq, hidden)`` to log-probabilities ``(batch, seq, vocab_size)``."""
        logits = self.linear(x)
        return self.softmax(logits)


class BERTLM(torch.nn.Module):
    """Pre-training model: a BERT backbone with an NSP head and an MLM head on top."""

    def __init__(self, bert: BERT, vocab_size):
        super().__init__()
        self.bert = bert
        hidden_size = self.bert.d_model
        self.next_sentence = NextSentencePrediction(hidden_size)
        self.mask_lm = MaskedLanguageModel(hidden_size, vocab_size)

    def forward(self, x, segment_label):
        """Return ``(nsp_log_probs, mlm_log_probs)`` for the given inputs."""
        encoded = self.bert(x, segment_label)
        nsp_log_probs = self.next_sentence(encoded)
        mlm_log_probs = self.mask_lm(encoded)
        return nsp_log_probs, mlm_log_probs


# ==========================
# 5) Optimizer + Trainer
# ==========================

class ScheduledOptim:
    """Optimizer wrapper applying a warm-up / inverse-square-root learning-rate schedule.

    At step ``s`` (counted from 1) the learning rate written to every
    parameter group is::

        d_model ** -0.5 * min(s ** -0.5, s * n_warmup_steps ** -1.5)
    """

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        """Advance the schedule by one step, then run the wrapped optimizer's step."""
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        """Forward to the wrapped optimizer's ``zero_grad``."""
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        """Schedule factor for the current step: linear ramp, then decay."""
        decay = np.power(self.n_current_steps, -0.5)
        ramp = np.power(self.n_warmup_steps, -1.5) * self.n_current_steps
        return np.min([decay, ramp])

    def _update_learning_rate(self):
        """Increment the step counter and write the new rate into all parameter groups."""
        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr


class BERTTrainer:
    """Runs BERT pre-training (NSP + MLM) and optional evaluation epochs.

    Args:
        model: Model with a ``bert.d_model`` attribute whose forward returns
            ``(nsp_log_probs, mlm_log_probs)`` for ``(bert_input, segment_label)``.
        train_dataloader: Loader yielding dicts with the keys ``bert_input``,
            ``bert_label``, ``segment_label`` and ``is_next``.
        test_dataloader: Optional loader of the same format for ``test``.
        lr: Initial Adam learning rate. NOTE: ``ScheduledOptim`` overwrites the
            learning rate of every parameter group on each step, so this value
            has no effect on training.
        weight_decay: Adam weight decay.
        betas: Adam betas.
        warmup_steps: Warm-up length of the learning-rate schedule.
        log_freq: Print running metrics every ``log_freq`` batches.
        device: Device the model and the batches are moved to.
    """

    def __init__(
        self,
        model,
        train_dataloader,
        test_dataloader=None,
        lr=1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=800,          # changed from 10000 to 800
        log_freq=10,
        device='cuda'
    ):
        self.device = device
        self.model = model.to(device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Optimizer + schedule
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, self.model.bert.d_model, n_warmup_steps=warmup_steps
        )

        # SEPARATE LOSSES:
        # - NSP: no ignore_index
        # - MLM: ignore_index=0 (PAD / non-predicted tokens)
        self.nsp_criterion = torch.nn.NLLLoss()
        self.mlm_criterion = torch.nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        """Run one training epoch over the training loader."""
        self.iteration(epoch, self.train_data, train=True)

    def test(self, epoch):
        """Run one evaluation epoch; does nothing when no test loader was given."""
        if self.test_data is not None:
            self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """Process every batch of ``data_loader`` once.

        With ``train=True`` the model is put in training mode and updated after
        each batch. With ``train=False`` the model is put in evaluation mode
        (dropout disabled) and gradient tracking is switched off, so the
        reported metrics are deterministic and no autograd memory is spent.
        """
        avg_loss = 0.0
        avg_nsp_loss = 0.0
        avg_mlm_loss = 0.0
        total_correct = 0
        total_element = 0

        mode = "train" if train else "test"

        # Select train/eval behaviour explicitly on every call, so a test pass
        # never leaves the model in eval mode for the next training epoch.
        self.model.train(train)

        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc=f"EP_{mode}:{epoch}",
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, batch in data_iter:
            # Move batch to device
            batch = {key: value.to(self.device) for key, value in batch.items()}

            with torch.set_grad_enabled(train):
                # Forward
                next_sent_output, mask_lm_output = self.model(
                    batch["bert_input"], batch["segment_label"]
                )

                # Losses (NLLLoss expects the class dimension second, hence the transpose)
                nsp_loss = self.nsp_criterion(next_sent_output, batch["is_next"])
                mlm_loss = self.mlm_criterion(
                    mask_lm_output.transpose(1, 2), batch["bert_label"]
                )
                loss = nsp_loss + mlm_loss

            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # NSP accuracy
            correct = next_sent_output.argmax(dim=-1).eq(batch["is_next"]).sum().item()
            total_correct += correct
            total_element += batch["is_next"].nelement()

            avg_loss += loss.item()
            avg_nsp_loss += nsp_loss.item()
            avg_mlm_loss += mlm_loss.item()

            if i % self.log_freq == 0:
                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "mode": mode,
                    "avg_loss": avg_loss / (i + 1),
                    "avg_nsp_loss": avg_nsp_loss / (i + 1),
                    "avg_mlm_loss": avg_mlm_loss / (i + 1),
                    "avg_acc": total_correct / total_element * 100,
                    "loss": loss.item()
                }
                data_iter.write(str(post_fix))

            # W&B logging occasionally
            if i % 300 == 0 and train:
                metrics = {
                    f"{mode}/avg_loss": avg_loss / (i + 1),
                    f"{mode}/avg_nsp_loss": avg_nsp_loss / (i + 1),
                    f"{mode}/avg_mlm_loss": avg_mlm_loss / (i + 1),
                    f"{mode}/avg_acc": total_correct / total_element * 100,
                }
                wandb.log(metrics)

        # Guard the epoch summary against an empty loader (no batches, no elements).
        n_batches = max(len(data_loader), 1)
        total_acc = total_correct * 100.0 / total_element if total_element else 0.0
        print(
            f"EP{epoch}, {mode}: "
            f"avg_loss={avg_loss / n_batches:.4f}, "
            f"nsp_loss={avg_nsp_loss / n_batches:.4f}, "
            f"mlm_loss={avg_mlm_loss / n_batches:.4f}, "
            f"total_acc={total_acc:.2f}"
        )


# ==========================
# 6) Save / load helpers
# ==========================

def save_model(model, optimizer, metrics, epoch, path):
    """Write a training checkpoint to ``path`` with ``torch.save``.

    The checkpoint is a dict with the keys ``model_state_dict``,
    ``optimizer_state_dict``, ``metric`` and ``epoch``.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'metric': metrics,
        'epoch': epoch
    }
    torch.save(checkpoint, path)

def load_model(model, optimizer=None, path='./checkpoint.pth'):
    """Restore a checkpoint written by ``save_model``.

    The file is loaded onto the CPU; the model weights are always restored,
    the optimizer state only when an optimizer is passed.

    Returns:
        tuple: ``(model, optimizer, epoch, metrics)`` - the two objects that
        were passed in, plus the stored epoch and metrics.
    """
    state = torch.load(path, map_location='cpu')

    model.load_state_dict(state['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optimizer_state_dict'])

    return model, optimizer, state['epoch'], state['metric']


# ==========================
# 7) Build dataset & dataloader (using your email_pairs + tokenizer)
# ==========================

# Fixed length of every training sequence; same value as the per-segment word
# limit used when the email pairs were built.
MAX_LEN = 128  # same as before

# Wrap the email pairs in the MLM/NSP dataset and batch them in shuffled order.
train_data = BERTDataset(email_pairs, tokenizer, seq_len=MAX_LEN)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, pin_memory=True)

# Sanity check: pull one batch and print the shape of each tensor in it.
sample_batch = next(iter(train_loader))
print("Batch shapes:", {k: v.shape for k, v in sample_batch.items()})


# ==========================
# 8) Init W&B, Model, Trainer, Training Loop
# ==========================

# Login once at the start of notebook (you might have already done this)
# wandb.login()  # comment out if you don't want W&B tracking

# Start a Weights & Biases run; BERTTrainer.iteration calls wandb.log during training.
run = wandb.init(
    name="bert-email-pretraining-small",
    project="bert-email-project",
)

print("Initializing smaller BERT model (~100M params)...")
# The embedding table and the MLM head are both sized from the tokenizer vocabulary.
vocab_size = len(tokenizer.vocab)

bert_model = BERT(
    vocab_size=vocab_size,
    d_model=768,
    n_layers=10,   # smaller than 12
    heads=12,
    dropout=0.1,
    seq_len=MAX_LEN   # number of positions in the positional table; must cover the dataset's seq_len
)

# Backbone plus the NSP and MLM heads.
bert_lm = BERTLM(bert_model, vocab_size=vocab_size)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_trainer = BERTTrainer(
    bert_lm,
    train_loader,
    device=device,
    warmup_steps=800
)

# ==========================
# 9) Train
# ==========================

epochs = 5
ckpt_dir = '/content/drive/MyDrive/email_bert/checkpoints'
os.makedirs(ckpt_dir, exist_ok=True)

# Train for `epochs` epochs and write one checkpoint to Drive after each of them.
for epoch in range(epochs):
    bert_trainer.train(epoch)

    epoch_path = os.path.join(ckpt_dir, f'bert_email_epoch_{epoch}.pth')
    # The checkpoint holds the model and Adam state; the step counter kept by
    # the trainer's ScheduledOptim wrapper is not part of it.
    save_model(bert_lm, bert_trainer.optim, {"epoch": epoch}, epoch, epoch_path)
    print(f"Saved model after epoch {epoch} to {epoch_path}")

# Final model
final_path = '/content/drive/MyDrive/email_bert/bert_email_final_small.pth'
save_model(bert_lm, bert_trainer.optim, {"final_epoch": epochs}, epochs, final_path)
print(f"Pre-training complete. Final model saved to {final_path}")

# Close the Weights & Biases run started before training.
wandb.finish()

# Summary of where the artifacts were written.
print("\nModel and data files are saved to Google Drive in the following locations:")
print("- Dataset: /content/drive/MyDrive/email_bert/datasets/")
print("- Tokenizer: /content/drive/MyDrive/email_bert/tokenizer/")
print("- Checkpoints: /content/drive/MyDrive/email_bert/checkpoints/")
print("- Final model: /content/drive/MyDrive/email_bert/bert_email_final_small.pth")

Processing Enron emails...
Created 47680 email pairs
Sample pair: ['If you need anything just let me know', 'Thanks, Crystal AGENT CS/CS BOOKING REF Y4LBHX SCHWIEGER/JAMES ENRON 1400 SMITH HOUSTON TX 77002 DATE: APR 09 2001 SERVICE DATE FROM TO DEPART ARRIVE AIR FRANCE 08AUG PARIS COPENHAGEN 1250P 245P AF 2350 W WED CHARLES DE GAU COPENHAGEN APT EQP: BOEING 737-300 RESERVATION CONFIRMED AIR FRANCE 19AUG COPENHAGEN PARIS 105P 300P AF 2051 W SUN COPENHAGEN APT CHARLES DE GAU EQP: BOEING 737-500 RESERVATION CONFIRMED ===== Vitol Travel Services 1100 Louisiana Suite 3230 Houston, Texas 77002 Phone - 713-759-1444 Fax - 713-759-9006 __________________________________________________ Do You Yahoo']


100%|██████████| 95360/95360 [00:00<00:00, 727034.96it/s]


Created 16 text files for tokenizer training
Sample tokenization: ['[CLS]', 'this', 'is', 'an', 'email', 'about', 'the', 'project', 'deadline', 'tomorrow', '[UNK]', '[SEP]']
Batch shapes: {'bert_input': torch.Size([32, 128]), 'bert_label': torch.Size([32, 128]), 'segment_label': torch.Size([32, 128]), 'is_next': torch.Size([32])}


Initializing smaller BERT model (~100M params)...
Total Parameters: 105188402


EP_train:0:   0%|| 1/1490 [00:04<1:42:03,  4.11s/it]

{'epoch': 0, 'iter': 0, 'mode': 'train', 'avg_loss': 11.223855018615723, 'avg_nsp_loss': 0.7172810435295105, 'avg_mlm_loss': 10.506573677062988, 'avg_acc': 50.0, 'loss': 11.223855018615723}


EP_train:0:   1%|| 11/1490 [00:41<1:29:14,  3.62s/it]

{'epoch': 0, 'iter': 10, 'mode': 'train', 'avg_loss': 10.991244316101074, 'avg_nsp_loss': 0.7203357219696045, 'avg_mlm_loss': 10.27090870250355, 'avg_acc': 51.42045454545454, 'loss': 10.712124824523926}


EP_train:0:   1%|| 21/1490 [01:26<1:56:59,  4.78s/it]

{'epoch': 0, 'iter': 20, 'mode': 'train', 'avg_loss': 10.696552322024392, 'avg_nsp_loss': 0.769838293393453, 'avg_mlm_loss': 9.926714125133696, 'avg_acc': 48.660714285714285, 'loss': 10.287711143493652}


EP_train:0:   2%|| 31/1490 [02:05<1:40:20,  4.13s/it]

{'epoch': 0, 'iter': 30, 'mode': 'train', 'avg_loss': 10.457629326851137, 'avg_nsp_loss': 0.7603911219104644, 'avg_mlm_loss': 9.6972382760817, 'avg_acc': 48.28629032258064, 'loss': 9.58204460144043}


EP_train:0:   3%|| 41/1490 [02:46<1:33:34,  3.87s/it]

{'epoch': 0, 'iter': 40, 'mode': 'train', 'avg_loss': 10.262669679595202, 'avg_nsp_loss': 0.7457008129212914, 'avg_mlm_loss': 9.51696891319461, 'avg_acc': 48.39939024390244, 'loss': 9.229615211486816}


EP_train:0:   3%|| 51/1490 [03:26<1:29:56,  3.75s/it]

{'epoch': 0, 'iter': 50, 'mode': 'train', 'avg_loss': 10.121552018558278, 'avg_nsp_loss': 0.7388112696946836, 'avg_mlm_loss': 9.382740806130801, 'avg_acc': 49.26470588235294, 'loss': 9.812318801879883}


EP_train:0:   4%|| 61/1490 [04:07<1:36:43,  4.06s/it]

{'epoch': 0, 'iter': 60, 'mode': 'train', 'avg_loss': 9.956033534690983, 'avg_nsp_loss': 0.7335801759704215, 'avg_mlm_loss': 9.222453414416703, 'avg_acc': 50.153688524590166, 'loss': 8.66537094116211}


EP_train:0:   5%|| 71/1490 [04:48<1:33:26,  3.95s/it]

{'epoch': 0, 'iter': 70, 'mode': 'train', 'avg_loss': 9.837926582551338, 'avg_nsp_loss': 0.7313494942557643, 'avg_mlm_loss': 9.106577141184202, 'avg_acc': 50.176056338028175, 'loss': 9.066681861877441}


EP_train:0:   5%|| 81/1490 [05:28<1:26:59,  3.70s/it]

{'epoch': 0, 'iter': 80, 'mode': 'train', 'avg_loss': 9.69570600250621, 'avg_nsp_loss': 0.7274815557915487, 'avg_mlm_loss': 8.968224513677903, 'avg_acc': 50.46296296296296, 'loss': 8.73215103149414}


EP_train:0:   6%|| 91/1490 [06:08<1:28:47,  3.81s/it]

{'epoch': 0, 'iter': 90, 'mode': 'train', 'avg_loss': 9.56919433258392, 'avg_nsp_loss': 0.7272202889997881, 'avg_mlm_loss': 8.841974106463757, 'avg_acc': 50.480769230769226, 'loss': 8.57345962524414}


EP_train:0:   7%|| 101/1490 [06:47<1:26:38,  3.74s/it]

{'epoch': 0, 'iter': 100, 'mode': 'train', 'avg_loss': 9.467174265644339, 'avg_nsp_loss': 0.7319030360420151, 'avg_mlm_loss': 8.735271312222622, 'avg_acc': 50.92821782178218, 'loss': 8.538983345031738}


EP_train:0:   7%|| 111/1490 [07:28<1:33:28,  4.07s/it]

{'epoch': 0, 'iter': 110, 'mode': 'train', 'avg_loss': 9.357135248613787, 'avg_nsp_loss': 0.738309858081577, 'avg_mlm_loss': 8.61882546570924, 'avg_acc': 50.87274774774775, 'loss': 7.5191240310668945}


EP_train:0:   8%|| 121/1490 [08:08<1:28:30,  3.88s/it]

{'epoch': 0, 'iter': 120, 'mode': 'train', 'avg_loss': 9.26612542286392, 'avg_nsp_loss': 0.7454993483448816, 'avg_mlm_loss': 8.52062613116808, 'avg_acc': 50.72314049586777, 'loss': 8.160480499267578}


EP_train:0:   9%|| 131/1490 [08:48<1:29:50,  3.97s/it]

{'epoch': 0, 'iter': 130, 'mode': 'train', 'avg_loss': 9.167824312020803, 'avg_nsp_loss': 0.745356559753418, 'avg_mlm_loss': 8.422467799587103, 'avg_acc': 50.57251908396947, 'loss': 7.654578685760498}


EP_train:0:   9%|| 141/1490 [09:26<1:26:07,  3.83s/it]

{'epoch': 0, 'iter': 140, 'mode': 'train', 'avg_loss': 9.078051310059026, 'avg_nsp_loss': 0.7431012121498162, 'avg_mlm_loss': 8.33495012919108, 'avg_acc': 50.84219858156028, 'loss': 7.9031476974487305}


EP_train:0:  10%|| 151/1490 [10:07<1:27:47,  3.93s/it]

{'epoch': 0, 'iter': 150, 'mode': 'train', 'avg_loss': 8.998899532469693, 'avg_nsp_loss': 0.7408879221669885, 'avg_mlm_loss': 8.258011641091858, 'avg_acc': 50.724337748344375, 'loss': 7.85126256942749}


EP_train:0:  11%|| 161/1490 [10:46<1:29:03,  4.02s/it]

{'epoch': 0, 'iter': 160, 'mode': 'train', 'avg_loss': 8.931557101492556, 'avg_nsp_loss': 0.7384130273546491, 'avg_mlm_loss': 8.19314410227426, 'avg_acc': 50.524068322981364, 'loss': 6.544471263885498}


EP_train:0:  11%|| 171/1490 [11:25<1:24:00,  3.82s/it]

{'epoch': 0, 'iter': 170, 'mode': 'train', 'avg_loss': 8.875143413655241, 'avg_nsp_loss': 0.7362993415336163, 'avg_mlm_loss': 8.138844094081232, 'avg_acc': 50.877192982456144, 'loss': 8.300592422485352}


EP_train:0:  12%|| 181/1490 [12:05<1:22:36,  3.79s/it]

{'epoch': 0, 'iter': 180, 'mode': 'train', 'avg_loss': 8.832688982315485, 'avg_nsp_loss': 0.7389496713053456, 'avg_mlm_loss': 8.093739330439277, 'avg_acc': 50.776933701657455, 'loss': 7.6406378746032715}


EP_train:0:  13%|| 191/1490 [12:46<1:33:04,  4.30s/it]

{'epoch': 0, 'iter': 190, 'mode': 'train', 'avg_loss': 8.779158018022308, 'avg_nsp_loss': 0.7373222265568079, 'avg_mlm_loss': 8.041835814870465, 'avg_acc': 50.752617801047116, 'loss': 7.923823356628418}


EP_train:0:  13%|| 201/1490 [13:27<1:26:00,  4.00s/it]

{'epoch': 0, 'iter': 200, 'mode': 'train', 'avg_loss': 8.729704610150845, 'avg_nsp_loss': 0.736212558710753, 'avg_mlm_loss': 7.993492071901388, 'avg_acc': 50.99502487562189, 'loss': 7.790792942047119}


EP_train:0:  14%|| 211/1490 [14:10<1:29:20,  4.19s/it]

{'epoch': 0, 'iter': 210, 'mode': 'train', 'avg_loss': 8.690603837017765, 'avg_nsp_loss': 0.7339077639918756, 'avg_mlm_loss': 7.956696089975077, 'avg_acc': 51.095971563981045, 'loss': 8.0435152053833}


EP_train:0:  15%|| 221/1490 [14:49<1:19:51,  3.78s/it]

{'epoch': 0, 'iter': 220, 'mode': 'train', 'avg_loss': 8.657824533557461, 'avg_nsp_loss': 0.7358283411323755, 'avg_mlm_loss': 7.921996209416454, 'avg_acc': 51.003959276018094, 'loss': 7.792037010192871}


EP_train:0:  16%|| 231/1490 [15:29<1:25:09,  4.06s/it]

{'epoch': 0, 'iter': 230, 'mode': 'train', 'avg_loss': 8.631641278535257, 'avg_nsp_loss': 0.7363623741901282, 'avg_mlm_loss': 7.895278922407142, 'avg_acc': 51.041666666666664, 'loss': 7.980790615081787}


EP_train:0:  16%|| 241/1490 [16:09<1:19:54,  3.84s/it]

{'epoch': 0, 'iter': 240, 'mode': 'train', 'avg_loss': 8.601271385968474, 'avg_nsp_loss': 0.7386122622925216, 'avg_mlm_loss': 7.8626591397518935, 'avg_acc': 50.829875518672196, 'loss': 8.381003379821777}


EP_train:0:  17%|| 251/1490 [16:52<1:24:12,  4.08s/it]

{'epoch': 0, 'iter': 250, 'mode': 'train', 'avg_loss': 8.570665840133728, 'avg_nsp_loss': 0.7401599698807614, 'avg_mlm_loss': 7.83050589162515, 'avg_acc': 50.85906374501992, 'loss': 8.316558837890625}


EP_train:0:  18%|| 261/1490 [17:33<1:24:51,  4.14s/it]

{'epoch': 0, 'iter': 260, 'mode': 'train', 'avg_loss': 8.552506666073853, 'avg_nsp_loss': 0.7404188251129968, 'avg_mlm_loss': 7.8120878647113665, 'avg_acc': 50.89798850574713, 'loss': 8.20263671875}


EP_train:0:  18%|| 271/1490 [18:17<1:30:59,  4.48s/it]

{'epoch': 0, 'iter': 270, 'mode': 'train', 'avg_loss': 8.518018502590841, 'avg_nsp_loss': 0.7389411981255366, 'avg_mlm_loss': 7.779077332837995, 'avg_acc': 50.87638376383764, 'loss': 6.346707344055176}


EP_train:0:  19%|| 281/1490 [18:58<1:24:37,  4.20s/it]

{'epoch': 0, 'iter': 280, 'mode': 'train', 'avg_loss': 8.493241747927412, 'avg_nsp_loss': 0.7384251269157247, 'avg_mlm_loss': 7.754816650920067, 'avg_acc': 50.63389679715302, 'loss': 7.931824684143066}


EP_train:0:  20%|| 291/1490 [19:38<1:23:26,  4.18s/it]

{'epoch': 0, 'iter': 290, 'mode': 'train', 'avg_loss': 8.476344120051852, 'avg_nsp_loss': 0.7370294170690975, 'avg_mlm_loss': 7.739314733092318, 'avg_acc': 50.62285223367697, 'loss': 8.361968994140625}


EP_train:0:  20%|| 301/1490 [20:17<1:15:29,  3.81s/it]

{'epoch': 0, 'iter': 300, 'mode': 'train', 'avg_loss': 8.459890411541707, 'avg_nsp_loss': 0.7354411219441613, 'avg_mlm_loss': 7.724449319300858, 'avg_acc': 50.75789036544851, 'loss': 8.279341697692871}


EP_train:0:  21%|| 311/1490 [20:58<1:16:59,  3.92s/it]

{'epoch': 0, 'iter': 310, 'mode': 'train', 'avg_loss': 8.44446850896265, 'avg_nsp_loss': 0.7358453605336008, 'avg_mlm_loss': 7.708623179285473, 'avg_acc': 50.67323151125402, 'loss': 7.770530700683594}


EP_train:0:  22%|| 321/1490 [21:38<1:12:43,  3.73s/it]

{'epoch': 0, 'iter': 320, 'mode': 'train', 'avg_loss': 8.428997635470001, 'avg_nsp_loss': 0.735447652614748, 'avg_mlm_loss': 7.693550017763893, 'avg_acc': 50.77881619937694, 'loss': 8.11861801147461}


EP_train:0:  22%|| 331/1490 [22:18<1:13:55,  3.83s/it]

{'epoch': 0, 'iter': 330, 'mode': 'train', 'avg_loss': 8.410637478093726, 'avg_nsp_loss': 0.7342134422405969, 'avg_mlm_loss': 7.676424068266532, 'avg_acc': 50.80249244712991, 'loss': 7.7792134284973145}


EP_train:0:  23%|| 341/1490 [22:56<1:11:22,  3.73s/it]

{'epoch': 0, 'iter': 340, 'mode': 'train', 'avg_loss': 8.396429703732041, 'avg_nsp_loss': 0.7330519539519839, 'avg_mlm_loss': 7.663377780019363, 'avg_acc': 50.82478005865103, 'loss': 8.208969116210938}


EP_train:0:  24%|| 351/1490 [23:36<1:15:19,  3.97s/it]

{'epoch': 0, 'iter': 350, 'mode': 'train', 'avg_loss': 8.385867702994931, 'avg_nsp_loss': 0.7326682642993764, 'avg_mlm_loss': 7.653199467563901, 'avg_acc': 50.78347578347579, 'loss': 7.855780124664307}


EP_train:0:  24%|| 361/1490 [24:15<1:15:33,  4.02s/it]

{'epoch': 0, 'iter': 360, 'mode': 'train', 'avg_loss': 8.37387028691511, 'avg_nsp_loss': 0.7325658677687605, 'avg_mlm_loss': 7.6413044440779325, 'avg_acc': 50.77908587257618, 'loss': 8.152135848999023}


EP_train:0:  25%|| 371/1490 [24:54<1:10:50,  3.80s/it]

{'epoch': 0, 'iter': 370, 'mode': 'train', 'avg_loss': 8.36392705845383, 'avg_nsp_loss': 0.7338160295692094, 'avg_mlm_loss': 7.630111055554084, 'avg_acc': 50.70754716981132, 'loss': 7.710111141204834}


EP_train:0:  26%|| 381/1490 [25:34<1:15:52,  4.10s/it]

{'epoch': 0, 'iter': 380, 'mode': 'train', 'avg_loss': 8.352658415716776, 'avg_nsp_loss': 0.7330863768347292, 'avg_mlm_loss': 7.619572065007969, 'avg_acc': 50.74639107611548, 'loss': 7.748642444610596}


EP_train:0:  26%|| 391/1490 [26:17<1:12:38,  3.97s/it]

{'epoch': 0, 'iter': 390, 'mode': 'train', 'avg_loss': 8.341168960951784, 'avg_nsp_loss': 0.7325901890654698, 'avg_mlm_loss': 7.608578796581844, 'avg_acc': 50.72730179028133, 'loss': 8.37648868560791}


EP_train:0:  27%|| 401/1490 [26:57<1:12:40,  4.00s/it]

{'epoch': 0, 'iter': 400, 'mode': 'train', 'avg_loss': 8.333237625416972, 'avg_nsp_loss': 0.7319576558924078, 'avg_mlm_loss': 7.601279994793367, 'avg_acc': 50.646820448877804, 'loss': 8.188413619995117}


EP_train:0:  28%|| 411/1490 [27:38<1:11:56,  4.00s/it]

{'epoch': 0, 'iter': 410, 'mode': 'train', 'avg_loss': 8.322673256959938, 'avg_nsp_loss': 0.7315812355990537, 'avg_mlm_loss': 7.591092047030038, 'avg_acc': 50.57025547445255, 'loss': 7.775815963745117}


EP_train:0:  28%|| 421/1490 [28:22<1:14:17,  4.17s/it]

{'epoch': 0, 'iter': 420, 'mode': 'train', 'avg_loss': 8.31141072610778, 'avg_nsp_loss': 0.7307961989468463, 'avg_mlm_loss': 7.580614550946161, 'avg_acc': 50.504750593824234, 'loss': 8.446849822998047}


EP_train:0:  29%|| 431/1490 [29:03<1:11:39,  4.06s/it]

{'epoch': 0, 'iter': 430, 'mode': 'train', 'avg_loss': 8.297059649934349, 'avg_nsp_loss': 0.7304051577906597, 'avg_mlm_loss': 7.566654515100466, 'avg_acc': 50.435034802784216, 'loss': 8.060946464538574}


EP_train:0:  30%|| 441/1490 [29:42<1:13:09,  4.18s/it]

{'epoch': 0, 'iter': 440, 'mode': 'train', 'avg_loss': 8.29232335360953, 'avg_nsp_loss': 0.7297290779836053, 'avg_mlm_loss': 7.562594298062141, 'avg_acc': 50.48185941043084, 'loss': 8.112349510192871}


EP_train:0:  30%|| 451/1490 [30:21<1:11:53,  4.15s/it]

{'epoch': 0, 'iter': 450, 'mode': 'train', 'avg_loss': 8.284523519867012, 'avg_nsp_loss': 0.7293736346545082, 'avg_mlm_loss': 7.555149907283402, 'avg_acc': 50.45038802660754, 'loss': 7.01638650894165}


EP_train:0:  31%|| 461/1490 [31:02<1:06:45,  3.89s/it]

{'epoch': 0, 'iter': 460, 'mode': 'train', 'avg_loss': 8.277096024304305, 'avg_nsp_loss': 0.7294140948132165, 'avg_mlm_loss': 7.547681951212521, 'avg_acc': 50.4135032537961, 'loss': 8.330558776855469}


EP_train:0:  32%|| 471/1490 [31:41<1:04:30,  3.80s/it]

{'epoch': 0, 'iter': 470, 'mode': 'train', 'avg_loss': 8.268817807458769, 'avg_nsp_loss': 0.7291572548275035, 'avg_mlm_loss': 7.539660573258774, 'avg_acc': 50.484341825902334, 'loss': 8.192851066589355}


EP_train:0:  32%|| 481/1490 [32:23<1:08:26,  4.07s/it]

{'epoch': 0, 'iter': 480, 'mode': 'train', 'avg_loss': 8.257645396829394, 'avg_nsp_loss': 0.7294911334519575, 'avg_mlm_loss': 7.52815428295651, 'avg_acc': 50.44828482328483, 'loss': 8.239912986755371}


EP_train:0:  33%|| 491/1490 [33:02<1:00:45,  3.65s/it]

{'epoch': 0, 'iter': 490, 'mode': 'train', 'avg_loss': 8.254280099072428, 'avg_nsp_loss': 0.7292796733908643, 'avg_mlm_loss': 7.525000444012115, 'avg_acc': 50.337321792260695, 'loss': 8.362977027893066}


EP_train:0:  34%|| 501/1490 [33:42<1:06:12,  4.02s/it]

{'epoch': 0, 'iter': 500, 'mode': 'train', 'avg_loss': 8.24412830527909, 'avg_nsp_loss': 0.7288885855389212, 'avg_mlm_loss': 7.515239737466899, 'avg_acc': 50.330588822355296, 'loss': 7.751190662384033}


EP_train:0:  34%|| 511/1490 [34:23<1:06:29,  4.07s/it]

{'epoch': 0, 'iter': 510, 'mode': 'train', 'avg_loss': 8.233372188127671, 'avg_nsp_loss': 0.7284901306587189, 'avg_mlm_loss': 7.504882076248498, 'avg_acc': 50.34246575342466, 'loss': 7.789021015167236}


EP_train:0:  35%|| 521/1490 [35:06<1:07:44,  4.19s/it]

{'epoch': 0, 'iter': 520, 'mode': 'train', 'avg_loss': 8.225971670388718, 'avg_nsp_loss': 0.7285871246039525, 'avg_mlm_loss': 7.497384564661476, 'avg_acc': 50.3478886756238, 'loss': 7.279838562011719}


EP_train:0:  36%|| 531/1490 [35:47<1:03:34,  3.98s/it]

{'epoch': 0, 'iter': 530, 'mode': 'train', 'avg_loss': 8.222347774972574, 'avg_nsp_loss': 0.7284489706186701, 'avg_mlm_loss': 7.493898826354865, 'avg_acc': 50.33545197740112, 'loss': 8.233964920043945}


EP_train:0:  36%|| 541/1490 [36:28<1:02:20,  3.94s/it]

{'epoch': 0, 'iter': 540, 'mode': 'train', 'avg_loss': 8.219598914691128, 'avg_nsp_loss': 0.7280285512673876, 'avg_mlm_loss': 7.491570385458729, 'avg_acc': 50.36968576709797, 'loss': 7.97299337387085}


EP_train:0:  37%|| 551/1490 [37:07<1:01:45,  3.95s/it]

{'epoch': 0, 'iter': 550, 'mode': 'train', 'avg_loss': 8.212386663509584, 'avg_nsp_loss': 0.727582776719559, 'avg_mlm_loss': 7.484803909398683, 'avg_acc': 50.351633393829395, 'loss': 7.600668430328369}


EP_train:0:  38%|| 561/1490 [37:47<1:02:38,  4.05s/it]

{'epoch': 0, 'iter': 560, 'mode': 'train', 'avg_loss': 8.20570782629139, 'avg_nsp_loss': 0.727132442482015, 'avg_mlm_loss': 7.478575407289991, 'avg_acc': 50.40663992869875, 'loss': 7.962362766265869}


EP_train:0:  38%|| 571/1490 [38:30<1:02:00,  4.05s/it]

{'epoch': 0, 'iter': 570, 'mode': 'train', 'avg_loss': 8.194544575051543, 'avg_nsp_loss': 0.7267946719914519, 'avg_mlm_loss': 7.467749925503171, 'avg_acc': 50.47613835376532, 'loss': 6.2997050285339355}


EP_train:0:  39%|| 581/1490 [39:08<1:00:13,  3.98s/it]

{'epoch': 0, 'iter': 580, 'mode': 'train', 'avg_loss': 8.189856174685664, 'avg_nsp_loss': 0.7261697115463152, 'avg_mlm_loss': 7.463686484272837, 'avg_acc': 50.51635111876076, 'loss': 7.929230690002441}


EP_train:0:  40%|| 591/1490 [39:53<1:02:50,  4.19s/it]

{'epoch': 0, 'iter': 590, 'mode': 'train', 'avg_loss': 8.179388870845996, 'avg_nsp_loss': 0.7259029054197966, 'avg_mlm_loss': 7.453485985899536, 'avg_acc': 50.52876480541455, 'loss': 8.04755687713623}


EP_train:0:  40%|| 601/1490 [40:31<58:28,  3.95s/it]  

{'epoch': 0, 'iter': 600, 'mode': 'train', 'avg_loss': 8.178077838980219, 'avg_nsp_loss': 0.7256078865881173, 'avg_mlm_loss': 7.452469973318192, 'avg_acc': 50.50956738768719, 'loss': 8.419886589050293}


EP_train:0:  41%|| 611/1490 [41:10<59:12,  4.04s/it]

{'epoch': 0, 'iter': 610, 'mode': 'train', 'avg_loss': 8.174012095175087, 'avg_nsp_loss': 0.7253511494388362, 'avg_mlm_loss': 7.448660966027193, 'avg_acc': 50.56771685761048, 'loss': 7.747840404510498}


EP_train:0:  42%|| 621/1490 [41:55<1:06:37,  4.60s/it]

{'epoch': 0, 'iter': 620, 'mode': 'train', 'avg_loss': 8.169367572919565, 'avg_nsp_loss': 0.7251221386898735, 'avg_mlm_loss': 7.444245452850145, 'avg_acc': 50.58373590982287, 'loss': 7.511226654052734}


EP_train:0:  42%|| 631/1490 [42:34<56:00,  3.91s/it]

{'epoch': 0, 'iter': 630, 'mode': 'train', 'avg_loss': 8.166623635450748, 'avg_nsp_loss': 0.7246852295145557, 'avg_mlm_loss': 7.441938422939101, 'avg_acc': 50.63886687797148, 'loss': 8.000443458557129}


EP_train:0:  43%|| 641/1490 [43:12<53:48,  3.80s/it]

{'epoch': 0, 'iter': 640, 'mode': 'train', 'avg_loss': 8.161022385643351, 'avg_nsp_loss': 0.7245125431166424, 'avg_mlm_loss': 7.436509859171375, 'avg_acc': 50.59964898595943, 'loss': 7.937376499176025}


EP_train:0:  44%|| 651/1490 [43:54<56:35,  4.05s/it]

{'epoch': 0, 'iter': 650, 'mode': 'train', 'avg_loss': 8.155371842479559, 'avg_nsp_loss': 0.724661134629755, 'avg_mlm_loss': 7.430710722956972, 'avg_acc': 50.65284178187404, 'loss': 7.587891578674316}


EP_train:0:  44%|| 661/1490 [44:35<55:38,  4.03s/it]

{'epoch': 0, 'iter': 660, 'mode': 'train', 'avg_loss': 8.149964600936729, 'avg_nsp_loss': 0.7244618922787607, 'avg_mlm_loss': 7.42550272443832, 'avg_acc': 50.66187594553706, 'loss': 8.629693984985352}


EP_train:0:  45%|| 671/1490 [45:15<53:09,  3.89s/it]

{'epoch': 0, 'iter': 670, 'mode': 'train', 'avg_loss': 8.144855428203918, 'avg_nsp_loss': 0.7241402366122262, 'avg_mlm_loss': 7.420715207669845, 'avg_acc': 50.62872578241431, 'loss': 7.905218124389648}


EP_train:0:  46%|| 681/1490 [45:56<58:21,  4.33s/it]

{'epoch': 0, 'iter': 680, 'mode': 'train', 'avg_loss': 8.144849770219666, 'avg_nsp_loss': 0.7237818368906141, 'avg_mlm_loss': 7.421067949258633, 'avg_acc': 50.63325991189427, 'loss': 7.4387922286987305}


EP_train:0:  46%|| 691/1490 [46:35<47:45,  3.59s/it]

{'epoch': 0, 'iter': 690, 'mode': 'train', 'avg_loss': 8.164939524295878, 'avg_nsp_loss': 0.7237847722560038, 'avg_mlm_loss': 7.441154767048859, 'avg_acc': 50.56530390738061, 'loss': 8.016597747802734}


EP_train:0:  47%|| 701/1490 [47:14<51:13,  3.90s/it]

{'epoch': 0, 'iter': 700, 'mode': 'train', 'avg_loss': 8.163076923849239, 'avg_nsp_loss': 0.7236032517422283, 'avg_mlm_loss': 7.439473686136635, 'avg_acc': 50.53049215406562, 'loss': 8.081499099731445}


EP_train:0:  48%|| 711/1490 [47:52<47:10,  3.63s/it]

{'epoch': 0, 'iter': 710, 'mode': 'train', 'avg_loss': 8.159438207012021, 'avg_nsp_loss': 0.7232178202325953, 'avg_mlm_loss': 7.436220400444063, 'avg_acc': 50.527426160337555, 'loss': 8.019624710083008}


EP_train:0:  48%|| 721/1490 [48:33<51:23,  4.01s/it]

{'epoch': 0, 'iter': 720, 'mode': 'train', 'avg_loss': 8.15588016549691, 'avg_nsp_loss': 0.7228754699312864, 'avg_mlm_loss': 7.433004707966036, 'avg_acc': 50.511442441054086, 'loss': 7.698578834533691}


EP_train:0:  49%|| 731/1490 [49:15<52:21,  4.14s/it]

{'epoch': 0, 'iter': 730, 'mode': 'train', 'avg_loss': 8.152386580544196, 'avg_nsp_loss': 0.7226422785408031, 'avg_mlm_loss': 7.429744314071091, 'avg_acc': 50.48734610123119, 'loss': 7.412520885467529}


EP_train:0:  50%|| 741/1490 [49:56<49:58,  4.00s/it]

{'epoch': 0, 'iter': 740, 'mode': 'train', 'avg_loss': 8.149462678010968, 'avg_nsp_loss': 0.7223146707422821, 'avg_mlm_loss': 7.4271480203801, 'avg_acc': 50.50185560053981, 'loss': 8.035728454589844}


EP_train:0:  50%|| 751/1490 [50:37<49:26,  4.01s/it]

{'epoch': 0, 'iter': 750, 'mode': 'train', 'avg_loss': 8.149431013394292, 'avg_nsp_loss': 0.7220444244328891, 'avg_mlm_loss': 7.4273866013426595, 'avg_acc': 50.42027296937417, 'loss': 8.354372024536133}


EP_train:0:  51%|| 761/1490 [51:17<49:52,  4.11s/it]

{'epoch': 0, 'iter': 760, 'mode': 'train', 'avg_loss': 8.145660404150181, 'avg_nsp_loss': 0.7217076508506995, 'avg_mlm_loss': 7.4239527643431815, 'avg_acc': 50.45170827858082, 'loss': 7.6417341232299805}


EP_train:0:  52%|| 771/1490 [51:54<42:53,  3.58s/it]

{'epoch': 0, 'iter': 770, 'mode': 'train', 'avg_loss': 8.143703766833948, 'avg_nsp_loss': 0.7213923844536609, 'avg_mlm_loss': 7.4223113938219045, 'avg_acc': 50.48232814526589, 'loss': 7.648343086242676}


EP_train:0:  52%|| 781/1490 [52:38<55:51,  4.73s/it]

{'epoch': 0, 'iter': 780, 'mode': 'train', 'avg_loss': 8.139920142487588, 'avg_nsp_loss': 0.721376505765048, 'avg_mlm_loss': 7.418543646491291, 'avg_acc': 50.50416133162612, 'loss': 6.654184341430664}


EP_train:0:  53%|| 791/1490 [53:18<47:33,  4.08s/it]

{'epoch': 0, 'iter': 790, 'mode': 'train', 'avg_loss': 8.137070649191642, 'avg_nsp_loss': 0.721111549740344, 'avg_mlm_loss': 7.415959109247258, 'avg_acc': 50.533343868520866, 'loss': 7.818551063537598}


EP_train:0:  54%|| 801/1490 [53:58<46:43,  4.07s/it]

{'epoch': 0, 'iter': 800, 'mode': 'train', 'avg_loss': 8.135162077295348, 'avg_nsp_loss': 0.7208203783493661, 'avg_mlm_loss': 7.41434170928936, 'avg_acc': 50.530586766541816, 'loss': 8.202932357788086}


EP_train:0:  54%|| 811/1490 [54:38<44:00,  3.89s/it]

{'epoch': 0, 'iter': 810, 'mode': 'train', 'avg_loss': 8.134034236762144, 'avg_nsp_loss': 0.7205134658866393, 'avg_mlm_loss': 7.413520782414259, 'avg_acc': 50.52404438964242, 'loss': 7.968865394592285}


EP_train:0:  55%|| 821/1490 [55:23<52:08,  4.68s/it]

{'epoch': 0, 'iter': 820, 'mode': 'train', 'avg_loss': 8.1282107031447, 'avg_nsp_loss': 0.7203844604190172, 'avg_mlm_loss': 7.407826255212893, 'avg_acc': 50.43392204628502, 'loss': 6.921666145324707}


EP_train:0:  56%|| 831/1490 [56:02<43:07,  3.93s/it]

{'epoch': 0, 'iter': 830, 'mode': 'train', 'avg_loss': 8.127613094022271, 'avg_nsp_loss': 0.7205033257119492, 'avg_mlm_loss': 7.407109778710651, 'avg_acc': 50.38733453670277, 'loss': 8.078652381896973}


EP_train:0:  56%|| 841/1490 [56:41<45:35,  4.21s/it]

{'epoch': 0, 'iter': 840, 'mode': 'train', 'avg_loss': 8.12595433599175, 'avg_nsp_loss': 0.7201816842190293, 'avg_mlm_loss': 7.4057726604901655, 'avg_acc': 50.416171224732466, 'loss': 7.545119762420654}


EP_train:0:  57%|| 851/1490 [57:19<40:14,  3.78s/it]

{'epoch': 0, 'iter': 850, 'mode': 'train', 'avg_loss': 8.122836294521596, 'avg_nsp_loss': 0.7198780728843321, 'avg_mlm_loss': 7.40295823004215, 'avg_acc': 50.411280846063455, 'loss': 7.964900493621826}


EP_train:0:  58%|| 861/1490 [57:57<40:31,  3.87s/it]

{'epoch': 0, 'iter': 860, 'mode': 'train', 'avg_loss': 8.122779671738787, 'avg_nsp_loss': 0.7195530992074849, 'avg_mlm_loss': 7.403226579938616, 'avg_acc': 50.439169570267126, 'loss': 7.920331954956055}


EP_train:0:  58%|| 871/1490 [58:37<39:48,  3.86s/it]

{'epoch': 0, 'iter': 870, 'mode': 'train', 'avg_loss': 8.11927389906961, 'avg_nsp_loss': 0.7193009396234418, 'avg_mlm_loss': 7.399972966563141, 'avg_acc': 50.41260045924225, 'loss': 7.3928542137146}


EP_train:0:  59%|| 881/1490 [59:17<42:53,  4.23s/it]

{'epoch': 0, 'iter': 880, 'mode': 'train', 'avg_loss': 8.11538768004072, 'avg_nsp_loss': 0.7191826833769358, 'avg_mlm_loss': 7.39620500471481, 'avg_acc': 50.38308740068105, 'loss': 7.6131062507629395}


EP_train:0:  60%|| 891/1490 [59:58<42:08,  4.22s/it]

{'epoch': 0, 'iter': 890, 'mode': 'train', 'avg_loss': 8.114244892407212, 'avg_nsp_loss': 0.7192992129844983, 'avg_mlm_loss': 7.394945687851655, 'avg_acc': 50.33670033670033, 'loss': 7.870118618011475}


EP_train:0:  60%|| 901/1490 [1:00:40<39:12,  3.99s/it]

{'epoch': 0, 'iter': 900, 'mode': 'train', 'avg_loss': 8.110452702783718, 'avg_nsp_loss': 0.7192136562624729, 'avg_mlm_loss': 7.391239053533556, 'avg_acc': 50.35030521642619, 'loss': 7.715045928955078}


EP_train:0:  61%|| 911/1490 [1:01:21<40:27,  4.19s/it]

{'epoch': 0, 'iter': 910, 'mode': 'train', 'avg_loss': 8.107559237600551, 'avg_nsp_loss': 0.7191097772343884, 'avg_mlm_loss': 7.388449466843506, 'avg_acc': 50.34302963776071, 'loss': 7.7542877197265625}


EP_train:0:  62%|| 921/1490 [1:02:05<43:37,  4.60s/it]

{'epoch': 0, 'iter': 920, 'mode': 'train', 'avg_loss': 8.103679739820581, 'avg_nsp_loss': 0.7189339355329996, 'avg_mlm_loss': 7.384745809594402, 'avg_acc': 50.37323561346363, 'loss': 7.534746170043945}


EP_train:0:  62%|| 931/1490 [1:02:42<33:20,  3.58s/it]

{'epoch': 0, 'iter': 930, 'mode': 'train', 'avg_loss': 8.101801613091654, 'avg_nsp_loss': 0.7188020137915934, 'avg_mlm_loss': 7.382999604549879, 'avg_acc': 50.32559076262084, 'loss': 8.105178833007812}


EP_train:0:  63%|| 941/1490 [1:03:24<36:19,  3.97s/it]

{'epoch': 0, 'iter': 940, 'mode': 'train', 'avg_loss': 8.100995554300727, 'avg_nsp_loss': 0.7186901112038575, 'avg_mlm_loss': 7.382305448797633, 'avg_acc': 50.29556323060574, 'loss': 8.128337860107422}


EP_train:0:  64%|| 951/1490 [1:04:02<32:32,  3.62s/it]

{'epoch': 0, 'iter': 950, 'mode': 'train', 'avg_loss': 8.098134970439345, 'avg_nsp_loss': 0.7185178583352724, 'avg_mlm_loss': 7.379617116930106, 'avg_acc': 50.33845951629863, 'loss': 7.808838844299316}


EP_train:0:  64%|| 961/1490 [1:04:44<36:37,  4.15s/it]

{'epoch': 0, 'iter': 960, 'mode': 'train', 'avg_loss': 8.095691473997594, 'avg_nsp_loss': 0.718274993653352, 'avg_mlm_loss': 7.3774164851820805, 'avg_acc': 50.31867845993756, 'loss': 8.377885818481445}


EP_train:0:  65%|| 971/1490 [1:05:23<35:52,  4.15s/it]

{'epoch': 0, 'iter': 970, 'mode': 'train', 'avg_loss': 8.095096504651432, 'avg_nsp_loss': 0.7180911720597044, 'avg_mlm_loss': 7.377005337441127, 'avg_acc': 50.302523171987644, 'loss': 7.894326210021973}


EP_train:0:  66%|| 981/1490 [1:06:04<39:23,  4.64s/it]

{'epoch': 0, 'iter': 980, 'mode': 'train', 'avg_loss': 8.092057092960214, 'avg_nsp_loss': 0.7178514306941407, 'avg_mlm_loss': 7.374205666093895, 'avg_acc': 50.321738022426096, 'loss': 7.179612159729004}


EP_train:0:  67%|| 991/1490 [1:06:44<34:17,  4.12s/it]

{'epoch': 0, 'iter': 990, 'mode': 'train', 'avg_loss': 8.089475496024585, 'avg_nsp_loss': 0.7176309479353526, 'avg_mlm_loss': 7.37184455199872, 'avg_acc': 50.324798183652874, 'loss': 8.446374893188477}


EP_train:0:  67%|| 1001/1490 [1:07:25<32:18,  3.96s/it]

{'epoch': 0, 'iter': 1000, 'mode': 'train', 'avg_loss': 8.085861198909276, 'avg_nsp_loss': 0.7174823329998896, 'avg_mlm_loss': 7.368378870732539, 'avg_acc': 50.2997002997003, 'loss': 7.873518466949463}


EP_train:0:  68%|| 1011/1490 [1:08:07<33:57,  4.25s/it]

{'epoch': 0, 'iter': 1010, 'mode': 'train', 'avg_loss': 8.083008324239186, 'avg_nsp_loss': 0.7172800636197174, 'avg_mlm_loss': 7.365728266043432, 'avg_acc': 50.29673590504451, 'loss': 8.320050239562988}


EP_train:0:  69%|| 1021/1490 [1:08:47<34:18,  4.39s/it]

{'epoch': 0, 'iter': 1020, 'mode': 'train', 'avg_loss': 8.080742443226693, 'avg_nsp_loss': 0.7170533957607482, 'avg_mlm_loss': 7.363689052720028, 'avg_acc': 50.303011753183156, 'loss': 7.4965901374816895}


EP_train:0:  69%|| 1031/1490 [1:09:27<30:25,  3.98s/it]

{'epoch': 0, 'iter': 1030, 'mode': 'train', 'avg_loss': 8.07895348953114, 'avg_nsp_loss': 0.7168424545735785, 'avg_mlm_loss': 7.362111040276306, 'avg_acc': 50.333414161008726, 'loss': 8.275266647338867}


EP_train:0:  70%|| 1041/1490 [1:10:07<30:23,  4.06s/it]

{'epoch': 0, 'iter': 1040, 'mode': 'train', 'avg_loss': 8.077324582337187, 'avg_nsp_loss': 0.7167260165287828, 'avg_mlm_loss': 7.3605985710188016, 'avg_acc': 50.324207492795395, 'loss': 8.364435195922852}


EP_train:0:  71%|| 1051/1490 [1:10:49<32:52,  4.49s/it]

{'epoch': 0, 'iter': 1050, 'mode': 'train', 'avg_loss': 8.075459616848903, 'avg_nsp_loss': 0.7165941623820224, 'avg_mlm_loss': 7.358865457983044, 'avg_acc': 50.30328258801142, 'loss': 7.722625255584717}


EP_train:0:  71%|| 1061/1490 [1:11:32<31:58,  4.47s/it]

{'epoch': 0, 'iter': 1060, 'mode': 'train', 'avg_loss': 8.073152175835007, 'avg_nsp_loss': 0.7164252991950478, 'avg_mlm_loss': 7.3567268801229835, 'avg_acc': 50.30926013195099, 'loss': 8.16182804107666}


EP_train:0:  72%|| 1071/1490 [1:12:08<26:33,  3.80s/it]

{'epoch': 0, 'iter': 1070, 'mode': 'train', 'avg_loss': 8.07282386952533, 'avg_nsp_loss': 0.7162361019243156, 'avg_mlm_loss': 7.356587771496741, 'avg_acc': 50.309290382819796, 'loss': 8.058789253234863}


EP_train:0:  73%|| 1081/1490 [1:12:50<28:53,  4.24s/it]

{'epoch': 0, 'iter': 1080, 'mode': 'train', 'avg_loss': 8.069414548141664, 'avg_nsp_loss': 0.7160332938458057, 'avg_mlm_loss': 7.353381257934994, 'avg_acc': 50.306429232192414, 'loss': 6.855548858642578}


EP_train:0:  73%|| 1091/1490 [1:13:29<26:51,  4.04s/it]

{'epoch': 0, 'iter': 1090, 'mode': 'train', 'avg_loss': 8.06771952028563, 'avg_nsp_loss': 0.7158729839390511, 'avg_mlm_loss': 7.351846539351396, 'avg_acc': 50.303620531622364, 'loss': 8.002336502075195}


EP_train:0:  74%|| 1101/1490 [1:14:11<26:50,  4.14s/it]

{'epoch': 0, 'iter': 1100, 'mode': 'train', 'avg_loss': 8.064415552310788, 'avg_nsp_loss': 0.7156832308037296, 'avg_mlm_loss': 7.348732323780805, 'avg_acc': 50.300862851952765, 'loss': 7.457376956939697}


EP_train:0:  75%|| 1111/1490 [1:14:49<24:46,  3.92s/it]

{'epoch': 0, 'iter': 1110, 'mode': 'train', 'avg_loss': 8.062682004150885, 'avg_nsp_loss': 0.7155150339798708, 'avg_mlm_loss': 7.347166972585244, 'avg_acc': 50.28971647164716, 'loss': 7.846954822540283}


EP_train:0:  75%|| 1121/1490 [1:15:30<26:14,  4.27s/it]

{'epoch': 0, 'iter': 1120, 'mode': 'train', 'avg_loss': 8.060790620033067, 'avg_nsp_loss': 0.715381791451273, 'avg_mlm_loss': 7.345408830974487, 'avg_acc': 50.292707404103474, 'loss': 7.982665061950684}


EP_train:0:  76%|| 1131/1490 [1:16:09<23:59,  4.01s/it]

{'epoch': 0, 'iter': 1130, 'mode': 'train', 'avg_loss': 8.059502327157585, 'avg_nsp_loss': 0.715196645122835, 'avg_mlm_loss': 7.3443056843535865, 'avg_acc': 50.28183023872679, 'loss': 7.872183799743652}


EP_train:0:  77%|| 1141/1490 [1:16:52<24:49,  4.27s/it]

{'epoch': 0, 'iter': 1140, 'mode': 'train', 'avg_loss': 8.053478214638364, 'avg_nsp_loss': 0.7150799813341614, 'avg_mlm_loss': 7.338398235916151, 'avg_acc': 50.28757668711656, 'loss': 7.920358657836914}


EP_train:0:  77%|| 1151/1490 [1:17:32<22:50,  4.04s/it]

{'epoch': 0, 'iter': 1150, 'mode': 'train', 'avg_loss': 8.053838434476214, 'avg_nsp_loss': 0.7148852215757793, 'avg_mlm_loss': 7.338953215800401, 'avg_acc': 50.285078192875766, 'loss': 8.230039596557617}


EP_train:0:  78%|| 1161/1490 [1:18:12<22:03,  4.02s/it]

{'epoch': 0, 'iter': 1160, 'mode': 'train', 'avg_loss': 8.051883307332934, 'avg_nsp_loss': 0.7147281556474454, 'avg_mlm_loss': 7.3371551548171725, 'avg_acc': 50.30684754521963, 'loss': 7.727385997772217}


EP_train:0:  79%|| 1171/1490 [1:18:51<21:59,  4.14s/it]

{'epoch': 0, 'iter': 1170, 'mode': 'train', 'avg_loss': 8.048180620452463, 'avg_nsp_loss': 0.714676168258123, 'avg_mlm_loss': 7.33350445529928, 'avg_acc': 50.31757045260461, 'loss': 7.892880439758301}


EP_train:0:  79%|| 1181/1490 [1:19:31<19:14,  3.74s/it]

{'epoch': 0, 'iter': 1180, 'mode': 'train', 'avg_loss': 8.046950496928758, 'avg_nsp_loss': 0.7144791872495317, 'avg_mlm_loss': 7.332471313767267, 'avg_acc': 50.35192633361558, 'loss': 8.413911819458008}


EP_train:0:  80%|| 1191/1490 [1:20:13<21:05,  4.23s/it]

{'epoch': 0, 'iter': 1190, 'mode': 'train', 'avg_loss': 8.044784206787545, 'avg_nsp_loss': 0.7143613733632739, 'avg_mlm_loss': 7.330422837427942, 'avg_acc': 50.33585222502099, 'loss': 7.858823776245117}


EP_train:0:  81%|| 1201/1490 [1:20:52<18:52,  3.92s/it]

{'epoch': 0, 'iter': 1200, 'mode': 'train', 'avg_loss': 8.042398665171678, 'avg_nsp_loss': 0.7142094509686955, 'avg_mlm_loss': 7.32818921857035, 'avg_acc': 50.320045795170685, 'loss': 8.430456161499023}


EP_train:0:  81%|| 1211/1490 [1:21:31<18:06,  3.89s/it]

{'epoch': 0, 'iter': 1210, 'mode': 'train', 'avg_loss': 8.040136945631561, 'avg_nsp_loss': 0.7140383058889753, 'avg_mlm_loss': 7.326098643680135, 'avg_acc': 50.34320809248555, 'loss': 7.6659955978393555}


EP_train:0:  82%|| 1221/1490 [1:22:10<17:55,  4.00s/it]

{'epoch': 0, 'iter': 1220, 'mode': 'train', 'avg_loss': 8.038670480495394, 'avg_nsp_loss': 0.7138758425728207, 'avg_mlm_loss': 7.324794641193262, 'avg_acc': 50.345515970515976, 'loss': 7.783074855804443}


EP_train:0:  83%|| 1231/1490 [1:22:51<16:55,  3.92s/it]

{'epoch': 0, 'iter': 1230, 'mode': 'train', 'avg_loss': 8.03598376621956, 'avg_nsp_loss': 0.7138037067617855, 'avg_mlm_loss': 7.3221800631376714, 'avg_acc': 50.33763200649878, 'loss': 8.040650367736816}


EP_train:0:  83%|| 1241/1490 [1:23:31<16:50,  4.06s/it]

{'epoch': 0, 'iter': 1240, 'mode': 'train', 'avg_loss': 8.033093857438597, 'avg_nsp_loss': 0.7136467065569089, 'avg_mlm_loss': 7.319447154676021, 'avg_acc': 50.35757453666398, 'loss': 7.900267601013184}


EP_train:0:  84%|| 1251/1490 [1:24:16<17:56,  4.51s/it]

{'epoch': 0, 'iter': 1250, 'mode': 'train', 'avg_loss': 8.030408154479224, 'avg_nsp_loss': 0.7135568665181228, 'avg_mlm_loss': 7.316851291534522, 'avg_acc': 50.32723820943246, 'loss': 7.531938076019287}


EP_train:0:  85%|| 1261/1490 [1:24:58<16:13,  4.25s/it]

{'epoch': 0, 'iter': 1260, 'mode': 'train', 'avg_loss': 8.026293701259604, 'avg_nsp_loss': 0.7134090075882346, 'avg_mlm_loss': 7.312884697547326, 'avg_acc': 50.33455590800951, 'loss': 7.733214378356934}


EP_train:0:  85%|| 1271/1490 [1:25:35<13:31,  3.71s/it]

{'epoch': 0, 'iter': 1270, 'mode': 'train', 'avg_loss': 8.025129296477623, 'avg_nsp_loss': 0.7132441491990649, 'avg_mlm_loss': 7.311885152061937, 'avg_acc': 50.35405192761605, 'loss': 7.696977615356445}


EP_train:0:  86%|| 1281/1490 [1:26:18<14:12,  4.08s/it]

{'epoch': 0, 'iter': 1280, 'mode': 'train', 'avg_loss': 8.022081781233966, 'avg_nsp_loss': 0.7131010544756071, 'avg_mlm_loss': 7.308980731830106, 'avg_acc': 50.31469555035129, 'loss': 7.68187952041626}


EP_train:0:  87%|| 1291/1490 [1:26:58<13:51,  4.18s/it]

{'epoch': 0, 'iter': 1290, 'mode': 'train', 'avg_loss': 8.018245082784308, 'avg_nsp_loss': 0.7129339177787627, 'avg_mlm_loss': 7.305311170130345, 'avg_acc': 50.31951975213013, 'loss': 7.441717624664307}


EP_train:0:  87%|| 1301/1490 [1:27:37<12:06,  3.85s/it]

{'epoch': 0, 'iter': 1300, 'mode': 'train', 'avg_loss': 8.015371519817378, 'avg_nsp_loss': 0.7128292740903205, 'avg_mlm_loss': 7.302542250675023, 'avg_acc': 50.341083781706374, 'loss': 7.740097999572754}


EP_train:0:  88%|| 1311/1490 [1:28:17<11:37,  3.90s/it]

{'epoch': 0, 'iter': 1310, 'mode': 'train', 'avg_loss': 8.011661388053865, 'avg_nsp_loss': 0.7126914173660162, 'avg_mlm_loss': 7.298969975143421, 'avg_acc': 50.345633104500386, 'loss': 8.014546394348145}


EP_train:0:  89%|| 1321/1490 [1:28:57<11:26,  4.06s/it]

{'epoch': 0, 'iter': 1320, 'mode': 'train', 'avg_loss': 8.00911762712581, 'avg_nsp_loss': 0.7125611825931443, 'avg_mlm_loss': 7.296556448593544, 'avg_acc': 50.331188493565485, 'loss': 7.968826770782471}


EP_train:0:  89%|| 1331/1490 [1:29:37<10:34,  3.99s/it]

{'epoch': 0, 'iter': 1330, 'mode': 'train', 'avg_loss': 8.006137367480685, 'avg_nsp_loss': 0.7124411642864895, 'avg_mlm_loss': 7.29369620717978, 'avg_acc': 50.321656649135996, 'loss': 7.832253932952881}


EP_train:0:  90%|| 1341/1490 [1:30:15<08:57,  3.61s/it]

{'epoch': 0, 'iter': 1340, 'mode': 'train', 'avg_loss': 8.00326764130219, 'avg_nsp_loss': 0.712414647819923, 'avg_mlm_loss': 7.2908529979270575, 'avg_acc': 50.270320656226694, 'loss': 8.028608322143555}


EP_train:0:  91%|| 1351/1490 [1:30:54<08:55,  3.85s/it]

{'epoch': 0, 'iter': 1350, 'mode': 'train', 'avg_loss': 8.001629816347542, 'avg_nsp_loss': 0.7123331888175558, 'avg_mlm_loss': 7.289296631633044, 'avg_acc': 50.25675425610658, 'loss': 7.090246200561523}


EP_train:0:  91%|| 1361/1490 [1:31:35<10:14,  4.76s/it]

{'epoch': 0, 'iter': 1360, 'mode': 'train', 'avg_loss': 7.9979075541135405, 'avg_nsp_loss': 0.712399627862828, 'avg_mlm_loss': 7.28550793019224, 'avg_acc': 50.19287288758266, 'loss': 6.011780738830566}


EP_train:0:  92%|| 1371/1490 [1:32:17<07:47,  3.93s/it]

{'epoch': 0, 'iter': 1370, 'mode': 'train', 'avg_loss': 7.995744158802547, 'avg_nsp_loss': 0.7122839803594816, 'avg_mlm_loss': 7.283460183007973, 'avg_acc': 50.16639314369073, 'loss': 7.901341438293457}


EP_train:0:  93%|| 1381/1490 [1:32:58<07:00,  3.86s/it]

{'epoch': 0, 'iter': 1380, 'mode': 'train', 'avg_loss': 7.994323756019764, 'avg_nsp_loss': 0.7121480853552408, 'avg_mlm_loss': 7.28217567558482, 'avg_acc': 50.15387400434468, 'loss': 8.137290000915527}


EP_train:0:  93%|| 1391/1490 [1:33:39<06:26,  3.90s/it]

{'epoch': 0, 'iter': 1390, 'mode': 'train', 'avg_loss': 7.992812164561685, 'avg_nsp_loss': 0.7120515056688266, 'avg_mlm_loss': 7.2807606639491835, 'avg_acc': 50.116822429906534, 'loss': 7.9925618171691895}


EP_train:0:  94%|| 1401/1490 [1:34:21<05:52,  3.97s/it]

{'epoch': 0, 'iter': 1400, 'mode': 'train', 'avg_loss': 7.990674335389882, 'avg_nsp_loss': 0.7119175553492015, 'avg_mlm_loss': 7.278756784933282, 'avg_acc': 50.13160242683797, 'loss': 8.037071228027344}


EP_train:0:  95%|| 1411/1490 [1:35:02<05:22,  4.08s/it]

{'epoch': 0, 'iter': 1410, 'mode': 'train', 'avg_loss': 7.9892737049316365, 'avg_nsp_loss': 0.711759643044394, 'avg_mlm_loss': 7.277514065689098, 'avg_acc': 50.16167611622963, 'loss': 7.681088924407959}


EP_train:0:  95%|| 1421/1490 [1:35:43<04:51,  4.23s/it]

{'epoch': 0, 'iter': 1420, 'mode': 'train', 'avg_loss': 7.988546100323843, 'avg_nsp_loss': 0.7116515944324523, 'avg_mlm_loss': 7.276894509330926, 'avg_acc': 50.160538353272344, 'loss': 8.243896484375}


EP_train:0:  96%|| 1431/1490 [1:36:24<04:07,  4.20s/it]

{'epoch': 0, 'iter': 1430, 'mode': 'train', 'avg_loss': 7.98718613332006, 'avg_nsp_loss': 0.7115103386233188, 'avg_mlm_loss': 7.275675797945632, 'avg_acc': 50.18780573025856, 'loss': 8.169189453125}


EP_train:0:  97%|| 1441/1490 [1:37:06<03:18,  4.04s/it]

{'epoch': 0, 'iter': 1440, 'mode': 'train', 'avg_loss': 7.985202838280895, 'avg_nsp_loss': 0.7114425193179738, 'avg_mlm_loss': 7.273760322396082, 'avg_acc': 50.1734906315059, 'loss': 7.828579425811768}


EP_train:0:  97%|| 1451/1490 [1:37:50<02:43,  4.20s/it]

{'epoch': 0, 'iter': 1450, 'mode': 'train', 'avg_loss': 7.983973479780306, 'avg_nsp_loss': 0.7113343119867747, 'avg_mlm_loss': 7.272639171408424, 'avg_acc': 50.18737077877326, 'loss': 7.908060550689697}


EP_train:0:  98%|| 1461/1490 [1:38:28<01:49,  3.78s/it]

{'epoch': 0, 'iter': 1460, 'mode': 'train', 'avg_loss': 7.982829073697716, 'avg_nsp_loss': 0.7112666091063837, 'avg_mlm_loss': 7.271562468589454, 'avg_acc': 50.154004106776185, 'loss': 7.779096603393555}


EP_train:0:  99%|| 1471/1490 [1:39:12<01:21,  4.29s/it]

{'epoch': 0, 'iter': 1470, 'mode': 'train', 'avg_loss': 7.980262965754049, 'avg_nsp_loss': 0.7111244164787126, 'avg_mlm_loss': 7.269138553367838, 'avg_acc': 50.165703602991165, 'loss': 7.9757561683654785}


EP_train:0:  99%|| 1481/1490 [1:39:51<00:35,  3.90s/it]

{'epoch': 0, 'iter': 1480, 'mode': 'train', 'avg_loss': 7.978214273897077, 'avg_nsp_loss': 0.7111185853502221, 'avg_mlm_loss': 7.267095692973939, 'avg_acc': 50.162474679270765, 'loss': 7.799394130706787}


EP_train:0: 100%|| 1490/1490 [1:40:28<00:00,  4.05s/it]


EP0, train: avg_loss=7.9767, nsp_loss=0.7111, mlm_loss=7.2656, total_acc=50.12
Saved model after epoch 0 to /content/drive/MyDrive/email_bert/checkpoints/bert_email_epoch_0.pth


EP_train:1:   0%|| 1/1490 [00:04<1:54:12,  4.60s/it]

{'epoch': 1, 'iter': 0, 'mode': 'train', 'avg_loss': 7.2799506187438965, 'avg_nsp_loss': 0.6962209939956665, 'avg_mlm_loss': 6.5837297439575195, 'avg_acc': 46.875, 'loss': 7.2799506187438965}


EP_train:1:   1%|| 11/1490 [00:44<1:36:21,  3.91s/it]

{'epoch': 1, 'iter': 10, 'mode': 'train', 'avg_loss': 7.641994953155518, 'avg_nsp_loss': 0.6916472749276594, 'avg_mlm_loss': 6.95034764029763, 'avg_acc': 53.40909090909091, 'loss': 7.6151347160339355}


EP_train:1:   1%|| 21/1490 [01:26<1:38:00,  4.00s/it]

{'epoch': 1, 'iter': 20, 'mode': 'train', 'avg_loss': 7.684260413760231, 'avg_nsp_loss': 0.6937200682503837, 'avg_mlm_loss': 6.990540322803316, 'avg_acc': 51.33928571428571, 'loss': 7.549717903137207}


EP_train:1:   2%|| 31/1490 [02:08<1:49:13,  4.49s/it]

{'epoch': 1, 'iter': 30, 'mode': 'train', 'avg_loss': 7.6790859314703175, 'avg_nsp_loss': 0.6943179003653988, 'avg_mlm_loss': 6.984768006109422, 'avg_acc': 52.11693548387096, 'loss': 7.226505279541016}


EP_train:1:   3%|| 41/1490 [02:49<1:48:34,  4.50s/it]

{'epoch': 1, 'iter': 40, 'mode': 'train', 'avg_loss': 7.695189976110691, 'avg_nsp_loss': 0.6971368193626404, 'avg_mlm_loss': 6.998053155294278, 'avg_acc': 51.829268292682926, 'loss': 7.837166786193848}


EP_train:1:   3%|| 51/1490 [03:32<1:41:14,  4.22s/it]

{'epoch': 1, 'iter': 50, 'mode': 'train', 'avg_loss': 7.679361034842098, 'avg_nsp_loss': 0.6970714248862921, 'avg_mlm_loss': 6.982289604112213, 'avg_acc': 51.71568627450981, 'loss': 8.017770767211914}


EP_train:1:   4%|| 61/1490 [04:12<1:37:18,  4.09s/it]

{'epoch': 1, 'iter': 60, 'mode': 'train', 'avg_loss': 7.689735498584684, 'avg_nsp_loss': 0.6972145858358164, 'avg_mlm_loss': 6.992520895160612, 'avg_acc': 51.127049180327866, 'loss': 7.815644264221191}


EP_train:1:   5%|| 71/1490 [04:54<1:39:37,  4.21s/it]

{'epoch': 1, 'iter': 70, 'mode': 'train', 'avg_loss': 7.677981269191688, 'avg_nsp_loss': 0.6974588321967864, 'avg_mlm_loss': 6.9805224176863545, 'avg_acc': 50.39612676056338, 'loss': 7.215106010437012}


EP_train:1:   5%|| 81/1490 [05:34<1:33:09,  3.97s/it]

{'epoch': 1, 'iter': 80, 'mode': 'train', 'avg_loss': 7.697216911080443, 'avg_nsp_loss': 0.6971254341396285, 'avg_mlm_loss': 7.000091452657441, 'avg_acc': 50.308641975308646, 'loss': 7.8094024658203125}


EP_train:1:   6%|| 91/1490 [06:15<1:37:40,  4.19s/it]

{'epoch': 1, 'iter': 90, 'mode': 'train', 'avg_loss': 7.708625631017999, 'avg_nsp_loss': 0.6967564650944301, 'avg_mlm_loss': 7.0118691475836785, 'avg_acc': 50.1717032967033, 'loss': 7.843398571014404}


EP_train:1:   7%|| 101/1490 [06:55<1:32:56,  4.02s/it]

{'epoch': 1, 'iter': 100, 'mode': 'train', 'avg_loss': 7.710672945079237, 'avg_nsp_loss': 0.6966782519132784, 'avg_mlm_loss': 7.013994684313784, 'avg_acc': 49.87623762376238, 'loss': 8.012069702148438}


EP_train:1:   7%|| 111/1490 [07:37<1:40:53,  4.39s/it]

{'epoch': 1, 'iter': 110, 'mode': 'train', 'avg_loss': 7.715183631793873, 'avg_nsp_loss': 0.6968263178258329, 'avg_mlm_loss': 7.018357311283146, 'avg_acc': 49.83108108108108, 'loss': 8.45879077911377}


EP_train:1:   8%|| 121/1490 [08:17<1:30:03,  3.95s/it]

{'epoch': 1, 'iter': 120, 'mode': 'train', 'avg_loss': 7.70875390502047, 'avg_nsp_loss': 0.6965624488089696, 'avg_mlm_loss': 7.012191453255897, 'avg_acc': 49.56095041322314, 'loss': 7.603359222412109}


EP_train:1:   9%|| 131/1490 [08:57<1:27:01,  3.84s/it]

{'epoch': 1, 'iter': 130, 'mode': 'train', 'avg_loss': 7.706598718657748, 'avg_nsp_loss': 0.6959579482333351, 'avg_mlm_loss': 7.010640766784435, 'avg_acc': 50.0, 'loss': 7.548756122589111}


EP_train:1:   9%|| 141/1490 [09:39<1:34:29,  4.20s/it]

{'epoch': 1, 'iter': 140, 'mode': 'train', 'avg_loss': 7.694792791461269, 'avg_nsp_loss': 0.6958487570708525, 'avg_mlm_loss': 6.998944032276776, 'avg_acc': 50.33244680851063, 'loss': 7.8466691970825195}


EP_train:1:  10%|| 151/1490 [10:22<1:35:45,  4.29s/it]

{'epoch': 1, 'iter': 150, 'mode': 'train', 'avg_loss': 7.682783196304018, 'avg_nsp_loss': 0.6956925253994417, 'avg_mlm_loss': 6.9870906728782405, 'avg_acc': 50.70364238410596, 'loss': 7.86317253112793}


EP_train:1:  11%|| 161/1490 [11:03<1:34:16,  4.26s/it]

{'epoch': 1, 'iter': 160, 'mode': 'train', 'avg_loss': 7.682479197934548, 'avg_nsp_loss': 0.6957416030931176, 'avg_mlm_loss': 6.986737600764873, 'avg_acc': 50.77639751552795, 'loss': 8.09437084197998}


EP_train:1:  11%|| 171/1490 [11:43<1:29:41,  4.08s/it]

{'epoch': 1, 'iter': 170, 'mode': 'train', 'avg_loss': 7.693061792362503, 'avg_nsp_loss': 0.6954103579298098, 'avg_mlm_loss': 6.99765143756978, 'avg_acc': 50.93201754385965, 'loss': 8.356456756591797}


EP_train:1:  12%|| 181/1490 [12:22<1:28:42,  4.07s/it]

{'epoch': 1, 'iter': 180, 'mode': 'train', 'avg_loss': 7.702050061515681, 'avg_nsp_loss': 0.6953384313135516, 'avg_mlm_loss': 7.006711633165897, 'avg_acc': 50.82872928176796, 'loss': 7.276739120483398}


EP_train:1:  13%|| 191/1490 [13:05<1:34:46,  4.38s/it]

{'epoch': 1, 'iter': 190, 'mode': 'train', 'avg_loss': 7.695373410329768, 'avg_nsp_loss': 0.6957103878415692, 'avg_mlm_loss': 6.999663020927868, 'avg_acc': 50.572643979057595, 'loss': 7.51889181137085}


EP_train:1:  13%|| 201/1490 [13:45<1:26:39,  4.03s/it]

{'epoch': 1, 'iter': 200, 'mode': 'train', 'avg_loss': 7.698170436555474, 'avg_nsp_loss': 0.6957031461729932, 'avg_mlm_loss': 7.002467288306696, 'avg_acc': 50.66853233830846, 'loss': 7.932013988494873}


EP_train:1:  14%|| 211/1490 [14:26<1:21:17,  3.81s/it]

{'epoch': 1, 'iter': 210, 'mode': 'train', 'avg_loss': 7.697659020175301, 'avg_nsp_loss': 0.6956555179510071, 'avg_mlm_loss': 7.002003500246889, 'avg_acc': 50.72571090047393, 'loss': 8.377336502075195}


EP_train:1:  15%|| 221/1490 [15:06<1:21:13,  3.84s/it]

{'epoch': 1, 'iter': 220, 'mode': 'train', 'avg_loss': 7.704776116625756, 'avg_nsp_loss': 0.6955739969042092, 'avg_mlm_loss': 7.009202115675983, 'avg_acc': 50.664592760180994, 'loss': 8.151809692382812}


EP_train:1:  16%|| 231/1490 [15:49<1:22:06,  3.91s/it]

{'epoch': 1, 'iter': 230, 'mode': 'train', 'avg_loss': 7.697232397087725, 'avg_nsp_loss': 0.6954177249045599, 'avg_mlm_loss': 7.00181466676456, 'avg_acc': 50.89285714285714, 'loss': 8.101005554199219}


EP_train:1:  16%|| 241/1490 [16:30<1:27:13,  4.19s/it]

{'epoch': 1, 'iter': 240, 'mode': 'train', 'avg_loss': 7.70101157659317, 'avg_nsp_loss': 0.6954148876716487, 'avg_mlm_loss': 7.005596685211688, 'avg_acc': 50.8558091286307, 'loss': 7.9555511474609375}


EP_train:1:  17%|| 251/1490 [17:09<1:22:03,  3.97s/it]

{'epoch': 1, 'iter': 250, 'mode': 'train', 'avg_loss': 7.7062247986812515, 'avg_nsp_loss': 0.6952285873462479, 'avg_mlm_loss': 7.0109962087228475, 'avg_acc': 50.87151394422311, 'loss': 7.978593349456787}


EP_train:1:  18%|| 261/1490 [17:52<1:25:11,  4.16s/it]

{'epoch': 1, 'iter': 260, 'mode': 'train', 'avg_loss': 7.699912480467581, 'avg_nsp_loss': 0.6947264029605179, 'avg_mlm_loss': 7.005186077278693, 'avg_acc': 51.11350574712644, 'loss': 7.938185214996338}


EP_train:1:  18%|| 271/1490 [18:34<1:34:19,  4.64s/it]

{'epoch': 1, 'iter': 270, 'mode': 'train', 'avg_loss': 7.694711674623384, 'avg_nsp_loss': 0.6950161879352977, 'avg_mlm_loss': 6.999695485808313, 'avg_acc': 51.107011070110694, 'loss': 5.713407039642334}


EP_train:1:  19%|| 281/1490 [19:14<1:19:22,  3.94s/it]

{'epoch': 1, 'iter': 280, 'mode': 'train', 'avg_loss': 7.698144544486049, 'avg_nsp_loss': 0.695065549896281, 'avg_mlm_loss': 7.003078995226117, 'avg_acc': 51.07873665480427, 'loss': 7.653027057647705}


EP_train:1:  20%|| 291/1490 [19:55<1:20:00,  4.00s/it]

{'epoch': 1, 'iter': 290, 'mode': 'train', 'avg_loss': 7.704657498913532, 'avg_nsp_loss': 0.6951993372022491, 'avg_mlm_loss': 7.009458161711283, 'avg_acc': 51.02018900343642, 'loss': 8.074572563171387}


EP_train:1:  20%|| 301/1490 [20:34<1:17:55,  3.93s/it]

{'epoch': 1, 'iter': 300, 'mode': 'train', 'avg_loss': 7.708389329751861, 'avg_nsp_loss': 0.6952246688925151, 'avg_mlm_loss': 7.013164662839566, 'avg_acc': 50.92400332225914, 'loss': 7.832531929016113}


EP_train:1:  21%|| 311/1490 [21:18<1:25:10,  4.33s/it]

{'epoch': 1, 'iter': 310, 'mode': 'train', 'avg_loss': 7.703882864433853, 'avg_nsp_loss': 0.6951241845869941, 'avg_mlm_loss': 7.008758679846859, 'avg_acc': 51.01487138263665, 'loss': 7.749703884124756}


EP_train:1:  22%|| 321/1490 [22:04<1:22:06,  4.21s/it]

{'epoch': 1, 'iter': 320, 'mode': 'train', 'avg_loss': 7.7004494964148025, 'avg_nsp_loss': 0.6950859995645897, 'avg_mlm_loss': 7.005363500006845, 'avg_acc': 51.041666666666664, 'loss': 7.603320121765137}


EP_train:1:  22%|| 331/1490 [22:43<1:20:33,  4.17s/it]

{'epoch': 1, 'iter': 330, 'mode': 'train', 'avg_loss': 7.703929934256747, 'avg_nsp_loss': 0.6951283437968021, 'avg_mlm_loss': 7.0088015942415085, 'avg_acc': 50.84025679758308, 'loss': 7.576634407043457}


EP_train:1:  23%|| 341/1490 [23:23<1:15:23,  3.94s/it]

{'epoch': 1, 'iter': 340, 'mode': 'train', 'avg_loss': 7.705845947489361, 'avg_nsp_loss': 0.6951253200905764, 'avg_mlm_loss': 7.01072063054507, 'avg_acc': 50.86143695014663, 'loss': 7.76578950881958}


EP_train:1:  24%|| 351/1490 [24:02<1:17:56,  4.11s/it]

{'epoch': 1, 'iter': 350, 'mode': 'train', 'avg_loss': 7.706769558778856, 'avg_nsp_loss': 0.6950743653495767, 'avg_mlm_loss': 7.0116951961463, 'avg_acc': 50.89031339031339, 'loss': 7.320319652557373}


EP_train:1:  24%|| 361/1490 [24:41<1:16:40,  4.07s/it]

{'epoch': 1, 'iter': 360, 'mode': 'train', 'avg_loss': 7.705439819853722, 'avg_nsp_loss': 0.6949950081819973, 'avg_mlm_loss': 7.010444812827493, 'avg_acc': 50.93490304709142, 'loss': 7.1209259033203125}


EP_train:1:  25%|| 371/1490 [25:22<1:19:29,  4.26s/it]

{'epoch': 1, 'iter': 370, 'mode': 'train', 'avg_loss': 7.704559704042831, 'avg_nsp_loss': 0.6949559891962941, 'avg_mlm_loss': 7.009603716292471, 'avg_acc': 50.80862533692723, 'loss': 7.763815402984619}


EP_train:1:  26%|| 381/1490 [26:04<1:16:37,  4.15s/it]

{'epoch': 1, 'iter': 380, 'mode': 'train', 'avg_loss': 7.700941034502245, 'avg_nsp_loss': 0.6948007690937813, 'avg_mlm_loss': 7.006140268380873, 'avg_acc': 50.95964566929134, 'loss': 7.56396484375}


EP_train:1:  26%|| 391/1490 [26:47<1:19:27,  4.34s/it]

{'epoch': 1, 'iter': 390, 'mode': 'train', 'avg_loss': 7.701434197633163, 'avg_nsp_loss': 0.695019895615785, 'avg_mlm_loss': 7.0064143061332995, 'avg_acc': 50.87116368286445, 'loss': 7.786346435546875}


EP_train:1:  27%|| 401/1490 [27:27<1:12:31,  4.00s/it]

{'epoch': 1, 'iter': 400, 'mode': 'train', 'avg_loss': 7.703444580782084, 'avg_nsp_loss': 0.6949658405751065, 'avg_mlm_loss': 7.008478743774338, 'avg_acc': 50.89619700748129, 'loss': 7.807486534118652}


EP_train:1:  28%|| 411/1490 [28:07<1:13:06,  4.07s/it]

{'epoch': 1, 'iter': 410, 'mode': 'train', 'avg_loss': 7.706623076232391, 'avg_nsp_loss': 0.6950806005157694, 'avg_mlm_loss': 7.0115424791971845, 'avg_acc': 50.790754257907544, 'loss': 7.258141994476318}


EP_train:1:  28%|| 421/1490 [28:46<1:08:20,  3.84s/it]

{'epoch': 1, 'iter': 420, 'mode': 'train', 'avg_loss': 7.709612832782954, 'avg_nsp_loss': 0.69527987374829, 'avg_mlm_loss': 7.01433296271571, 'avg_acc': 50.786817102137775, 'loss': 7.989120006561279}


EP_train:1:  29%|| 431/1490 [29:26<1:09:40,  3.95s/it]

{'epoch': 1, 'iter': 430, 'mode': 'train', 'avg_loss': 7.707602602699793, 'avg_nsp_loss': 0.6952583023677569, 'avg_mlm_loss': 7.0123443017149745, 'avg_acc': 50.790313225058, 'loss': 8.053263664245605}


EP_train:1:  30%|| 441/1490 [30:06<1:04:43,  3.70s/it]

{'epoch': 1, 'iter': 440, 'mode': 'train', 'avg_loss': 7.708995917486766, 'avg_nsp_loss': 0.6952338530903771, 'avg_mlm_loss': 7.013762065342495, 'avg_acc': 50.69444444444444, 'loss': 8.217060089111328}


EP_train:1:  30%|| 451/1490 [30:48<1:14:45,  4.32s/it]

{'epoch': 1, 'iter': 450, 'mode': 'train', 'avg_loss': 7.705923674640529, 'avg_nsp_loss': 0.6952005819047898, 'avg_mlm_loss': 7.010723091810611, 'avg_acc': 50.658259423503324, 'loss': 7.119131565093994}


EP_train:1:  31%|| 461/1490 [31:27<1:09:24,  4.05s/it]

{'epoch': 1, 'iter': 460, 'mode': 'train', 'avg_loss': 7.704170445299459, 'avg_nsp_loss': 0.6952104331872986, 'avg_mlm_loss': 7.0089600112071, 'avg_acc': 50.63042299349241, 'loss': 7.514606952667236}


EP_train:1:  32%|| 471/1490 [32:09<1:08:54,  4.06s/it]

{'epoch': 1, 'iter': 470, 'mode': 'train', 'avg_loss': 7.703164448910205, 'avg_nsp_loss': 0.6951890826984576, 'avg_mlm_loss': 7.007975365705551, 'avg_acc': 50.62367303609342, 'loss': 7.928142547607422}


EP_train:1:  32%|| 481/1490 [32:48<1:03:20,  3.77s/it]

{'epoch': 1, 'iter': 480, 'mode': 'train', 'avg_loss': 7.703787187033036, 'avg_nsp_loss': 0.6952123152490961, 'avg_mlm_loss': 7.008574871412186, 'avg_acc': 50.60420997920998, 'loss': 7.272630214691162}


EP_train:1:  33%|| 491/1490 [33:30<1:10:21,  4.23s/it]

{'epoch': 1, 'iter': 490, 'mode': 'train', 'avg_loss': 7.6979410255028125, 'avg_nsp_loss': 0.6952335163192205, 'avg_mlm_loss': 7.00270750857662, 'avg_acc': 50.61099796334012, 'loss': 7.598598957061768}


EP_train:1:  34%|| 501/1490 [34:10<1:01:50,  3.75s/it]

{'epoch': 1, 'iter': 500, 'mode': 'train', 'avg_loss': 7.69647900929708, 'avg_nsp_loss': 0.6952879952337452, 'avg_mlm_loss': 7.001191013587449, 'avg_acc': 50.561377245508986, 'loss': 7.806161403656006}


EP_train:1:  34%|| 511/1490 [34:51<1:06:35,  4.08s/it]

{'epoch': 1, 'iter': 510, 'mode': 'train', 'avg_loss': 7.693350299230527, 'avg_nsp_loss': 0.695292332051085, 'avg_mlm_loss': 6.998057967296085, 'avg_acc': 50.580968688845395, 'loss': 7.528145790100098}


EP_train:1:  35%|| 521/1490 [35:29<1:06:20,  4.11s/it]

{'epoch': 1, 'iter': 520, 'mode': 'train', 'avg_loss': 7.695034689729365, 'avg_nsp_loss': 0.6952666833624959, 'avg_mlm_loss': 6.999768005451634, 'avg_acc': 50.61180422264875, 'loss': 8.116707801818848}


EP_train:1:  36%|| 531/1490 [36:09<1:01:59,  3.88s/it]

{'epoch': 1, 'iter': 530, 'mode': 'train', 'avg_loss': 7.693136022140302, 'avg_nsp_loss': 0.695245918693291, 'avg_mlm_loss': 6.997890101538764, 'avg_acc': 50.606167608286256, 'loss': 7.948206424713135}


EP_train:1:  36%|| 541/1490 [36:50<1:04:07,  4.05s/it]

{'epoch': 1, 'iter': 540, 'mode': 'train', 'avg_loss': 7.692019805449875, 'avg_nsp_loss': 0.6952498523232676, 'avg_mlm_loss': 6.996769950041709, 'avg_acc': 50.57763401109058, 'loss': 7.6936445236206055}


EP_train:1:  37%|| 551/1490 [37:30<1:02:17,  3.98s/it]

{'epoch': 1, 'iter': 550, 'mode': 'train', 'avg_loss': 7.694643430830995, 'avg_nsp_loss': 0.6952114437323084, 'avg_mlm_loss': 6.999431983961601, 'avg_acc': 50.61252268602541, 'loss': 7.997581481933594}


EP_train:1:  38%|| 561/1490 [38:10<1:02:45,  4.05s/it]

{'epoch': 1, 'iter': 560, 'mode': 'train', 'avg_loss': 7.691784274769339, 'avg_nsp_loss': 0.6952054039467253, 'avg_mlm_loss': 6.9965788676352, 'avg_acc': 50.52361853832442, 'loss': 7.161801815032959}


EP_train:1:  38%|| 571/1490 [38:52<1:03:35,  4.15s/it]

{'epoch': 1, 'iter': 570, 'mode': 'train', 'avg_loss': 7.690358738974389, 'avg_nsp_loss': 0.6951638143868455, 'avg_mlm_loss': 6.995194921142792, 'avg_acc': 50.525394045534156, 'loss': 7.671873092651367}


EP_train:1:  39%|| 581/1490 [39:29<58:03,  3.83s/it]

{'epoch': 1, 'iter': 580, 'mode': 'train', 'avg_loss': 7.692248651369919, 'avg_nsp_loss': 0.6951528492154435, 'avg_mlm_loss': 6.997095799076783, 'avg_acc': 50.54862306368331, 'loss': 7.769173622131348}


EP_train:1:  40%|| 591/1490 [40:06<57:20,  3.83s/it]

{'epoch': 1, 'iter': 590, 'mode': 'train', 'avg_loss': 7.694245971802326, 'avg_nsp_loss': 0.6951415516599988, 'avg_mlm_loss': 6.999104416713295, 'avg_acc': 50.53405245346869, 'loss': 7.323993682861328}


EP_train:1:  40%|| 601/1490 [40:45<1:00:22,  4.08s/it]

{'epoch': 1, 'iter': 600, 'mode': 'train', 'avg_loss': 7.695452045878634, 'avg_nsp_loss': 0.6951059975758963, 'avg_mlm_loss': 7.000346045724167, 'avg_acc': 50.53036605657238, 'loss': 8.058143615722656}


EP_train:1:  41%|| 611/1490 [41:26<59:48,  4.08s/it]

{'epoch': 1, 'iter': 610, 'mode': 'train', 'avg_loss': 7.6974973936127755, 'avg_nsp_loss': 0.6949697343690501, 'avg_mlm_loss': 7.002527658365752, 'avg_acc': 50.618862520458265, 'loss': 7.536557674407959}


EP_train:1:  42%|| 621/1490 [42:02<54:26,  3.76s/it]

{'epoch': 1, 'iter': 620, 'mode': 'train', 'avg_loss': 7.699802482377696, 'avg_nsp_loss': 0.6951514201271745, 'avg_mlm_loss': 7.004651060426869, 'avg_acc': 50.56360708534622, 'loss': 7.646530628204346}


EP_train:1:  42%|| 631/1490 [42:40<55:26,  3.87s/it]

{'epoch': 1, 'iter': 630, 'mode': 'train', 'avg_loss': 7.702101197371204, 'avg_nsp_loss': 0.6949605736792938, 'avg_mlm_loss': 7.007140623597449, 'avg_acc': 50.628961965134714, 'loss': 8.124626159667969}


EP_train:1:  43%|| 641/1490 [43:21<1:00:46,  4.30s/it]

{'epoch': 1, 'iter': 640, 'mode': 'train', 'avg_loss': 7.700339080762937, 'avg_nsp_loss': 0.6953131553721317, 'avg_mlm_loss': 7.005025926134701, 'avg_acc': 50.60452418096724, 'loss': 7.120510101318359}


EP_train:1:  44%|| 651/1490 [44:04<57:57,  4.14s/it]

{'epoch': 1, 'iter': 650, 'mode': 'train', 'avg_loss': 7.697943135157525, 'avg_nsp_loss': 0.6952633747856738, 'avg_mlm_loss': 7.00267976110432, 'avg_acc': 50.68164362519201, 'loss': 7.897661209106445}


EP_train:1:  44%|| 661/1490 [44:45<1:00:47,  4.40s/it]

{'epoch': 1, 'iter': 660, 'mode': 'train', 'avg_loss': 7.698451891969805, 'avg_nsp_loss': 0.6954393877384342, 'avg_mlm_loss': 7.003012505674146, 'avg_acc': 50.60041603630863, 'loss': 8.294984817504883}


EP_train:1:  45%|| 671/1490 [45:25<54:59,  4.03s/it]

{'epoch': 1, 'iter': 670, 'mode': 'train', 'avg_loss': 7.699284429877063, 'avg_nsp_loss': 0.6953365942227325, 'avg_mlm_loss': 7.003947837164434, 'avg_acc': 50.652011922503725, 'loss': 7.755734443664551}


EP_train:1:  46%|| 681/1490 [46:07<54:31,  4.04s/it]

{'epoch': 1, 'iter': 680, 'mode': 'train', 'avg_loss': 7.698541509597487, 'avg_nsp_loss': 0.6954803403778749, 'avg_mlm_loss': 7.003061171320216, 'avg_acc': 50.637848751835534, 'loss': 8.244986534118652}


EP_train:1:  46%|| 691/1490 [46:45<50:10,  3.77s/it]

{'epoch': 1, 'iter': 690, 'mode': 'train', 'avg_loss': 7.698085524066312, 'avg_nsp_loss': 0.6954457620118358, 'avg_mlm_loss': 7.002639762917061, 'avg_acc': 50.642185238784364, 'loss': 7.530472755432129}


EP_train:1:  47%|| 701/1490 [47:30<57:12,  4.35s/it]

{'epoch': 1, 'iter': 700, 'mode': 'train', 'avg_loss': 7.696656714832562, 'avg_nsp_loss': 0.695472382274742, 'avg_mlm_loss': 7.001184333578156, 'avg_acc': 50.61965049928673, 'loss': 6.975205898284912}


EP_train:1:  48%|| 711/1490 [48:09<53:44,  4.14s/it]

{'epoch': 1, 'iter': 710, 'mode': 'train', 'avg_loss': 7.694422495180712, 'avg_nsp_loss': 0.6954456950038798, 'avg_mlm_loss': 6.998976800679825, 'avg_acc': 50.637306610407876, 'loss': 6.309720039367676}


EP_train:1:  48%|| 721/1490 [48:48<49:33,  3.87s/it]

{'epoch': 1, 'iter': 720, 'mode': 'train', 'avg_loss': 7.694698297365694, 'avg_nsp_loss': 0.695421734894529, 'avg_mlm_loss': 6.9992765639592145, 'avg_acc': 50.6371359223301, 'loss': 7.4297895431518555}


EP_train:1:  49%|| 731/1490 [49:33<55:26,  4.38s/it]

{'epoch': 1, 'iter': 730, 'mode': 'train', 'avg_loss': 7.691470594314805, 'avg_nsp_loss': 0.6954285801680078, 'avg_mlm_loss': 6.996042016103721, 'avg_acc': 50.564295485636116, 'loss': 7.804975509643555}


EP_train:1:  50%|| 741/1490 [50:11<47:33,  3.81s/it]

{'epoch': 1, 'iter': 740, 'mode': 'train', 'avg_loss': 7.692358991234248, 'avg_nsp_loss': 0.6953943666980655, 'avg_mlm_loss': 6.996964626466697, 'avg_acc': 50.57354925775979, 'loss': 7.967794895172119}


EP_train:1:  50%|| 751/1490 [50:50<46:44,  3.80s/it]

{'epoch': 1, 'iter': 750, 'mode': 'train', 'avg_loss': 7.693931474825355, 'avg_nsp_loss': 0.6954092412750509, 'avg_mlm_loss': 6.998522237201188, 'avg_acc': 50.56591211717709, 'loss': 7.950681686401367}


EP_train:1:  51%|| 761/1490 [51:32<54:14,  4.47s/it]

{'epoch': 1, 'iter': 760, 'mode': 'train', 'avg_loss': 7.693295774572626, 'avg_nsp_loss': 0.6953763357129893, 'avg_mlm_loss': 6.99791944293249, 'avg_acc': 50.59132720105125, 'loss': 7.962446212768555}


EP_train:1:  52%|| 771/1490 [52:14<47:28,  3.96s/it]

{'epoch': 1, 'iter': 770, 'mode': 'train', 'avg_loss': 7.692144010472081, 'avg_nsp_loss': 0.6953747311918652, 'avg_mlm_loss': 6.996769283532169, 'avg_acc': 50.559338521400775, 'loss': 7.877717018127441}


EP_train:1:  52%|| 781/1490 [52:56<49:44,  4.21s/it]

{'epoch': 1, 'iter': 780, 'mode': 'train', 'avg_loss': 7.692885566154607, 'avg_nsp_loss': 0.6953221817480617, 'avg_mlm_loss': 6.997563389596194, 'avg_acc': 50.60019206145967, 'loss': 7.950802803039551}


EP_train:1:  53%|| 791/1490 [53:40<47:57,  4.12s/it]

{'epoch': 1, 'iter': 790, 'mode': 'train', 'avg_loss': 7.690090637592841, 'avg_nsp_loss': 0.6953147939122583, 'avg_mlm_loss': 6.994775848653917, 'avg_acc': 50.60445638432364, 'loss': 7.5394673347473145}


EP_train:1:  54%|| 801/1490 [54:19<42:58,  3.74s/it]

{'epoch': 1, 'iter': 800, 'mode': 'train', 'avg_loss': 7.690725670622827, 'avg_nsp_loss': 0.6952350406015708, 'avg_mlm_loss': 6.995490635527802, 'avg_acc': 50.682740324594256, 'loss': 7.837271690368652}


EP_train:1:  54%|| 811/1490 [54:57<40:49,  3.61s/it]

{'epoch': 1, 'iter': 810, 'mode': 'train', 'avg_loss': 7.692393142991825, 'avg_nsp_loss': 0.6953498630723589, 'avg_mlm_loss': 6.9970432856520945, 'avg_acc': 50.61652281134402, 'loss': 8.056292533874512}


EP_train:1:  55%|| 821/1490 [55:39<42:55,  3.85s/it]

{'epoch': 1, 'iter': 820, 'mode': 'train', 'avg_loss': 7.692533671347726, 'avg_nsp_loss': 0.6953149191064173, 'avg_mlm_loss': 6.9972187592835144, 'avg_acc': 50.650883069427536, 'loss': 7.959969520568848}


EP_train:1:  56%|| 831/1490 [56:18<43:34,  3.97s/it]

{'epoch': 1, 'iter': 830, 'mode': 'train', 'avg_loss': 7.694499349192449, 'avg_nsp_loss': 0.6953991442811188, 'avg_mlm_loss': 6.9991002128729605, 'avg_acc': 50.60544524669074, 'loss': 7.258999824523926}


EP_train:1:  56%|| 841/1490 [56:58<44:26,  4.11s/it]

{'epoch': 1, 'iter': 840, 'mode': 'train', 'avg_loss': 7.69472628002643, 'avg_nsp_loss': 0.6954406268106205, 'avg_mlm_loss': 6.999285661082772, 'avg_acc': 50.5685196195006, 'loss': 7.293613910675049}


EP_train:1:  57%|| 851/1490 [57:36<39:34,  3.72s/it]

{'epoch': 1, 'iter': 850, 'mode': 'train', 'avg_loss': 7.693764738134436, 'avg_nsp_loss': 0.6953929902944105, 'avg_mlm_loss': 6.998371755754626, 'avg_acc': 50.60957696827262, 'loss': 7.9632463455200195}


EP_train:1:  58%|| 861/1490 [58:17<42:42,  4.07s/it]

{'epoch': 1, 'iter': 860, 'mode': 'train', 'avg_loss': 7.693327549304143, 'avg_nsp_loss': 0.6954316657456233, 'avg_mlm_loss': 6.997895891519651, 'avg_acc': 50.56257259001161, 'loss': 7.35750150680542}


EP_train:1:  58%|| 871/1490 [58:56<41:45,  4.05s/it]

{'epoch': 1, 'iter': 870, 'mode': 'train', 'avg_loss': 7.693105658762216, 'avg_nsp_loss': 0.6954181036226511, 'avg_mlm_loss': 6.9976875625987, 'avg_acc': 50.556113662456944, 'loss': 8.132981300354004}


EP_train:1:  59%|| 881/1490 [59:32<37:15,  3.67s/it]

{'epoch': 1, 'iter': 880, 'mode': 'train', 'avg_loss': 7.694448332510525, 'avg_nsp_loss': 0.6954339154056198, 'avg_mlm_loss': 6.99901442468234, 'avg_acc': 50.53206583427923, 'loss': 8.206183433532715}


EP_train:1:  60%|| 891/1490 [1:00:10<37:45,  3.78s/it]

{'epoch': 1, 'iter': 890, 'mode': 'train', 'avg_loss': 7.6935494430404985, 'avg_nsp_loss': 0.6953910851451818, 'avg_mlm_loss': 6.9981583655215, 'avg_acc': 50.57519640852974, 'loss': 7.342394828796387}


EP_train:1:  60%|| 901/1490 [1:00:45<35:12,  3.59s/it]

{'epoch': 1, 'iter': 900, 'mode': 'train', 'avg_loss': 7.696057055025598, 'avg_nsp_loss': 0.6954619994306406, 'avg_mlm_loss': 7.000595062607268, 'avg_acc': 50.55147058823529, 'loss': 7.598716735839844}


EP_train:1:  61%|| 911/1490 [1:01:25<40:54,  4.24s/it]

{'epoch': 1, 'iter': 910, 'mode': 'train', 'avg_loss': 7.693112237262412, 'avg_nsp_loss': 0.6954500730814971, 'avg_mlm_loss': 6.997662171050824, 'avg_acc': 50.54884742041712, 'loss': 7.254721164703369}


EP_train:1:  62%|| 921/1490 [1:02:03<36:17,  3.83s/it]

{'epoch': 1, 'iter': 920, 'mode': 'train', 'avg_loss': 7.692998482272369, 'avg_nsp_loss': 0.695431529763726, 'avg_mlm_loss': 6.997566958850939, 'avg_acc': 50.542888165038, 'loss': 7.823469161987305}


EP_train:1:  62%|| 931/1490 [1:02:39<33:34,  3.60s/it]

{'epoch': 1, 'iter': 930, 'mode': 'train', 'avg_loss': 7.692012827839426, 'avg_nsp_loss': 0.6954250260787415, 'avg_mlm_loss': 6.996587808611057, 'avg_acc': 50.537056928034374, 'loss': 7.727704048156738}


EP_train:1:  63%|| 941/1490 [1:03:18<34:29,  3.77s/it]

{'epoch': 1, 'iter': 940, 'mode': 'train', 'avg_loss': 7.691710322113524, 'avg_nsp_loss': 0.6954585556396136, 'avg_mlm_loss': 6.996251773124801, 'avg_acc': 50.50146121147715, 'loss': 7.750909805297852}


EP_train:1:  64%|| 951/1490 [1:03:59<37:38,  4.19s/it]

{'epoch': 1, 'iter': 950, 'mode': 'train', 'avg_loss': 7.689346178096928, 'avg_nsp_loss': 0.6955077971569747, 'avg_mlm_loss': 6.993838387959638, 'avg_acc': 50.47975814931651, 'loss': 7.701258659362793}


EP_train:1:  64%|| 961/1490 [1:04:39<32:32,  3.69s/it]

{'epoch': 1, 'iter': 960, 'mode': 'train', 'avg_loss': 7.689909743964982, 'avg_nsp_loss': 0.6955052816408854, 'avg_mlm_loss': 6.994404469766924, 'avg_acc': 50.416233090530696, 'loss': 7.75327730178833}


EP_train:1:  65%|| 971/1490 [1:05:18<33:19,  3.85s/it]

{'epoch': 1, 'iter': 970, 'mode': 'train', 'avg_loss': 7.6903866248322315, 'avg_nsp_loss': 0.6955005409911535, 'avg_mlm_loss': 6.994886090409253, 'avg_acc': 50.408728115345, 'loss': 7.6758270263671875}


EP_train:1:  66%|| 981/1490 [1:05:57<33:49,  3.99s/it]

{'epoch': 1, 'iter': 980, 'mode': 'train', 'avg_loss': 7.690778317193859, 'avg_nsp_loss': 0.6954767336417654, 'avg_mlm_loss': 6.99530159029635, 'avg_acc': 50.430045871559635, 'loss': 8.263396263122559}


EP_train:1:  67%|| 991/1490 [1:06:35<30:19,  3.65s/it]

{'epoch': 1, 'iter': 990, 'mode': 'train', 'avg_loss': 7.6896140231374055, 'avg_nsp_loss': 0.6954560284657868, 'avg_mlm_loss': 6.994158001889134, 'avg_acc': 50.435166498486375, 'loss': 7.846174716949463}


EP_train:1:  67%|| 1001/1490 [1:07:14<30:13,  3.71s/it]

{'epoch': 1, 'iter': 1000, 'mode': 'train', 'avg_loss': 7.690407075605669, 'avg_nsp_loss': 0.6954337080994567, 'avg_mlm_loss': 6.994973375127985, 'avg_acc': 50.41208791208791, 'loss': 7.937083721160889}


EP_train:1:  68%|| 1011/1490 [1:07:53<31:01,  3.89s/it]

{'epoch': 1, 'iter': 1010, 'mode': 'train', 'avg_loss': 7.688772092115514, 'avg_nsp_loss': 0.6954280258875808, 'avg_mlm_loss': 6.993344073479537, 'avg_acc': 50.3771018793274, 'loss': 6.829769134521484}


EP_train:1:  69%|| 1021/1490 [1:08:36<30:57,  3.96s/it]

{'epoch': 1, 'iter': 1020, 'mode': 'train', 'avg_loss': 7.688671728072507, 'avg_nsp_loss': 0.695413556493578, 'avg_mlm_loss': 6.993258178292479, 'avg_acc': 50.40095494613125, 'loss': 7.765303134918213}


EP_train:1:  69%|| 1031/1490 [1:09:16<31:05,  4.06s/it]

{'epoch': 1, 'iter': 1030, 'mode': 'train', 'avg_loss': 7.689034763062382, 'avg_nsp_loss': 0.6954351712151943, 'avg_mlm_loss': 6.993599598900308, 'avg_acc': 50.38494180407371, 'loss': 7.386463165283203}


EP_train:1:  70%|| 1041/1490 [1:09:55<29:17,  3.91s/it]

{'epoch': 1, 'iter': 1040, 'mode': 'train', 'avg_loss': 7.689484597168555, 'avg_nsp_loss': 0.6954259758494887, 'avg_mlm_loss': 6.994058628705233, 'avg_acc': 50.37223823246878, 'loss': 7.406581401824951}


EP_train:1:  71%|| 1051/1490 [1:10:34<28:59,  3.96s/it]

{'epoch': 1, 'iter': 1050, 'mode': 'train', 'avg_loss': 7.688839597547769, 'avg_nsp_loss': 0.6954115862850911, 'avg_mlm_loss': 6.993428017954731, 'avg_acc': 50.35977640342531, 'loss': 8.255762100219727}


EP_train:1:  71%|| 1061/1490 [1:11:14<30:31,  4.27s/it]

{'epoch': 1, 'iter': 1060, 'mode': 'train', 'avg_loss': 7.688277990159619, 'avg_nsp_loss': 0.6953663053544032, 'avg_mlm_loss': 6.992911692052153, 'avg_acc': 50.39172950047125, 'loss': 7.176748275756836}


EP_train:1:  72%|| 1071/1490 [1:11:51<25:12,  3.61s/it]

{'epoch': 1, 'iter': 1070, 'mode': 'train', 'avg_loss': 7.68934549426053, 'avg_nsp_loss': 0.6954031182047811, 'avg_mlm_loss': 6.993942383179366, 'avg_acc': 50.36472922502334, 'loss': 7.7889299392700195}


EP_train:1:  73%|| 1081/1490 [1:12:29<27:57,  4.10s/it]

{'epoch': 1, 'iter': 1080, 'mode': 'train', 'avg_loss': 7.689607875199367, 'avg_nsp_loss': 0.695385480842361, 'avg_mlm_loss': 6.994222401469863, 'avg_acc': 50.34111933395005, 'loss': 7.715754508972168}


EP_train:1:  73%|| 1091/1490 [1:13:08<25:36,  3.85s/it]

{'epoch': 1, 'iter': 1090, 'mode': 'train', 'avg_loss': 7.688538391802094, 'avg_nsp_loss': 0.6953790442971982, 'avg_mlm_loss': 6.993159355754485, 'avg_acc': 50.335128322639775, 'loss': 7.79604434967041}


EP_train:1:  74%|| 1101/1490 [1:13:48<26:22,  4.07s/it]

{'epoch': 1, 'iter': 1100, 'mode': 'train', 'avg_loss': 7.687842617242797, 'avg_nsp_loss': 0.6953432033757098, 'avg_mlm_loss': 6.992499421229696, 'avg_acc': 50.346276112624885, 'loss': 6.974954128265381}


EP_train:1:  75%|| 1111/1490 [1:14:29<27:03,  4.28s/it]

{'epoch': 1, 'iter': 1110, 'mode': 'train', 'avg_loss': 7.686883780035642, 'avg_nsp_loss': 0.6953737761500073, 'avg_mlm_loss': 6.991510010538178, 'avg_acc': 50.33753375337534, 'loss': 7.1555304527282715}


EP_train:1:  75%|| 1121/1490 [1:15:10<25:14,  4.11s/it]

{'epoch': 1, 'iter': 1120, 'mode': 'train', 'avg_loss': 7.6870769973741275, 'avg_nsp_loss': 0.695371170324688, 'avg_mlm_loss': 6.991705834068005, 'avg_acc': 50.315008920606594, 'loss': 7.769158363342285}


EP_train:1:  76%|| 1131/1490 [1:15:55<24:56,  4.17s/it]

{'epoch': 1, 'iter': 1130, 'mode': 'train', 'avg_loss': 7.688495465657228, 'avg_nsp_loss': 0.695449566809505, 'avg_mlm_loss': 6.993045905119122, 'avg_acc': 50.287356321839084, 'loss': 8.022727012634277}


EP_train:1:  77%|| 1141/1490 [1:16:35<23:17,  4.01s/it]

{'epoch': 1, 'iter': 1140, 'mode': 'train', 'avg_loss': 7.690091788821007, 'avg_nsp_loss': 0.6954621006583666, 'avg_mlm_loss': 6.994629694013403, 'avg_acc': 50.25744960560912, 'loss': 7.9564337730407715}


EP_train:1:  77%|| 1151/1490 [1:17:22<26:59,  4.78s/it]

{'epoch': 1, 'iter': 1150, 'mode': 'train', 'avg_loss': 7.687569248686657, 'avg_nsp_loss': 0.6954445183328916, 'avg_mlm_loss': 6.992124736050126, 'avg_acc': 50.228062554300614, 'loss': 7.580451965332031}


EP_train:1:  78%|| 1161/1490 [1:18:09<26:00,  4.74s/it]

{'epoch': 1, 'iter': 1160, 'mode': 'train', 'avg_loss': 7.685577284971462, 'avg_nsp_loss': 0.6954377858829744, 'avg_mlm_loss': 6.990139504119715, 'avg_acc': 50.231481481481474, 'loss': 7.292737007141113}


EP_train:1:  79%|| 1171/1490 [1:18:52<24:27,  4.60s/it]

{'epoch': 1, 'iter': 1170, 'mode': 'train', 'avg_loss': 7.686295497936636, 'avg_nsp_loss': 0.6954311415947573, 'avg_mlm_loss': 6.99086436199185, 'avg_acc': 50.2481853116994, 'loss': 7.883467674255371}


EP_train:1:  79%|| 1181/1490 [1:19:36<22:23,  4.35s/it]

{'epoch': 1, 'iter': 1180, 'mode': 'train', 'avg_loss': 7.686569664056376, 'avg_nsp_loss': 0.6954203616957055, 'avg_mlm_loss': 6.99114930801327, 'avg_acc': 50.21697713801863, 'loss': 8.035518646240234}


EP_train:1:  80%|| 1191/1490 [1:20:18<20:58,  4.21s/it]

{'epoch': 1, 'iter': 1190, 'mode': 'train', 'avg_loss': 7.686102409506925, 'avg_nsp_loss': 0.6954326609620319, 'avg_mlm_loss': 6.990669753749665, 'avg_acc': 50.199412258606216, 'loss': 7.047286033630371}


EP_train:1:  81%|| 1201/1490 [1:20:58<19:27,  4.04s/it]

{'epoch': 1, 'iter': 1200, 'mode': 'train', 'avg_loss': 7.686578731552746, 'avg_nsp_loss': 0.6954130892650372, 'avg_mlm_loss': 6.991165646704706, 'avg_acc': 50.19254787676935, 'loss': 7.770238876342773}


EP_train:1:  81%|| 1211/1490 [1:21:41<19:04,  4.10s/it]

{'epoch': 1, 'iter': 1210, 'mode': 'train', 'avg_loss': 7.686930655645596, 'avg_nsp_loss': 0.6954073212936237, 'avg_mlm_loss': 6.99152333907703, 'avg_acc': 50.18063583815029, 'loss': 7.944622039794922}


EP_train:1:  82%|| 1221/1490 [1:22:24<20:11,  4.50s/it]

{'epoch': 1, 'iter': 1220, 'mode': 'train', 'avg_loss': 7.687407397130393, 'avg_nsp_loss': 0.6954199870915612, 'avg_mlm_loss': 6.991987415066906, 'avg_acc': 50.1586814086814, 'loss': 7.974306106567383}


EP_train:1:  83%|| 1231/1490 [1:23:04<17:00,  3.94s/it]

{'epoch': 1, 'iter': 1230, 'mode': 'train', 'avg_loss': 7.688858959173207, 'avg_nsp_loss': 0.6954039451458123, 'avg_mlm_loss': 6.993455018772525, 'avg_acc': 50.167546709991875, 'loss': 7.631617546081543}


EP_train:1:  83%|| 1241/1490 [1:23:44<16:27,  3.97s/it]

{'epoch': 1, 'iter': 1240, 'mode': 'train', 'avg_loss': 7.689206950996884, 'avg_nsp_loss': 0.695418716773787, 'avg_mlm_loss': 6.993788238113489, 'avg_acc': 50.13346091861403, 'loss': 7.745174407958984}


EP_train:1:  84%|| 1251/1490 [1:24:22<15:19,  3.85s/it]

{'epoch': 1, 'iter': 1250, 'mode': 'train', 'avg_loss': 7.688688380731572, 'avg_nsp_loss': 0.6954200411681458, 'avg_mlm_loss': 6.9932683430892, 'avg_acc': 50.09742206235012, 'loss': 8.033339500427246}


EP_train:1:  85%|| 1261/1490 [1:25:06<17:32,  4.60s/it]

{'epoch': 1, 'iter': 1260, 'mode': 'train', 'avg_loss': 7.68653480369453, 'avg_nsp_loss': 0.6954012595406607, 'avg_mlm_loss': 6.991133546564524, 'avg_acc': 50.08921490880254, 'loss': 6.895249366760254}


EP_train:1:  85%|| 1271/1490 [1:25:47<15:13,  4.17s/it]

{'epoch': 1, 'iter': 1270, 'mode': 'train', 'avg_loss': 7.6846193982146085, 'avg_nsp_loss': 0.6953984421599484, 'avg_mlm_loss': 6.989220957977391, 'avg_acc': 50.07621951219512, 'loss': 7.181767463684082}


EP_train:1:  86%|| 1281/1490 [1:26:27<14:34,  4.18s/it]

{'epoch': 1, 'iter': 1280, 'mode': 'train', 'avg_loss': 7.684297850874604, 'avg_nsp_loss': 0.6953621971523455, 'avg_mlm_loss': 6.988935655955688, 'avg_acc': 50.092701014832166, 'loss': 7.618307113647461}


EP_train:1:  87%|| 1291/1490 [1:27:09<14:12,  4.28s/it]

{'epoch': 1, 'iter': 1290, 'mode': 'train', 'avg_loss': 7.683531931811391, 'avg_nsp_loss': 0.6953558765299832, 'avg_mlm_loss': 6.9881760569896745, 'avg_acc': 50.10892718822618, 'loss': 8.140323638916016}


EP_train:1:  87%|| 1301/1490 [1:27:49<12:55,  4.10s/it]

{'epoch': 1, 'iter': 1300, 'mode': 'train', 'avg_loss': 7.68366580738827, 'avg_nsp_loss': 0.6953808597470136, 'avg_mlm_loss': 6.988284950344312, 'avg_acc': 50.09367794004612, 'loss': 7.504725933074951}


EP_train:1:  88%|| 1311/1490 [1:28:27<11:25,  3.83s/it]

{'epoch': 1, 'iter': 1310, 'mode': 'train', 'avg_loss': 7.682714668145897, 'avg_nsp_loss': 0.6953681976535137, 'avg_mlm_loss': 6.987346472765633, 'avg_acc': 50.09773073989321, 'loss': 7.377573490142822}


EP_train:1:  89%|| 1321/1490 [1:29:08<11:19,  4.02s/it]

{'epoch': 1, 'iter': 1320, 'mode': 'train', 'avg_loss': 7.6823631672133645, 'avg_nsp_loss': 0.695358065697932, 'avg_mlm_loss': 6.987005104177563, 'avg_acc': 50.09462528387585, 'loss': 7.828757286071777}


EP_train:1:  89%|| 1331/1490 [1:29:46<09:46,  3.69s/it]

{'epoch': 1, 'iter': 1330, 'mode': 'train', 'avg_loss': 7.681607878002164, 'avg_nsp_loss': 0.6953329649928098, 'avg_mlm_loss': 6.986274916278429, 'avg_acc': 50.08217505634861, 'loss': 7.8239521980285645}


EP_train:1:  90%|| 1341/1490 [1:30:23<09:03,  3.65s/it]

{'epoch': 1, 'iter': 1340, 'mode': 'train', 'avg_loss': 7.6820236653378435, 'avg_nsp_loss': 0.6953291314293607, 'avg_mlm_loss': 6.986694537197628, 'avg_acc': 50.09088366890381, 'loss': 7.745208740234375}


EP_train:1:  91%|| 1351/1490 [1:31:02<09:00,  3.89s/it]

{'epoch': 1, 'iter': 1350, 'mode': 'train', 'avg_loss': 7.6809402588647355, 'avg_nsp_loss': 0.6953008216809203, 'avg_mlm_loss': 6.985639440845684, 'avg_acc': 50.11796817172465, 'loss': 8.249371528625488}


EP_train:1:  91%|| 1361/1490 [1:31:41<08:09,  3.79s/it]

{'epoch': 1, 'iter': 1360, 'mode': 'train', 'avg_loss': 7.681812505350666, 'avg_nsp_loss': 0.6953062091479838, 'avg_mlm_loss': 6.986506299531083, 'avg_acc': 50.096436443791326, 'loss': 7.7763352394104}


EP_train:1:  92%|| 1371/1490 [1:32:22<08:16,  4.17s/it]

{'epoch': 1, 'iter': 1370, 'mode': 'train', 'avg_loss': 7.680186182282251, 'avg_nsp_loss': 0.6952537397760966, 'avg_mlm_loss': 6.984932445940704, 'avg_acc': 50.13676148796499, 'loss': 6.638232231140137}


EP_train:1:  93%|| 1381/1490 [1:33:02<07:01,  3.87s/it]

{'epoch': 1, 'iter': 1380, 'mode': 'train', 'avg_loss': 7.680191931562265, 'avg_nsp_loss': 0.6953015750858078, 'avg_mlm_loss': 6.9848903601451, 'avg_acc': 50.12898262128892, 'loss': 7.683004379272461}


EP_train:1:  93%|| 1391/1490 [1:33:43<06:37,  4.02s/it]

{'epoch': 1, 'iter': 1390, 'mode': 'train', 'avg_loss': 7.679500250058753, 'avg_nsp_loss': 0.695304572625095, 'avg_mlm_loss': 6.984195680775975, 'avg_acc': 50.1078360891445, 'loss': 7.607997894287109}


EP_train:1:  94%|| 1401/1490 [1:34:25<06:30,  4.39s/it]

{'epoch': 1, 'iter': 1400, 'mode': 'train', 'avg_loss': 7.677183486835009, 'avg_nsp_loss': 0.6952947304556831, 'avg_mlm_loss': 6.981888759825418, 'avg_acc': 50.107066381156315, 'loss': 7.0470170974731445}


EP_train:1:  95%|| 1411/1490 [1:35:03<04:50,  3.68s/it]

{'epoch': 1, 'iter': 1410, 'mode': 'train', 'avg_loss': 7.677388862039416, 'avg_nsp_loss': 0.6952887462184076, 'avg_mlm_loss': 6.982100119073707, 'avg_acc': 50.088589652728565, 'loss': 7.702351093292236}


EP_train:1:  95%|| 1421/1490 [1:35:42<04:14,  3.68s/it]

{'epoch': 1, 'iter': 1420, 'mode': 'train', 'avg_loss': 7.677631594277368, 'avg_nsp_loss': 0.6953085108511385, 'avg_mlm_loss': 6.98232308682382, 'avg_acc': 50.05717804363125, 'loss': 7.969484806060791}


EP_train:1:  96%|| 1431/1490 [1:36:24<04:26,  4.52s/it]

{'epoch': 1, 'iter': 1430, 'mode': 'train', 'avg_loss': 7.678527847359349, 'avg_nsp_loss': 0.6953180144501265, 'avg_mlm_loss': 6.983209836241418, 'avg_acc': 50.024021663172604, 'loss': 7.22727108001709}


EP_train:1:  97%|| 1441/1490 [1:37:04<03:15,  3.99s/it]

{'epoch': 1, 'iter': 1440, 'mode': 'train', 'avg_loss': 7.677757361463008, 'avg_nsp_loss': 0.6953032097826395, 'avg_mlm_loss': 6.98245415498944, 'avg_acc': 50.01518043025677, 'loss': 7.36961555480957}


EP_train:1:  97%|| 1451/1490 [1:37:44<02:38,  4.07s/it]

{'epoch': 1, 'iter': 1450, 'mode': 'train', 'avg_loss': 7.677667967456526, 'avg_nsp_loss': 0.695283791839789, 'avg_mlm_loss': 6.982384178903002, 'avg_acc': 50.02799793246038, 'loss': 7.863082408905029}


EP_train:1:  98%|| 1461/1490 [1:38:23<01:51,  3.84s/it]

{'epoch': 1, 'iter': 1460, 'mode': 'train', 'avg_loss': 7.677421025426971, 'avg_nsp_loss': 0.6952621395011865, 'avg_mlm_loss': 6.9821588896791225, 'avg_acc': 50.04705681040383, 'loss': 7.502596378326416}


EP_train:1:  99%|| 1471/1490 [1:39:02<01:12,  3.79s/it]

{'epoch': 1, 'iter': 1470, 'mode': 'train', 'avg_loss': 7.6765221285220315, 'avg_nsp_loss': 0.6952752859555159, 'avg_mlm_loss': 6.981246846132259, 'avg_acc': 50.03611488783141, 'loss': 7.976295471191406}


EP_train:1:  99%|| 1481/1490 [1:39:45<00:39,  4.41s/it]

{'epoch': 1, 'iter': 1480, 'mode': 'train', 'avg_loss': 7.675003892098787, 'avg_nsp_loss': 0.6952625127920502, 'avg_mlm_loss': 6.979741382405695, 'avg_acc': 50.050641458474, 'loss': 6.751480579376221}


EP_train:1: 100%|| 1490/1490 [1:40:22<00:00,  4.04s/it]


EP1, train: avg_loss=7.6741, nsp_loss=0.6953, mlm_loss=6.9788, total_acc=50.06
Saved model after epoch 1 to /content/drive/MyDrive/email_bert/checkpoints/bert_email_epoch_1.pth


EP_train:2:   0%|| 1/1490 [00:04<1:43:34,  4.17s/it]

{'epoch': 2, 'iter': 0, 'mode': 'train', 'avg_loss': 7.242239475250244, 'avg_nsp_loss': 0.6881650686264038, 'avg_mlm_loss': 6.554074287414551, 'avg_acc': 56.25, 'loss': 7.242239475250244}


EP_train:2:   1%|| 11/1490 [00:45<1:33:46,  3.80s/it]

{'epoch': 2, 'iter': 10, 'mode': 'train', 'avg_loss': 7.538918972015381, 'avg_nsp_loss': 0.6953351118347861, 'avg_mlm_loss': 6.84358380057595, 'avg_acc': 49.14772727272727, 'loss': 7.875093936920166}


EP_train:2:   1%|| 21/1490 [01:26<1:41:11,  4.13s/it]

{'epoch': 2, 'iter': 20, 'mode': 'train', 'avg_loss': 7.619646344866071, 'avg_nsp_loss': 0.6944102701686677, 'avg_mlm_loss': 6.925236066182454, 'avg_acc': 49.107142857142854, 'loss': 7.808437347412109}


EP_train:2:   2%|| 31/1490 [02:06<1:34:39,  3.89s/it]

{'epoch': 2, 'iter': 30, 'mode': 'train', 'avg_loss': 7.623232010872133, 'avg_nsp_loss': 0.693892321278972, 'avg_mlm_loss': 6.929339670365857, 'avg_acc': 50.1008064516129, 'loss': 7.661066055297852}


EP_train:2:   3%|| 41/1490 [02:49<1:39:57,  4.14s/it]

{'epoch': 2, 'iter': 40, 'mode': 'train', 'avg_loss': 7.6127407260057405, 'avg_nsp_loss': 0.6938551838805036, 'avg_mlm_loss': 6.91888554503278, 'avg_acc': 50.762195121951216, 'loss': 7.919590473175049}


EP_train:2:   3%|| 51/1490 [03:28<1:33:21,  3.89s/it]

{'epoch': 2, 'iter': 50, 'mode': 'train', 'avg_loss': 7.621456052742753, 'avg_nsp_loss': 0.6947301778138852, 'avg_mlm_loss': 6.926725873760149, 'avg_acc': 50.73529411764706, 'loss': 7.552306175231934}


EP_train:2:   4%|| 61/1490 [04:07<1:27:03,  3.66s/it]

{'epoch': 2, 'iter': 60, 'mode': 'train', 'avg_loss': 7.627021773916776, 'avg_nsp_loss': 0.6936179604686674, 'avg_mlm_loss': 6.933403820287986, 'avg_acc': 51.48565573770492, 'loss': 7.731189250946045}


EP_train:2:   5%|| 71/1490 [04:53<1:50:49,  4.69s/it]

{'epoch': 2, 'iter': 70, 'mode': 'train', 'avg_loss': 7.6218046201786525, 'avg_nsp_loss': 0.6937668902773253, 'avg_mlm_loss': 6.928037737456846, 'avg_acc': 51.36443661971831, 'loss': 7.66251277923584}


EP_train:2:   5%|| 81/1490 [05:38<1:50:17,  4.70s/it]

{'epoch': 2, 'iter': 80, 'mode': 'train', 'avg_loss': 7.603405946566735, 'avg_nsp_loss': 0.6936921079953512, 'avg_mlm_loss': 6.909713845194122, 'avg_acc': 51.19598765432099, 'loss': 7.813864707946777}


EP_train:2:   6%|| 91/1490 [06:20<1:37:00,  4.16s/it]

{'epoch': 2, 'iter': 90, 'mode': 'train', 'avg_loss': 7.623403685433524, 'avg_nsp_loss': 0.693648371067676, 'avg_mlm_loss': 6.929755320915809, 'avg_acc': 51.20192307692307, 'loss': 7.862419128417969}


EP_train:2:   7%|| 101/1490 [07:03<1:42:39,  4.43s/it]

{'epoch': 2, 'iter': 100, 'mode': 'train', 'avg_loss': 7.6254197441705385, 'avg_nsp_loss': 0.6936305525279282, 'avg_mlm_loss': 6.93178920934696, 'avg_acc': 51.17574257425742, 'loss': 7.652588367462158}


EP_train:2:   7%|| 111/1490 [07:44<1:34:23,  4.11s/it]

{'epoch': 2, 'iter': 110, 'mode': 'train', 'avg_loss': 7.626892308931093, 'avg_nsp_loss': 0.6936267485489717, 'avg_mlm_loss': 6.9332655786394, 'avg_acc': 51.21058558558559, 'loss': 7.492204666137695}


EP_train:2:   8%|| 121/1490 [08:25<1:39:40,  4.37s/it]

{'epoch': 2, 'iter': 120, 'mode': 'train', 'avg_loss': 7.640283088053554, 'avg_nsp_loss': 0.6936944875835387, 'avg_mlm_loss': 6.946588614755425, 'avg_acc': 50.72314049586777, 'loss': 8.173139572143555}


EP_train:2:   9%|| 131/1490 [09:21<1:42:34,  4.53s/it]

{'epoch': 2, 'iter': 130, 'mode': 'train', 'avg_loss': 7.626154997876582, 'avg_nsp_loss': 0.693682327070309, 'avg_mlm_loss': 6.932472676721238, 'avg_acc': 50.667938931297705, 'loss': 6.948806285858154}


EP_train:2:   9%|| 141/1490 [10:03<1:30:57,  4.05s/it]

{'epoch': 2, 'iter': 140, 'mode': 'train', 'avg_loss': 7.626582690164552, 'avg_nsp_loss': 0.6936289845628941, 'avg_mlm_loss': 6.932953712788034, 'avg_acc': 50.576241134751776, 'loss': 8.009688377380371}


EP_train:2:  10%|| 151/1490 [10:42<1:29:31,  4.01s/it]

{'epoch': 2, 'iter': 150, 'mode': 'train', 'avg_loss': 7.624647585761468, 'avg_nsp_loss': 0.6937523057918675, 'avg_mlm_loss': 6.930895284311661, 'avg_acc': 50.289735099337754, 'loss': 7.661654949188232}


EP_train:2:  11%|| 161/1490 [11:19<1:24:48,  3.83s/it]

{'epoch': 2, 'iter': 160, 'mode': 'train', 'avg_loss': 7.623833851784653, 'avg_nsp_loss': 0.6937979644870166, 'avg_mlm_loss': 6.930035896182801, 'avg_acc': 50.213509316770185, 'loss': 7.606718063354492}


EP_train:2:  11%|| 171/1490 [11:58<1:24:18,  3.83s/it]

{'epoch': 2, 'iter': 170, 'mode': 'train', 'avg_loss': 7.626461530986585, 'avg_nsp_loss': 0.6939200584651434, 'avg_mlm_loss': 6.93254147914418, 'avg_acc': 50.12792397660819, 'loss': 7.81057596206665}


EP_train:2:  12%|| 181/1490 [12:38<1:30:48,  4.16s/it]

{'epoch': 2, 'iter': 180, 'mode': 'train', 'avg_loss': 7.61499580889117, 'avg_nsp_loss': 0.6939285542424871, 'avg_mlm_loss': 6.921067264198598, 'avg_acc': 50.18991712707182, 'loss': 7.305786609649658}


EP_train:2:  13%|| 191/1490 [13:14<1:15:58,  3.51s/it]

{'epoch': 2, 'iter': 190, 'mode': 'train', 'avg_loss': 7.622592758757905, 'avg_nsp_loss': 0.6938916035971716, 'avg_mlm_loss': 6.9287011660830515, 'avg_acc': 50.179973821989535, 'loss': 8.103616714477539}


EP_train:2:  13%|| 201/1490 [13:54<1:35:29,  4.44s/it]

{'epoch': 2, 'iter': 200, 'mode': 'train', 'avg_loss': 7.6112986797124, 'avg_nsp_loss': 0.6940138046421221, 'avg_mlm_loss': 6.9172848824837905, 'avg_acc': 49.96890547263681, 'loss': 6.488928318023682}


EP_train:2:  14%|| 211/1490 [14:34<1:23:15,  3.91s/it]

{'epoch': 2, 'iter': 210, 'mode': 'train', 'avg_loss': 7.6106663405612744, 'avg_nsp_loss': 0.6940909415059745, 'avg_mlm_loss': 6.916575404705029, 'avg_acc': 49.88151658767773, 'loss': 7.930776596069336}


EP_train:2:  15%|| 221/1490 [15:13<1:21:31,  3.85s/it]

{'epoch': 2, 'iter': 220, 'mode': 'train', 'avg_loss': 7.613435777603771, 'avg_nsp_loss': 0.6940854256509116, 'avg_mlm_loss': 6.919350354380198, 'avg_acc': 49.73133484162896, 'loss': 7.239068984985352}


EP_train:2:  16%|| 231/1490 [15:51<1:21:49,  3.90s/it]

{'epoch': 2, 'iter': 230, 'mode': 'train', 'avg_loss': 7.61130681388822, 'avg_nsp_loss': 0.6940808892250061, 'avg_mlm_loss': 6.917225926469415, 'avg_acc': 49.74296536796537, 'loss': 7.994170188903809}


EP_train:2:  16%|| 241/1490 [16:32<1:18:49,  3.79s/it]

{'epoch': 2, 'iter': 240, 'mode': 'train', 'avg_loss': 7.607298847056029, 'avg_nsp_loss': 0.6939624649360466, 'avg_mlm_loss': 6.9133363858298145, 'avg_acc': 49.88329875518672, 'loss': 7.716989994049072}


EP_train:2:  17%|| 251/1490 [17:09<1:16:31,  3.71s/it]

{'epoch': 2, 'iter': 250, 'mode': 'train', 'avg_loss': 7.605723527323202, 'avg_nsp_loss': 0.6939056703293941, 'avg_mlm_loss': 6.91181786031837, 'avg_acc': 50.0, 'loss': 6.54050874710083}


EP_train:2:  18%|| 261/1490 [17:46<1:14:59,  3.66s/it]

{'epoch': 2, 'iter': 260, 'mode': 'train', 'avg_loss': 7.6122893190932, 'avg_nsp_loss': 0.6938931540054379, 'avg_mlm_loss': 6.91839617148213, 'avg_acc': 50.0838122605364, 'loss': 7.998767375946045}


EP_train:2:  18%|| 271/1490 [18:24<1:16:49,  3.78s/it]

{'epoch': 2, 'iter': 270, 'mode': 'train', 'avg_loss': 7.615823828426234, 'avg_nsp_loss': 0.693874771964506, 'avg_mlm_loss': 6.9219490610805385, 'avg_acc': 50.13837638376384, 'loss': 7.524445533752441}


EP_train:2:  19%|| 281/1490 [19:03<1:17:23,  3.84s/it]

{'epoch': 2, 'iter': 280, 'mode': 'train', 'avg_loss': 7.615879963301254, 'avg_nsp_loss': 0.693929547729017, 'avg_mlm_loss': 6.921950418329748, 'avg_acc': 50.144572953736656, 'loss': 7.836769104003906}


EP_train:2:  20%|| 291/1490 [19:41<1:14:39,  3.74s/it]

{'epoch': 2, 'iter': 290, 'mode': 'train', 'avg_loss': 7.618022828577311, 'avg_nsp_loss': 0.6939229360970435, 'avg_mlm_loss': 6.924099895962325, 'avg_acc': 50.19329896907217, 'loss': 7.4426164627075195}


EP_train:2:  20%|| 301/1490 [20:20<1:16:06,  3.84s/it]

{'epoch': 2, 'iter': 300, 'mode': 'train', 'avg_loss': 7.614878816065994, 'avg_nsp_loss': 0.693873138720807, 'avg_mlm_loss': 6.921005679919475, 'avg_acc': 50.36337209302325, 'loss': 8.087715148925781}


EP_train:2:  21%|| 311/1490 [20:58<1:14:59,  3.82s/it]

{'epoch': 2, 'iter': 310, 'mode': 'train', 'avg_loss': 7.618304447345795, 'avg_nsp_loss': 0.6939803350393412, 'avg_mlm_loss': 6.924324115372931, 'avg_acc': 50.28135048231511, 'loss': 7.087029457092285}


EP_train:2:  22%|| 321/1490 [21:38<1:20:32,  4.13s/it]

{'epoch': 2, 'iter': 320, 'mode': 'train', 'avg_loss': 7.619620719802714, 'avg_nsp_loss': 0.6940081355356353, 'avg_mlm_loss': 6.925612587795079, 'avg_acc': 50.155763239875384, 'loss': 6.780459403991699}


EP_train:2:  22%|| 331/1490 [22:16<1:15:32,  3.91s/it]

{'epoch': 2, 'iter': 330, 'mode': 'train', 'avg_loss': 7.6213328859957326, 'avg_nsp_loss': 0.6939439283757052, 'avg_mlm_loss': 6.927388961941814, 'avg_acc': 50.217145015105736, 'loss': 7.949820041656494}


EP_train:2:  23%|| 341/1490 [22:54<1:11:12,  3.72s/it]

{'epoch': 2, 'iter': 340, 'mode': 'train', 'avg_loss': 7.623717579324224, 'avg_nsp_loss': 0.693996432939233, 'avg_mlm_loss': 6.929721149880865, 'avg_acc': 50.22910557184751, 'loss': 7.275622844696045}


EP_train:2:  24%|| 351/1490 [23:34<1:17:19,  4.07s/it]

{'epoch': 2, 'iter': 350, 'mode': 'train', 'avg_loss': 7.6235300455337915, 'avg_nsp_loss': 0.6939847386121071, 'avg_mlm_loss': 6.929545312865168, 'avg_acc': 50.20477207977208, 'loss': 7.078913688659668}


EP_train:2:  24%|| 361/1490 [24:14<1:09:26,  3.69s/it]

{'epoch': 2, 'iter': 360, 'mode': 'train', 'avg_loss': 7.619647294200358, 'avg_nsp_loss': 0.6941565882796396, 'avg_mlm_loss': 6.925490713515771, 'avg_acc': 50.25969529085873, 'loss': 8.066719055175781}


EP_train:2:  25%|| 371/1490 [24:53<1:11:45,  3.85s/it]

{'epoch': 2, 'iter': 370, 'mode': 'train', 'avg_loss': 7.6190645444103975, 'avg_nsp_loss': 0.6943026472616067, 'avg_mlm_loss': 6.924761905824399, 'avg_acc': 50.24427223719676, 'loss': 6.980616569519043}


EP_train:2:  26%|| 381/1490 [25:32<1:11:15,  3.86s/it]

{'epoch': 2, 'iter': 380, 'mode': 'train', 'avg_loss': 7.624251137881141, 'avg_nsp_loss': 0.6941689446842264, 'avg_mlm_loss': 6.930082199767505, 'avg_acc': 50.36909448818898, 'loss': 7.732395648956299}


EP_train:2:  26%|| 391/1490 [26:14<1:15:53,  4.14s/it]

{'epoch': 2, 'iter': 390, 'mode': 'train', 'avg_loss': 7.6147750734978015, 'avg_nsp_loss': 0.6941777266504819, 'avg_mlm_loss': 6.920597353249865, 'avg_acc': 50.4156010230179, 'loss': 7.633546829223633}


EP_train:2:  27%|| 401/1490 [26:53<1:11:45,  3.95s/it]

{'epoch': 2, 'iter': 400, 'mode': 'train', 'avg_loss': 7.610391897453631, 'avg_nsp_loss': 0.6940843052697598, 'avg_mlm_loss': 6.916307597980833, 'avg_acc': 50.49875311720699, 'loss': 7.698101043701172}


EP_train:2:  28%|| 411/1490 [27:31<1:04:28,  3.58s/it]

{'epoch': 2, 'iter': 410, 'mode': 'train', 'avg_loss': 7.611872176474319, 'avg_nsp_loss': 0.6939437567752643, 'avg_mlm_loss': 6.917928423034594, 'avg_acc': 50.600669099756686, 'loss': 7.824777126312256}


EP_train:2:  28%|| 421/1490 [28:11<1:09:26,  3.90s/it]

{'epoch': 2, 'iter': 420, 'mode': 'train', 'avg_loss': 7.6138120150622735, 'avg_nsp_loss': 0.6940622278743571, 'avg_mlm_loss': 6.919749788886861, 'avg_acc': 50.5270190023753, 'loss': 7.3577165603637695}


EP_train:2:  29%|| 431/1490 [28:47<1:06:13,  3.75s/it]

{'epoch': 2, 'iter': 430, 'mode': 'train', 'avg_loss': 7.614673993980248, 'avg_nsp_loss': 0.6939662100267521, 'avg_mlm_loss': 6.920707785613023, 'avg_acc': 50.60904872389791, 'loss': 7.351911544799805}


EP_train:2:  30%|| 441/1490 [29:25<1:03:59,  3.66s/it]

{'epoch': 2, 'iter': 440, 'mode': 'train', 'avg_loss': 7.61332346576682, 'avg_nsp_loss': 0.6942243986929896, 'avg_mlm_loss': 6.919099068155094, 'avg_acc': 50.538548752834465, 'loss': 7.415793418884277}


EP_train:2:  30%|| 451/1490 [30:05<1:11:20,  4.12s/it]

{'epoch': 2, 'iter': 450, 'mode': 'train', 'avg_loss': 7.6107666518894375, 'avg_nsp_loss': 0.6941798872535351, 'avg_mlm_loss': 6.916586764900225, 'avg_acc': 50.561252771618626, 'loss': 7.217215538024902}


EP_train:2:  31%|| 461/1490 [30:44<1:01:25,  3.58s/it]

{'epoch': 2, 'iter': 460, 'mode': 'train', 'avg_loss': 7.612982409636524, 'avg_nsp_loss': 0.6941723682616642, 'avg_mlm_loss': 6.918810040728389, 'avg_acc': 50.57619305856833, 'loss': 7.673384666442871}


EP_train:2:  32%|| 471/1490 [31:24<1:06:30,  3.92s/it]

{'epoch': 2, 'iter': 470, 'mode': 'train', 'avg_loss': 7.614389044970211, 'avg_nsp_loss': 0.6941548515024205, 'avg_mlm_loss': 6.920234193214692, 'avg_acc': 50.490976645435246, 'loss': 7.709486961364746}


EP_train:2:  32%|| 481/1490 [32:02<1:03:14,  3.76s/it]

{'epoch': 2, 'iter': 480, 'mode': 'train', 'avg_loss': 7.616348999205845, 'avg_nsp_loss': 0.6941584897636128, 'avg_mlm_loss': 6.922190508574805, 'avg_acc': 50.46127858627859, 'loss': 7.963531970977783}


EP_train:2:  33%|| 491/1490 [32:41<1:03:50,  3.83s/it]

{'epoch': 2, 'iter': 490, 'mode': 'train', 'avg_loss': 7.6136087137909865, 'avg_nsp_loss': 0.6941582684361521, 'avg_mlm_loss': 6.919450444262285, 'avg_acc': 50.44551934826884, 'loss': 6.57565975189209}


EP_train:2:  34%|| 501/1490 [33:18<1:00:32,  3.67s/it]

{'epoch': 2, 'iter': 500, 'mode': 'train', 'avg_loss': 7.616097820495179, 'avg_nsp_loss': 0.6941698054353634, 'avg_mlm_loss': 6.921928012680389, 'avg_acc': 50.51771457085829, 'loss': 7.706490516662598}


EP_train:2:  34%|| 511/1490 [33:54<55:10,  3.38s/it]

{'epoch': 2, 'iter': 510, 'mode': 'train', 'avg_loss': 7.621016479984888, 'avg_nsp_loss': 0.69423972102294, 'avg_mlm_loss': 6.926776757212301, 'avg_acc': 50.5075831702544, 'loss': 8.049092292785645}


EP_train:2:  35%|| 521/1490 [34:34<1:00:13,  3.73s/it]

{'epoch': 2, 'iter': 520, 'mode': 'train', 'avg_loss': 7.616553648877281, 'avg_nsp_loss': 0.6942328445742089, 'avg_mlm_loss': 6.922320801557369, 'avg_acc': 50.545825335892516, 'loss': 7.845755100250244}


EP_train:2:  36%|| 531/1490 [35:11<1:01:01,  3.82s/it]

{'epoch': 2, 'iter': 530, 'mode': 'train', 'avg_loss': 7.616958832785683, 'avg_nsp_loss': 0.6942612993515144, 'avg_mlm_loss': 6.922697530627924, 'avg_acc': 50.5355461393597, 'loss': 6.886404991149902}


EP_train:2:  36%|| 541/1490 [35:51<1:01:34,  3.89s/it]

{'epoch': 2, 'iter': 540, 'mode': 'train', 'avg_loss': 7.61154087105432, 'avg_nsp_loss': 0.6942034390188629, 'avg_mlm_loss': 6.9173374290607335, 'avg_acc': 50.525646950092415, 'loss': 7.348534107208252}


EP_train:2:  37%|| 551/1490 [36:31<59:08,  3.78s/it]

{'epoch': 2, 'iter': 550, 'mode': 'train', 'avg_loss': 7.607896344415073, 'avg_nsp_loss': 0.69431109902214, 'avg_mlm_loss': 6.913585242255846, 'avg_acc': 50.56715063520871, 'loss': 7.546045303344727}


EP_train:2:  38%|| 561/1490 [37:09<59:28,  3.84s/it]

{'epoch': 2, 'iter': 560, 'mode': 'train', 'avg_loss': 7.607553292510769, 'avg_nsp_loss': 0.6943453317657512, 'avg_mlm_loss': 6.913207955963896, 'avg_acc': 50.59046345811051, 'loss': 6.866390705108643}


EP_train:2:  38%|| 571/1490 [37:48<1:06:09,  4.32s/it]

{'epoch': 2, 'iter': 570, 'mode': 'train', 'avg_loss': 7.607631750156917, 'avg_nsp_loss': 0.6943199686745211, 'avg_mlm_loss': 6.913311776263075, 'avg_acc': 50.63485113835377, 'loss': 7.218629837036133}


EP_train:2:  39%|| 581/1490 [38:26<54:14,  3.58s/it]

{'epoch': 2, 'iter': 580, 'mode': 'train', 'avg_loss': 7.608649829168533, 'avg_nsp_loss': 0.6943312620747315, 'avg_mlm_loss': 6.914318563195391, 'avg_acc': 50.602409638554214, 'loss': 7.652583122253418}


EP_train:2:  40%|| 591/1490 [39:04<58:55,  3.93s/it]

{'epoch': 2, 'iter': 590, 'mode': 'train', 'avg_loss': 7.608883878704658, 'avg_nsp_loss': 0.6943384846656657, 'avg_mlm_loss': 6.914545390811669, 'avg_acc': 50.560490693739425, 'loss': 7.613012790679932}


EP_train:2:  40%|| 601/1490 [39:42<53:59,  3.64s/it]

{'epoch': 2, 'iter': 600, 'mode': 'train', 'avg_loss': 7.607405789481621, 'avg_nsp_loss': 0.6943151856619189, 'avg_mlm_loss': 6.913090601935363, 'avg_acc': 50.556364392678866, 'loss': 7.87324333190918}


EP_train:2:  41%|| 611/1490 [40:21<58:13,  3.97s/it]

{'epoch': 2, 'iter': 610, 'mode': 'train', 'avg_loss': 7.609347229503376, 'avg_nsp_loss': 0.6942950134971918, 'avg_mlm_loss': 6.915052214152684, 'avg_acc': 50.54725859247136, 'loss': 7.499687194824219}


EP_train:2:  42%|| 621/1490 [40:58<53:53,  3.72s/it]

{'epoch': 2, 'iter': 620, 'mode': 'train', 'avg_loss': 7.61051938691193, 'avg_nsp_loss': 0.6943052375566172, 'avg_mlm_loss': 6.916214146667825, 'avg_acc': 50.50825281803542, 'loss': 7.698731899261475}


EP_train:2:  42%|| 631/1490 [41:38<57:08,  3.99s/it]

{'epoch': 2, 'iter': 630, 'mode': 'train', 'avg_loss': 7.610786219595352, 'avg_nsp_loss': 0.6943067395365937, 'avg_mlm_loss': 6.916479476469254, 'avg_acc': 50.5101030110935, 'loss': 7.683958530426025}


EP_train:2:  43%|| 641/1490 [42:15<51:34,  3.65s/it]

{'epoch': 2, 'iter': 640, 'mode': 'train', 'avg_loss': 7.609602876832816, 'avg_nsp_loss': 0.6943314602706064, 'avg_mlm_loss': 6.915271413121692, 'avg_acc': 50.44851794071763, 'loss': 7.823378086090088}


EP_train:2:  44%|| 651/1490 [42:55<56:50,  4.07s/it]

{'epoch': 2, 'iter': 650, 'mode': 'train', 'avg_loss': 7.607088756268292, 'avg_nsp_loss': 0.6943265550330671, 'avg_mlm_loss': 6.912762198030674, 'avg_acc': 50.46082949308756, 'loss': 6.624450206756592}


EP_train:2:  44%|| 661/1490 [43:36<55:19,  4.00s/it]

{'epoch': 2, 'iter': 660, 'mode': 'train', 'avg_loss': 7.606316728058093, 'avg_nsp_loss': 0.6942694730910158, 'avg_mlm_loss': 6.9120472519913525, 'avg_acc': 50.52004538577912, 'loss': 8.169042587280273}


EP_train:2:  45%|| 671/1490 [44:17<52:30,  3.85s/it]

{'epoch': 2, 'iter': 670, 'mode': 'train', 'avg_loss': 7.608310879964587, 'avg_nsp_loss': 0.6942307553774554, 'avg_mlm_loss': 6.914080120945119, 'avg_acc': 50.55421013412816, 'loss': 7.776410102844238}


EP_train:2:  46%|| 681/1490 [44:58<56:07,  4.16s/it]

{'epoch': 2, 'iter': 680, 'mode': 'train', 'avg_loss': 7.6097816048564715, 'avg_nsp_loss': 0.6942777380719234, 'avg_mlm_loss': 6.915503863896217, 'avg_acc': 50.50936123348018, 'loss': 7.677582740783691}


EP_train:2:  46%|| 691/1490 [45:36<49:30,  3.72s/it]

{'epoch': 2, 'iter': 690, 'mode': 'train', 'avg_loss': 7.610151907814567, 'avg_nsp_loss': 0.694266490498086, 'avg_mlm_loss': 6.915885413952398, 'avg_acc': 50.51555716353111, 'loss': 7.797013282775879}


EP_train:2:  47%|| 701/1490 [46:16<50:35,  3.85s/it]

{'epoch': 2, 'iter': 700, 'mode': 'train', 'avg_loss': 7.605532865891613, 'avg_nsp_loss': 0.6942695183692748, 'avg_mlm_loss': 6.911263344121218, 'avg_acc': 50.49037089871612, 'loss': 8.087590217590332}


EP_train:2:  48%|| 711/1490 [46:54<51:31,  3.97s/it]

{'epoch': 2, 'iter': 710, 'mode': 'train', 'avg_loss': 7.60302601651971, 'avg_nsp_loss': 0.6942669972253416, 'avg_mlm_loss': 6.908759015186594, 'avg_acc': 50.46149789029536, 'loss': 7.402276992797852}


EP_train:2:  48%|| 721/1490 [47:33<48:21,  3.77s/it]

{'epoch': 2, 'iter': 720, 'mode': 'train', 'avg_loss': 7.603051170396738, 'avg_nsp_loss': 0.6942615040297647, 'avg_mlm_loss': 6.908789663142867, 'avg_acc': 50.47243411927877, 'loss': 7.7221198081970215}


EP_train:2:  49%|| 731/1490 [48:14<49:55,  3.95s/it]

{'epoch': 2, 'iter': 730, 'mode': 'train', 'avg_loss': 7.602102623429409, 'avg_nsp_loss': 0.6942413088887237, 'avg_mlm_loss': 6.907861311686838, 'avg_acc': 50.50017099863201, 'loss': 7.692418098449707}


EP_train:2:  50%|| 741/1490 [48:53<45:45,  3.67s/it]

{'epoch': 2, 'iter': 740, 'mode': 'train', 'avg_loss': 7.601293552259684, 'avg_nsp_loss': 0.6942597771624001, 'avg_mlm_loss': 6.907033772281951, 'avg_acc': 50.489203778677464, 'loss': 7.832765579223633}


EP_train:2:  50%|| 751/1490 [49:30<48:15,  3.92s/it]

{'epoch': 2, 'iter': 750, 'mode': 'train', 'avg_loss': 7.603808265233961, 'avg_nsp_loss': 0.6942515874829654, 'avg_mlm_loss': 6.909556674576313, 'avg_acc': 50.449400798934754, 'loss': 7.477266311645508}


EP_train:2:  51%|| 761/1490 [50:08<46:39,  3.84s/it]

{'epoch': 2, 'iter': 760, 'mode': 'train', 'avg_loss': 7.603683056248628, 'avg_nsp_loss': 0.6942142733763145, 'avg_mlm_loss': 6.909468780522591, 'avg_acc': 50.42296320630749, 'loss': 7.684605598449707}


EP_train:2:  52%|| 771/1490 [50:48<48:31,  4.05s/it]

{'epoch': 2, 'iter': 770, 'mode': 'train', 'avg_loss': 7.606067044571061, 'avg_nsp_loss': 0.6942447454085146, 'avg_mlm_loss': 6.911822296765991, 'avg_acc': 50.470168612191955, 'loss': 7.261106967926025}


EP_train:2:  52%|| 781/1490 [51:24<42:56,  3.63s/it]

{'epoch': 2, 'iter': 780, 'mode': 'train', 'avg_loss': 7.604844612898198, 'avg_nsp_loss': 0.6943885532139816, 'avg_mlm_loss': 6.910456058463122, 'avg_acc': 50.420134443021766, 'loss': 7.952849864959717}


EP_train:2:  53%|| 791/1490 [52:05<50:56,  4.37s/it]

{'epoch': 2, 'iter': 790, 'mode': 'train', 'avg_loss': 7.605190009443859, 'avg_nsp_loss': 0.694393658306445, 'avg_mlm_loss': 6.910796349931758, 'avg_acc': 50.406921618204805, 'loss': 7.639036178588867}


EP_train:2:  54%|| 801/1490 [52:45<46:23,  4.04s/it]

{'epoch': 2, 'iter': 800, 'mode': 'train', 'avg_loss': 7.605494089638547, 'avg_nsp_loss': 0.6943979621975311, 'avg_mlm_loss': 6.911096126994539, 'avg_acc': 50.42134831460674, 'loss': 7.786386489868164}


EP_train:2:  54%|| 811/1490 [53:23<43:52,  3.88s/it]

{'epoch': 2, 'iter': 810, 'mode': 'train', 'avg_loss': 7.604940323882567, 'avg_nsp_loss': 0.6944046483527865, 'avg_mlm_loss': 6.910535675382791, 'avg_acc': 50.42771270036991, 'loss': 7.793741703033447}


EP_train:2:  55%|| 821/1490 [54:02<45:58,  4.12s/it]

{'epoch': 2, 'iter': 820, 'mode': 'train', 'avg_loss': 7.602700920546389, 'avg_nsp_loss': 0.6943942314959908, 'avg_mlm_loss': 6.908306688687397, 'avg_acc': 50.43011571254568, 'loss': 7.22819709777832}


EP_train:2:  56%|| 831/1490 [54:45<44:53,  4.09s/it]

{'epoch': 2, 'iter': 830, 'mode': 'train', 'avg_loss': 7.602666875515604, 'avg_nsp_loss': 0.6943800955042512, 'avg_mlm_loss': 6.908286779796174, 'avg_acc': 50.44750300842359, 'loss': 7.47489070892334}


EP_train:2:  56%|| 841/1490 [55:24<43:27,  4.02s/it]

{'epoch': 2, 'iter': 840, 'mode': 'train', 'avg_loss': 7.603183709483651, 'avg_nsp_loss': 0.6944107029866095, 'avg_mlm_loss': 6.908773007205777, 'avg_acc': 50.41245541022592, 'loss': 8.241098403930664}


EP_train:2:  57%|| 851/1490 [56:04<41:10,  3.87s/it]

{'epoch': 2, 'iter': 850, 'mode': 'train', 'avg_loss': 7.602259047022998, 'avg_nsp_loss': 0.6944264696011392, 'avg_mlm_loss': 6.907832578822672, 'avg_acc': 50.396592244418336, 'loss': 7.750434398651123}


EP_train:2:  58%|| 861/1490 [56:46<44:43,  4.27s/it]

{'epoch': 2, 'iter': 860, 'mode': 'train', 'avg_loss': 7.602005403631655, 'avg_nsp_loss': 0.6944120628770082, 'avg_mlm_loss': 6.907593341654602, 'avg_acc': 50.38472706155633, 'loss': 8.041398048400879}


EP_train:2:  58%|| 871/1490 [57:23<37:23,  3.62s/it]

{'epoch': 2, 'iter': 870, 'mode': 'train', 'avg_loss': 7.6018869770106985, 'avg_nsp_loss': 0.6944290998335136, 'avg_mlm_loss': 6.907457878887995, 'avg_acc': 50.35519517795637, 'loss': 7.749552249908447}


EP_train:2:  59%|| 879/1490 [57:55<40:16,  3.95s/it]


KeyboardInterrupt: 

In [None]:
!ls /content
# Root directory that is scanned for emails below; its immediate entries are
# iterated as per-user directories.
enron_dir = '/content/maildir/'

import os, re, email, tqdm

# Maximum number of whitespace-separated words kept from each sentence when
# the sentence pairs are built.
MAX_LEN = 128
# Accumulates [sentence, following_sentence] pairs; reset to empty every time
# this cell runs and filled by the processing loop below.
email_pairs = []

def fast_body_extract(path):
    """Return the body text of the email file at ``path``.

    For a multipart message the first ``text/plain`` part is used; for any
    other message the message's own payload is used. The payload is
    transfer-decoded and then decoded as latin1.

    Args:
        path: filesystem path of a raw email file.

    Returns:
        str: the body text, or ``""`` when the file cannot be read or parsed,
        when the payload is missing, or when a multipart message has no
        ``text/plain`` part. The result is always a string, never ``None``.
    """
    try:
        with open(path, "r", errors="ignore") as f:
            msg = email.message_from_string(f.read())

        if msg.is_multipart():
            plain_parts = (
                part for part in msg.walk()
                if part.get_content_type() == "text/plain"
            )
            part = next(plain_parts, None)
        else:
            part = msg

        if part is None:
            return ""

        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        return payload.decode("latin1")
    except Exception:
        # Best-effort extraction: unreadable or malformed files are skipped.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the cell.
        return ""

print("Processing emails...")

# Sampling caps for this reduced-size run.
MAX_USERS = 30
MAX_FOLDERS_PER_USER = 5
MAX_FILES_PER_FOLDER = 50
# Sentences must be longer than this many characters (after stripping) to be kept.
MIN_SENT_CHARS = 10

# Sentence boundaries: one or more of '.', '!' or '?'.
sentence_break = re.compile(r'[.!?]+')

# os.listdir() returns names in arbitrary order, so each listing is sorted
# before it is sliced; otherwise the sampled subset (and therefore the
# tokenizer and training data) can differ from run to run.
users = sorted(os.listdir(enron_dir))[:MAX_USERS]
for u in users:
    udir = os.path.join(enron_dir, u)
    if not os.path.isdir(udir): continue
    for folder in sorted(os.listdir(udir))[:MAX_FOLDERS_PER_USER]:
        fdir = os.path.join(udir, folder)
        if not os.path.isdir(fdir): continue
        for file in sorted(os.listdir(fdir))[:MAX_FILES_PER_FOLDER]:
            fpath = os.path.join(fdir, file)
            body = fast_body_extract(fpath)
            if not body: continue

            sents = [s.strip() for s in sentence_break.split(body)]
            sents = [s for s in sents if len(s) > MIN_SENT_CHARS]

            # Pair every kept sentence with the one that follows it, capping
            # each side at MAX_LEN whitespace-separated words.
            for first, second in zip(sents, sents[1:]):
                email_pairs.append([
                    " ".join(first.split()[:MAX_LEN]),
                    " ".join(second.split()[:MAX_LEN])
                ])

print("Pairs:", len(email_pairs))
from pathlib import Path

path = "/content/drive/MyDrive/email_bert/data/"
os.makedirs(path, exist_ok=True)

# Number of lines buffered before a chunk file is flushed to disk.
CHUNK_LINES = 10000


def _write_chunk(lines, chunk_id):
    """Write ``lines`` (one per row) to ``text_<chunk_id>.txt`` and return its path."""
    out_path = os.path.join(path, f"text_{chunk_id}.txt")
    # Bodies were decoded as latin1 and may hold non-ASCII characters; write
    # UTF-8 explicitly instead of relying on the platform default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return out_path


chunks = []
chunk_id = 0
# Only the files written by this run. The target directory is on Drive and
# persists between sessions, so globbing "*.txt" would also return chunk files
# left behind by earlier runs.
paths = []

for t1, t2 in tqdm.tqdm(email_pairs):
    chunks.append(t1)
    chunks.append(t2)
    if len(chunks) >= CHUNK_LINES:
        paths.append(_write_chunk(chunks, chunk_id))
        chunks, chunk_id = [], chunk_id+1

# Flush the final, partially filled chunk.
if chunks:
    paths.append(_write_chunk(chunks, chunk_id))

print("Text files:", len(paths))
from tokenizers import BertWordPieceTokenizer

# Directory that receives the trained vocabulary file.
TOKENIZER_DIR = "/content/drive/MyDrive/email_bert/tokenizer"
os.makedirs(TOKENIZER_DIR, exist_ok=True)

tok = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tok.train(
    files=paths,
    vocab_size=30000,
    min_frequency=2,
    limit_alphabet=1000,
    special_tokens=["[PAD]","[CLS]","[SEP]","[MASK]","[UNK]"]
)

# Writes <TOKENIZER_DIR>/bert-email-vocab.txt
tok.save_model(TOKENIZER_DIR, "bert-email")

from transformers import BertTokenizerFast
# Normalisation options must mirror the ones used for training above.
# BertTokenizerFast names them `do_lower_case` / `tokenize_chinese_chars`;
# the previous `lowercase=` keyword is not a parameter of this class.
tokenizer = BertTokenizerFast(
    vocab_file=os.path.join(TOKENIZER_DIR, "bert-email-vocab.txt"),
    do_lower_case=True,
    tokenize_chinese_chars=False,
    strip_accents=False
)

print("Tokenizer size:", len(tokenizer))
import torch
from torch.utils.data import Dataset

class EmailBERTDataset(Dataset):
    """Sentence-pair dataset yielding joint MLM + NSP training examples.

    Args:
        pairs: sequence of ``[first, second]`` strings in which ``second`` is
            the sentence that follows ``first``.
        tokenizer: callable invoked as ``tokenizer(t1, t2, max_length=...,
            padding=..., truncation=..., return_tensors="pt")`` that returns
            ``input_ids`` and ``token_type_ids`` with a leading batch
            dimension of 1, and exposes ``pad_token_id``, ``cls_token_id``,
            ``sep_token_id`` and ``mask_token_id``.
        seq_len: length every example is padded / truncated to.
    """

    # Probability with which a non-special token is selected for masking.
    MASK_PROB = 0.15

    def __init__(self, pairs, tokenizer, seq_len=256):
        self.pairs = pairs
        self.tok = tokenizer
        self.seq_len = seq_len
        self.vocab_size = len(tokenizer)

    def __len__(self): return len(self.pairs)

    def _negative_second(self, i):
        """Return the second sentence of a uniformly chosen pair other than ``i``.

        Requires at least two pairs.
        """
        n = len(self.pairs)
        # Draw from n-1 slots and shift past i so pair i can never be chosen;
        # otherwise the true next sentence could be labelled "not next".
        j = torch.randint(n - 1, (1,)).item()
        if j >= i % n:
            j += 1
        return self.pairs[j][1]

    def __getitem__(self, i):
        """Build one example.

        Returns:
            dict with ``input_ids`` (selected tokens replaced by the mask
            token), ``token_type_ids``, ``labels`` (original ids at masked
            positions, -100 elsewhere) and ``is_next`` (1 when the second
            sentence really follows the first, 0 when it was swapped).
        """
        t1, t2 = self.pairs[i]

        # 50% random NSP replacement. With fewer than two pairs there is no
        # other sentence to swap in, so the example stays positive.
        if len(self.pairs) < 2 or torch.rand(1).item() < 0.5:
            is_next = 1
        else:
            t2 = self._negative_second(i)
            is_next = 0

        out = self.tok(
            t1, t2,
            max_length=self.seq_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Drop only the batch dimension (a bare squeeze() would also collapse
        # a length-1 sequence).
        ids = out["input_ids"].squeeze(0)
        seg = out["token_type_ids"].squeeze(0)

        # Create MLM mask: never select [PAD], [CLS] or [SEP].
        rand = torch.rand(ids.shape)
        special = (
            (ids == self.tok.pad_token_id)
            | (ids == self.tok.cls_token_id)
            | (ids == self.tok.sep_token_id)
        )
        mask = (rand < self.MASK_PROB) & ~special

        # -100 is the ignore_index of the MLM loss.
        labels = ids.clone()
        labels[~mask] = -100

        # Replace masked tokens
        ids_masked = ids.clone()
        ids_masked[mask] = self.tok.mask_token_id

        return {
            "input_ids": ids_masked,
            "token_type_ids": seg,
            "labels": labels,
            "is_next": torch.tensor(is_next)
        }
from torch.utils.data import DataLoader

# Wrap the sentence pairs in the MLM/NSP dataset and batch them for training.
train_dataset = EmailBERTDataset(pairs=email_pairs, tokenizer=tokenizer)
loader_options = dict(batch_size=32, shuffle=True, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, **loader_options)

# Smoke test: pull one batch and show the shape of its input ids.
batch = next(iter(train_loader))
print("Batch OK:", batch["input_ids"].shape)
import torch.nn as nn
import torch
from torch.cuda.amp import autocast, GradScaler

class BERTSmall(nn.Module):
    """Small BERT-style encoder with MLM and NSP heads.

    Args:
        vocab: vocabulary size (rows of the token embedding / MLM output).
        d_model: hidden size.
        layers: number of Transformer encoder layers.
        heads: attention heads per layer.
        seq_len: maximum sequence length (rows of the position embedding).
        pad_id: token id treated as padding and hidden from attention.
            Defaults to 0; pass ``None`` to disable padding masking.
            NOTE(review): 0 assumes "[PAD]" is the first vocabulary entry —
            confirm against the tokenizer's ``pad_token_id``.
    """

    def __init__(self, vocab, d_model=512, layers=8, heads=8, seq_len=256, pad_id=0):
        super().__init__()
        self.pad_id = pad_id
        self.emb = nn.Embedding(vocab, d_model)
        self.pos = nn.Embedding(seq_len, d_model)
        self.seg = nn.Embedding(2, d_model)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=heads,
            dim_feedforward=d_model*3,
            batch_first=True,
            activation="gelu"
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=layers)

        self.mlm = nn.Linear(d_model, vocab)
        self.nsp = nn.Linear(d_model, 2)

    def forward(self, ids, seg):
        """Encode a batch.

        Args:
            ids: ``(batch, length)`` integer token ids.
            seg: ``(batch, length)`` segment ids (0 or 1).

        Returns:
            ``(mlm_logits, nsp_logits)``: per-position vocabulary logits and
            2-way logits computed from the first position of each sequence.
        """
        L = ids.size(1)
        pos = torch.arange(L, device=ids.device).unsqueeze(0)

        x = self.emb(ids) + self.pos(pos) + self.seg(seg)

        # True marks padding positions that attention must ignore. Without it
        # every token also attends to the padding that fills the sequence.
        pad_mask = ids.eq(self.pad_id) if self.pad_id is not None else None
        x = self.encoder(x, src_key_padding_mask=pad_mask)

        mlm_logits = self.mlm(x)
        nsp_logits = self.nsp(x[:,0])
        return mlm_logits, nsp_logits
class Trainer:
    """Holds everything the training loop needs.

    Attributes set in ``__init__``: ``model`` (moved to ``device``),
    ``loader``, ``device``, ``opt`` (AdamW, lr 3e-4), ``scaler`` (AMP gradient
    scaler), ``mlm_loss`` (cross entropy ignoring label -100) and ``nsp_loss``
    (cross entropy).

    Args:
        model: the network to train.
        loader: iterable of training batches.
        device: device the model is moved to and batches are sent to.
    """

    def __init__(self, model, loader, device="cuda"):
        self.model = model.to(device)
        self.loader = loader
        self.device = device
        self.opt = torch.optim.AdamW(self.model.parameters(), lr=3e-4)
        self.scaler = GradScaler()
        self.mlm_loss = nn.CrossEntropyLoss(ignore_index=-100)
        self.nsp_loss = nn.CrossEntropyLoss()


# Module-level import used by the checkpointing code below. It was previously
# indented into the class body, where it only created a `Trainer.os` attribute.
import os

# -------------------------
# Count model parameters
# -------------------------
def count_parameters(model):
    """Return the total element count of the model's trainable parameters."""
    trainable = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            trainable += param.numel()
    return trainable


# ==========================================================
#  UPDATED TRAINING FUNCTION WITH METRICS & CHECKPOINTS
# ==========================================================
def train_with_checkpoints(trainer, epochs=100, ckpt_dir="/content/drive/MyDrive/email_bert/checkpoints_small/"):
    """Train ``trainer.model`` on ``trainer.loader`` with a joint MLM + NSP loss.

    After each epoch the mean MLM, NSP and total loss and the NSP accuracy are
    printed. Every 10th epoch the model ``state_dict`` is saved into
    ``ckpt_dir`` (file names carry the 0-based epoch index), and after the
    last epoch it is saved once more to a fixed path under
    ``/content/drive/MyDrive/email_bert/``.

    Args:
        trainer: object exposing ``model``, ``loader``, ``device``, ``opt``,
            ``scaler``, ``mlm_loss`` and ``nsp_loss``.
        epochs: number of passes over ``trainer.loader``.
        ckpt_dir: directory for the periodic checkpoints; created if missing.
    """
    os.makedirs(ckpt_dir, exist_ok=True)

    print("\n==============================")
    print("🚀 Training Started")
    print("Total Trainable Parameters:", count_parameters(trainer.model))
    print("==============================\n")

    for ep in range(epochs):
        # Ensure dropout is active even if the model was put in eval() mode
        # by an earlier cell.
        trainer.model.train()

        total = 0
        correct = 0
        mlm_loss_sum = 0
        nsp_loss_sum = 0
        total_loss_sum = 0
        steps = 0
        skipped = 0

        for batch in tqdm.tqdm(trainer.loader, desc=f"Epoch {ep}"):
            ids = batch["input_ids"].to(trainer.device)
            seg = batch["token_type_ids"].to(trainer.device)
            mlm_labels = batch["labels"].to(trainer.device)
            nsp_labels = batch["is_next"].to(trainer.device)

            with autocast():
                mlm_logits, nsp_logits = trainer.model(ids, seg)
                # Flatten to (batch * length, vocab) vs (batch * length,)
                loss_mlm = trainer.mlm_loss(
                    mlm_logits.view(-1, mlm_logits.size(-1)),
                    mlm_labels.view(-1)
                )
                loss_nsp = trainer.nsp_loss(nsp_logits, nsp_labels)
                loss = loss_mlm + loss_nsp

            # A batch in which no token was selected for masking has every
            # label at -100, which makes the mean MLM loss NaN (likely for a
            # small final batch). Skip it so it cannot poison the averages.
            if not torch.isfinite(loss):
                skipped += 1
                continue

            trainer.opt.zero_grad()
            trainer.scaler.scale(loss).backward()
            trainer.scaler.step(trainer.opt)
            trainer.scaler.update()

            # Metrics
            correct += (nsp_logits.argmax(-1) == nsp_labels).sum().item()
            total += ids.size(0)

            mlm_loss_sum += loss_mlm.item()
            nsp_loss_sum += loss_nsp.item()
            total_loss_sum += loss.item()
            steps += 1

        # ---- END OF EPOCH METRICS ----
        if steps == 0:
            # Empty loader (or every batch skipped): nothing to average.
            print(f"\n⚠️ Epoch {ep}: no batches were trained on; skipping summary.")
            continue

        avg_mlm = mlm_loss_sum / steps
        avg_nsp = nsp_loss_sum / steps
        avg_total = total_loss_sum / steps
        acc = correct / total * 100

        print(f"\n📊 Epoch {ep} Summary")
        print(f"  MLM Loss: {avg_mlm:.4f}")
        print(f"  NSP Loss: {avg_nsp:.4f}")
        print(f"  Total Loss: {avg_total:.4f}")
        print(f"  NSP Accuracy: {acc:.2f}%")
        if skipped:
            print(f"  Skipped batches (non-finite loss): {skipped}")
        print("------------------------------")

        # Save every 10 epochs
        if (ep + 1) % 10 == 0:
            save_path = os.path.join(ckpt_dir, f"bert_email_epoch_{ep}.pth")
            torch.save(trainer.model.state_dict(), save_path)
            print(f"💾 Saved checkpoint: {save_path}")

    # Save final model
    final_path = "/content/drive/MyDrive/email_bert/bert_email_83M_final.pth"
    os.makedirs(os.path.dirname(final_path), exist_ok=True)
    torch.save(trainer.model.state_dict(), final_path)
    print(f"\n🎉 Final model saved at: {final_path}")



In [None]:
# Build the model and trainer here so this cell does not depend on a `trainer`
# object left over from an earlier cell or kernel session.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BERTSmall(vocab=len(tokenizer))
trainer = Trainer(model, train_loader, device=device)

train_with_checkpoints(trainer, epochs=100)


In [None]:
!ls -R /content/drive
!ls -R /content/gdrive


In [None]:
from google.colab import files

# Checkpoint file to fetch to the local machine through the browser.
checkpoint_path = '/content/drive/MyDrive/email_bert/checkpoints_small/bert_email_epoch_9.pth'
files.download(checkpoint_path)
# or
#files.download('/content/drive/MyDrive/email_bert/bert_email_83M.pth')


In [None]:
!ls -R /content/drive
!ls -R /content/gdrive


In [None]:
# Long listing, with human-readable sizes, of the project's datasets folder on Drive.
!ls -lh /content/drive/MyDrive/email_bert/datasets


In [None]:
# Long listing of the tokenizer folder on Drive.
!ls -l /content/drive/MyDrive/email_bert/tokenizer


In [None]:
# Show what is present in /content.
!ls /content


In [None]:
# Mount Google Drive at /content/drive and list the checkpoints folder.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): this lists checkpoints/, while train_with_checkpoints writes to
# checkpoints_small/ by default — confirm this is the folder meant to be inspected.
!ls -la /content/drive/MyDrive/email_bert/checkpoints/