<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/TRANSFORMER_REASONING_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets -q
!pip install transformers -q
!pip install torch -q

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm import tqdm

# --- Hyperparameters ---
# All values below are module-level constants read by the data pipeline,
# the model constructor and the training loop further down in this file.
BATCH_SIZE = 32  # Examples per batch for both the train and test DataLoader
LEARNING_RATE = 5e-5  # Base learning rate handed to AdamW (the scheduler scales it)
NUM_EPOCHS = 100  # Full passes over the training set; also used to derive total_steps
D_MODEL = 256  # Reduced for faster training on a smaller scale
NUM_HEADS = 8  # Attention heads; MultiHeadAttention asserts D_MODEL % NUM_HEADS == 0
NUM_LAYERS = 3  # Reduced for faster training
D_FF = 512  # Hidden width of the position-wise feed-forward sub-layer
DROPOUT = 0.1  # Dropout probability used in every encoder/decoder sub-layer
MAX_LEN = 128  # Maximum sequence length
WARMUP_STEPS = 1000  # Passed as num_warmup_steps to the linear warmup scheduler
GRADIENT_CLIPPING = 1.0  # Max gradient norm passed to clip_grad_norm_ each step

# --- Load the GSM8k Dataset ---
# Loads the "main" configuration; the code below reads the 'question' and
# 'answer' fields of each example.
gsm8k_dataset = load_dataset("gsm8k", "main")
train_dataset = gsm8k_dataset['train']
test_dataset = gsm8k_dataset['test']

# --- Vocabulary Creation ---
def build_vocabulary(examples):
    """Return the sorted list of unique lower-cased tokens found in *examples*.

    Each example must expose 'question' and 'answer' string fields. The two
    fields are joined with a space, lower-cased and split on whitespace;
    punctuation stays attached to the neighbouring word.
    """
    unique_tokens = {
        token
        for example in examples
        for token in (example['question'] + " " + example['answer']).lower().split()
    }
    return sorted(unique_tokens)

vocabulary = build_vocabulary(train_dataset)
vocab_size = len(vocabulary)  # number of real (non-special) tokens

# Add special tokens
# The special tokens own the first four ids and real words start after them.
# Previously PAD_INDEX (0) was also the id of the first vocabulary word, so
# that word was treated as padding: masked out of attention, ignored by the
# loss (ignore_index=PAD_INDEX) and impossible to decode via index_to_word.
PAD_TOKEN = "<pad>"
START_TOKEN = "<start>"
END_TOKEN = "<end>"
UNK_TOKEN = "<unk>"
PAD_INDEX = 0
START_INDEX = 1
END_INDEX = 2
UNK_INDEX = 3
_NUM_SPECIAL_TOKENS = 4

# Real words are numbered from _NUM_SPECIAL_TOKENS upward, in sorted order.
word_to_index = {
    word: i for i, word in enumerate(vocabulary, start=_NUM_SPECIAL_TOKENS)
}
word_to_index[PAD_TOKEN] = PAD_INDEX
word_to_index[START_TOKEN] = START_INDEX
word_to_index[END_TOKEN] = END_INDEX
word_to_index[UNK_TOKEN] = UNK_INDEX

# Inverse mapping, built once every id is final.
index_to_word = {i: word for word, i in word_to_index.items()}

# Size of the embedding / output layer: every id lies in [0, updated_vocab_size).
updated_vocab_size = vocab_size + _NUM_SPECIAL_TOKENS

# --- Data Processing Function ---
def process_example(example, max_len, word_to_index):
    """Convert one question/answer example into three 1-D id tensors.

    Args:
        example: mapping with 'question' and 'answer' strings.
        max_len: maximum sequence length used for truncation and padding.
        word_to_index: token -> id mapping; unknown tokens map to UNK_INDEX.

    Returns:
        (src, tgt_input, tgt_output) where
        * src is ``<start> question <end>``, truncated and right-padded with
          PAD_INDEX to exactly ``max_len`` ids;
        * tgt_input is ``<start> answer`` (decoder input);
        * tgt_output is ``answer <end>`` (decoder target).
        The answer is cut to ``max_len - 1`` ids. Both target tensors have the
        same length and are NOT padded to ``max_len``; they only receive a
        single trailing PAD when the framed answer is shorter than ``max_len``.
    """
    def encode(text):
        return [word_to_index.get(token, UNK_INDEX) for token in text.lower().split()]

    question_ids = encode(example['question'])
    answer_ids = encode(example['answer'])

    # Source side: frame, truncate, then pad up to max_len.
    src_ids = ([START_INDEX] + question_ids + [END_INDEX])[:max_len]
    src_ids = src_ids + [PAD_INDEX] * (max_len - len(src_ids))

    # Target side: one trailing PAD only if "<start> answer <end>" leaves room.
    framed_answer_len = min(len(answer_ids) + 2, max_len)
    trailing_pad = [PAD_INDEX] if framed_answer_len < max_len else []
    kept_answer = answer_ids[:max_len - 1]

    src_tensor = torch.tensor(src_ids)
    tgt_input_tensor = torch.tensor([START_INDEX] + kept_answer + trailing_pad)
    tgt_output_tensor = torch.tensor(kept_answer + [END_INDEX] + trailing_pad)
    return src_tensor, tgt_input_tensor, tgt_output_tensor

# --- Custom Dataset Class ---
class MathDataset(Dataset):
    """Map-style dataset that tensorises examples lazily on access.

    Wraps an indexable collection of question/answer examples and delegates
    the conversion of each item to ``process_example``.
    """

    def __init__(self, dataset, max_len, word_to_index):
        # Keep references only; nothing is tokenised up front.
        self.dataset, self.max_len, self.word_to_index = dataset, max_len, word_to_index

    def __len__(self):
        """Number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``process_example`` applied to the example at *idx*."""
        return process_example(self.dataset[idx], self.max_len, self.word_to_index)

# --- Create DataLoaders ---
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Stack a list of (src, tgt_in, tgt_out) triples into three batch tensors.

    Each of the three columns is right-padded with PAD_INDEX to the longest
    sequence in that column, giving batch-first tensors.
    """
    src_batch, tgt_in_batch, tgt_out_batch = (
        pad_sequence(column, batch_first=True, padding_value=PAD_INDEX)
        for column in zip(*batch)
    )
    return src_batch, tgt_in_batch, tgt_out_batch

# Training batches are reshuffled every epoch; test batches keep dataset order.
train_dataloader = DataLoader(
    MathDataset(train_dataset, MAX_LEN, word_to_index),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    MathDataset(test_dataset, MAX_LEN, word_to_index),
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

# --- Transformer Model Definition ---
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    ``d_model`` must be divisible by ``num_heads``; each head works on a
    ``d_k = d_model // num_heads`` wide slice of the projected inputs.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Input projections for queries, keys and values, then the output mix.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return (weighted values, attention probabilities).

        Positions where ``mask == 0`` get a score of -1e9 before the softmax,
        which drives their probability to zero.
        """
        scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        probs = F.softmax(scores, dim=-1)
        return probs @ V, probs

    def split_heads(self, x):
        """(batch, seq, d_model) -> (batch, heads, seq, d_k)."""
        batch, length, _ = x.size()
        return x.view(batch, length, self.num_heads, self.d_k).permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """(batch, heads, seq, d_k) -> (batch, seq, d_model)."""
        batch, _, length, _ = x.size()
        return x.permute(0, 2, 1, 3).contiguous().view(batch, length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project, attend per head, merge heads and apply the output projection."""
        q_heads = self.split_heads(self.W_q(Q))
        k_heads = self.split_heads(self.W_k(K))
        v_heads = self.split_heads(self.W_v(V))

        context, attn_probs = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)
        return self.W_o(self.combine_heads(context)), attn_probs

class PositionWiseFeedForward(nn.Module):
    """Two-layer MLP (Linear -> ReLU -> Linear) applied along the last dimension."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expand to the hidden width
        self.fc2 = nn.Linear(d_ff, d_model)  # project back to the model width
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; the last dimension stays ``d_model``."""
        hidden = self.fc1(x)
        hidden = self.relu(hidden)
        return self.fc2(hidden)

class EncoderLayer(nn.Module):
    """One post-norm encoder block: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = PositionWiseFeedForward(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the block on *x*; *mask* is forwarded to the self-attention."""
        attended = self.mha(x, x, x, mask)[0]  # attention weights are discarded
        x = self.norm1(x + self.dropout(attended))
        transformed = self.ffn(x)
        return self.norm2(x + self.dropout(transformed))

class DecoderLayer(nn.Module):
    """One post-norm decoder block.

    Sub-layers, in order: masked self-attention over the target, attention
    over the encoder output, position-wise feed-forward. Every sub-layer is
    followed by dropout, a residual addition and a LayerNorm.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.masked_mha = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.enc_dec_mha = MultiHeadAttention(d_model, num_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = PositionWiseFeedForward(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the block.

        ``tgt_mask`` restricts the self-attention; ``src_mask`` restricts the
        attention over ``enc_output``.
        """
        # 1) Self-attention over the target sequence.
        self_attended = self.masked_mha(x, x, x, tgt_mask)[0]
        x = self.norm1(x + self.dropout(self_attended))

        # 2) Queries from the decoder, keys/values from the encoder.
        cross_attended = self.enc_dec_mha(x, enc_output, enc_output, src_mask)[0]
        x = self.norm2(x + self.dropout(cross_attended))

        # 3) Position-wise feed-forward.
        transformed = self.ffn(x)
        return self.norm3(x + self.dropout(transformed))

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to batch-first inputs.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: feature width of the inputs; may be even or odd.
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Stored as (max_len, 1, d_model); a buffer follows the module across
        # devices and is saved in the state dict but is not trained.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to *x*.

        *x* is indexed as (batch, seq_len, d_model); the table slice is
        transposed to (1, seq_len, d_model) so it broadcasts over the batch.
        """
        return x + self.pe[:x.size(1), :].transpose(0, 1)

class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of EncoderLayers."""

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList(
            EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, mask):
        """Encode the token ids in *src*; *mask* is passed to every layer."""
        hidden = self.embedding(src)
        hidden = self.pos_encoding(hidden)
        hidden = self.dropout(hidden)
        for block in self.layers:
            hidden = block(hidden, mask)
        return hidden

class Decoder(nn.Module):
    """Embedding + positional encoding, a stack of DecoderLayers, then a
    linear projection onto the vocabulary."""

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList(
            DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        )
        self.fc = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, enc_output, src_mask, tgt_mask):
        """Return unnormalised vocabulary scores for every target position."""
        hidden = self.embedding(tgt)
        hidden = self.pos_encoding(hidden)
        hidden = self.dropout(hidden)
        for block in self.layers:
            hidden = block(hidden, enc_output, src_mask, tgt_mask)
        return self.fc(hidden)

class Transformer(nn.Module):
    """Encoder-decoder model that builds its own padding and causal masks.

    Padding is detected by comparing token ids with the module-level
    PAD_INDEX.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_len):
        super().__init__()
        shared_config = (d_model, num_layers, num_heads, d_ff, dropout, max_len)
        self.encoder = Encoder(src_vocab_size, *shared_config)
        self.decoder = Decoder(tgt_vocab_size, *shared_config)

    def make_src_mask(self, src):
        """Boolean mask, True at non-PAD source positions, with two singleton
        dimensions inserted after the batch dimension."""
        is_token = src != PAD_INDEX
        return is_token[:, None, None]

    def make_tgt_mask(self, tgt):
        """Combine a lower-triangular (no look-ahead) mask with the target
        padding mask; non-zero entries mark positions that may be attended."""
        length = tgt.size(1)
        # Built on tgt's device so the two masks can be combined directly.
        causal = torch.ones((1, length, length), device=tgt.device).tril().to(torch.uint8)
        is_token = (tgt != PAD_INDEX)[:, None, None]
        return causal & is_token.bool()

    def forward(self, src, tgt):
        """Encode *src*, decode *tgt* against it and return the decoder output."""
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        memory = self.encoder(src, src_mask)
        return self.decoder(tgt, memory, src_mask, tgt_mask)

# --- Initialize Model, Optimizer, and Scheduler ---
model = Transformer(
    updated_vocab_size,  # source vocabulary size
    updated_vocab_size,  # target vocabulary size (same vocabulary on both sides)
    D_MODEL,
    NUM_LAYERS,
    NUM_HEADS,
    D_FF,
    DROPOUT,
    MAX_LEN,
)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps,
)
# Targets equal to PAD_INDEX do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)

# --- Training Loop ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Run summary, printed as "label: value" lines in this exact order.
run_summary = [
    ("Device", device),
    ("Optimizer", optimizer),
    ("Scheduler", scheduler),
    ("Criterion", criterion),
    ("Total Steps", total_steps),
    ("Warmup Steps", WARMUP_STEPS),
    ("Learning Rate", LEARNING_RATE),
    ("Batch Size", BATCH_SIZE),
    ("Number of Epochs", NUM_EPOCHS),
    ("Max Length", MAX_LEN),
    ("Dropout", DROPOUT),
    ("D_Model", D_MODEL),
    ("D_FF", D_FF),
    ("Number of Layers", NUM_LAYERS),
    ("Number of Heads", NUM_HEADS),
    ("Gradient Clipping", GRADIENT_CLIPPING),
    ("Warmup Steps", WARMUP_STEPS),
    ("Total Training Steps", total_steps),
    ("Vocabulary Size", vocab_size),
    ("Updated Vocabulary Size", updated_vocab_size),
    ("Size of train_dataloader", len(train_dataloader)),
    ("Size of test_dataloader", len(test_dataloader)),
]

print('\n')
print("Training started...")
for label, value in run_summary:
    print(f"{label}: {value}")
print('\n')






# Each epoch: one optimisation pass over train_dataloader, then a
# gradient-free loss measurement over test_dataloader.
for epoch in range(NUM_EPOCHS):
    model.train()  # enable dropout
    total_loss = 0
    progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch+1}")
    for batch_idx, (src, tgt_in, tgt_out) in progress_bar:
        src = src.to(device)
        tgt_in = tgt_in.to(device)
        tgt_out = tgt_out.to(device)

        optimizer.zero_grad()
        # Teacher forcing: the decoder is fed the reference answer (tgt_in).
        output = model(src, tgt_in)  # (batch_size, tgt_len, vocab_size)

        # Reshape for loss calculation
        output = output.view(-1, output.size(-1)) # (batch_size * tgt_len, vocab_size)
        tgt_out = tgt_out.view(-1) # (batch_size * tgt_len)

        # criterion was built with ignore_index=PAD_INDEX, so padded target
        # positions do not contribute to the loss.
        loss = criterion(output, tgt_out)
        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIPPING)
        optimizer.step()
        # The learning-rate schedule advances once per batch (total_steps is
        # batches-per-epoch * NUM_EPOCHS).
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"loss": loss.item()})

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} Training Loss: {avg_loss:.4f}")

    # --- Evaluation Loop ---
    model.eval()  # disable dropout
    eval_loss = 0

    # Add tqdm to the evaluation loop
    eval_progress_bar = tqdm(enumerate(test_dataloader), total=len(test_dataloader), desc=f"Epoch {epoch+1} Evaluation")

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch_idx, (src, tgt_in, tgt_out) in eval_progress_bar:
            src = src.to(device)
            tgt_in = tgt_in.to(device)
            tgt_out = tgt_out.to(device)

            # Same teacher-forced forward pass and loss as in training.
            output = model(src, tgt_in)
            output = output.view(-1, output.size(-1))
            tgt_out = tgt_out.view(-1)
            loss = criterion(output, tgt_out)
            eval_loss += loss.item()

    # Mean of the per-batch losses on the test split.
    avg_eval_loss = eval_loss / len(test_dataloader)
    print(f"Epoch {epoch+1} Evaluation Loss: {avg_eval_loss:.4f}")

# --- Inference Function (Basic) ---
def translate_sentence(model, sentence, word_to_index, index_to_word, max_len, device):
    """Greedily decode an answer for *sentence* and return it as a string.

    Args:
        model: object exposing ``make_src_mask``, ``make_tgt_mask``,
            ``encoder`` and ``decoder`` (the Transformer defined in this file).
        sentence: raw input text; it is lower-cased and split on whitespace.
        word_to_index: token -> id mapping; unknown tokens map to UNK_INDEX.
        index_to_word: id -> token mapping used to render the prediction.
        max_len: source length after truncation/padding; decoding emits at
            most ``max_len - 1`` tokens.
        device: device on which the input tensors are created.

    Returns:
        The predicted tokens joined by single spaces, with start, end and
        padding tokens removed.

    Side effects:
        Puts *model* in eval mode and leaves it there.
    """
    model.eval()
    tokens = [word_to_index.get(word, UNK_INDEX) for word in sentence.lower().split()]
    src_tokens = ([START_INDEX] + tokens + [END_INDEX])[:max_len]
    src_padding = [PAD_INDEX] * (max_len - len(src_tokens))
    src_tensor = torch.tensor(src_tokens + src_padding).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    tgt_tokens = [START_INDEX]
    # Inference only: without no_grad every decoding step would build and
    # keep an autograd graph, wasting memory and time.
    with torch.no_grad():
        memory = model.encoder(src_tensor, src_mask)
        for _ in range(max_len - 1):
            tgt_tensor = torch.tensor(tgt_tokens).unsqueeze(0).to(device)
            tgt_mask = model.make_tgt_mask(tgt_tensor)
            output = model.decoder(tgt_tensor, memory, src_mask, tgt_mask)
            # Greedy choice: highest-scoring token at the last position.
            pred_token = output.argmax(2)[:, -1].item()
            if pred_token == END_INDEX:
                break
            tgt_tokens.append(pred_token)

    special_indices = (START_INDEX, END_INDEX, PAD_INDEX)
    translated_words = [index_to_word[token] for token in tgt_tokens if token not in special_indices]
    return " ".join(translated_words)



Training started...
Device: cuda
Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 0.0
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x787249169090>
Criterion: CrossEntropyLoss()
Total Steps: 23400
Warmup Steps: 1000
Learning Rate: 5e-05
Batch Size: 32
Number of Epochs: 100
Max Length: 128
Dropout: 0.1
D_Model: 256
D_FF: 512
Number of Layers: 3
Number of Heads: 8
Gradient Clipping: 1.0
Warmup Steps: 1000
Total Training Steps: 23400
Vocabulary Size: 49237
Updated Vocabulary Size: 49241
Size of train_dataloader: 234
Size of test_dataloader: 42




Epoch 1: 100%|██████████| 234/234 [00:22<00:00, 10.46it/s, loss=9.86]


Epoch 1 Training Loss: 10.5165


Epoch 1 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.56it/s]


Epoch 1 Evaluation Loss: 9.8905


Epoch 2: 100%|██████████| 234/234 [00:22<00:00, 10.45it/s, loss=8.72]


Epoch 2 Training Loss: 9.3182


Epoch 2 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.68it/s]


Epoch 2 Evaluation Loss: 8.7918


Epoch 3: 100%|██████████| 234/234 [00:22<00:00, 10.46it/s, loss=7.53]


Epoch 3 Training Loss: 8.0671


Epoch 3 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.56it/s]


Epoch 3 Evaluation Loss: 7.5804


Epoch 4: 100%|██████████| 234/234 [00:22<00:00, 10.37it/s, loss=6.92]


Epoch 4 Training Loss: 7.1488


Epoch 4 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.72it/s]


Epoch 4 Evaluation Loss: 7.0628


Epoch 5: 100%|██████████| 234/234 [00:22<00:00, 10.31it/s, loss=6.49]


Epoch 5 Training Loss: 6.6946


Epoch 5 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.55it/s]


Epoch 5 Evaluation Loss: 6.6831


Epoch 6: 100%|██████████| 234/234 [00:22<00:00, 10.32it/s, loss=6.28]


Epoch 6 Training Loss: 6.3713


Epoch 6 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.46it/s]


Epoch 6 Evaluation Loss: 6.4499


Epoch 7: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=6.1]


Epoch 7 Training Loss: 6.1448


Epoch 7 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.47it/s]


Epoch 7 Evaluation Loss: 6.2883


Epoch 8: 100%|██████████| 234/234 [00:22<00:00, 10.29it/s, loss=5.59]


Epoch 8 Training Loss: 5.9646


Epoch 8 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.23it/s]


Epoch 8 Evaluation Loss: 6.1542


Epoch 9: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=5.49]


Epoch 9 Training Loss: 5.8120


Epoch 9 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.24it/s]


Epoch 9 Evaluation Loss: 6.0496


Epoch 10: 100%|██████████| 234/234 [00:22<00:00, 10.27it/s, loss=5.79]


Epoch 10 Training Loss: 5.6814


Epoch 10 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.36it/s]


Epoch 10 Evaluation Loss: 5.9586


Epoch 11: 100%|██████████| 234/234 [00:22<00:00, 10.28it/s, loss=5.74]


Epoch 11 Training Loss: 5.5643


Epoch 11 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.41it/s]


Epoch 11 Evaluation Loss: 5.8836


Epoch 12: 100%|██████████| 234/234 [00:22<00:00, 10.27it/s, loss=5.56]


Epoch 12 Training Loss: 5.4578


Epoch 12 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.15it/s]


Epoch 12 Evaluation Loss: 5.8192


Epoch 13: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=5.17]


Epoch 13 Training Loss: 5.3618


Epoch 13 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.84it/s]


Epoch 13 Evaluation Loss: 5.7453


Epoch 14: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=5.28]


Epoch 14 Training Loss: 5.2741


Epoch 14 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.14it/s]


Epoch 14 Evaluation Loss: 5.6863


Epoch 15: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=5.04]


Epoch 15 Training Loss: 5.1900


Epoch 15 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.96it/s]


Epoch 15 Evaluation Loss: 5.6405


Epoch 16: 100%|██████████| 234/234 [00:22<00:00, 10.25it/s, loss=5.08]


Epoch 16 Training Loss: 5.1143


Epoch 16 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.16it/s]


Epoch 16 Evaluation Loss: 5.5923


Epoch 17: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=5.14]


Epoch 17 Training Loss: 5.0387


Epoch 17 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.99it/s]


Epoch 17 Evaluation Loss: 5.5604


Epoch 18: 100%|██████████| 234/234 [00:22<00:00, 10.28it/s, loss=5.01]


Epoch 18 Training Loss: 4.9713


Epoch 18 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.19it/s]


Epoch 18 Evaluation Loss: 5.5289


Epoch 19: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=5]


Epoch 19 Training Loss: 4.9050


Epoch 19 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.17it/s]


Epoch 19 Evaluation Loss: 5.5015


Epoch 20: 100%|██████████| 234/234 [00:22<00:00, 10.28it/s, loss=4.69]


Epoch 20 Training Loss: 4.8415


Epoch 20 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.11it/s]


Epoch 20 Evaluation Loss: 5.4685


Epoch 21: 100%|██████████| 234/234 [00:22<00:00, 10.29it/s, loss=5.08]


Epoch 21 Training Loss: 4.7842


Epoch 21 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.04it/s]


Epoch 21 Evaluation Loss: 5.4349


Epoch 22: 100%|██████████| 234/234 [00:22<00:00, 10.21it/s, loss=4.53]


Epoch 22 Training Loss: 4.7267


Epoch 22 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.09it/s]


Epoch 22 Evaluation Loss: 5.4134


Epoch 23: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=4.44]


Epoch 23 Training Loss: 4.6714


Epoch 23 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.16it/s]


Epoch 23 Evaluation Loss: 5.3899


Epoch 24: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=4.79]


Epoch 24 Training Loss: 4.6201


Epoch 24 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.24it/s]


Epoch 24 Evaluation Loss: 5.3682


Epoch 25: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=4.6]


Epoch 25 Training Loss: 4.5694


Epoch 25 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.17it/s]


Epoch 25 Evaluation Loss: 5.3540


Epoch 26: 100%|██████████| 234/234 [00:22<00:00, 10.31it/s, loss=4.21]


Epoch 26 Training Loss: 4.5209


Epoch 26 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.19it/s]


Epoch 26 Evaluation Loss: 5.3474


Epoch 27: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=4.64]


Epoch 27 Training Loss: 4.4730


Epoch 27 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.13it/s]


Epoch 27 Evaluation Loss: 5.3104


Epoch 28: 100%|██████████| 234/234 [00:22<00:00, 10.28it/s, loss=4.3]


Epoch 28 Training Loss: 4.4271


Epoch 28 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.16it/s]


Epoch 28 Evaluation Loss: 5.3001


Epoch 29: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=4.28]


Epoch 29 Training Loss: 4.3830


Epoch 29 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.31it/s]


Epoch 29 Evaluation Loss: 5.2878


Epoch 30: 100%|██████████| 234/234 [00:22<00:00, 10.28it/s, loss=4.4]


Epoch 30 Training Loss: 4.3393


Epoch 30 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.06it/s]


Epoch 30 Evaluation Loss: 5.2893


Epoch 31: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=4.66]


Epoch 31 Training Loss: 4.2997


Epoch 31 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.95it/s]


Epoch 31 Evaluation Loss: 5.2766


Epoch 32: 100%|██████████| 234/234 [00:22<00:00, 10.18it/s, loss=4.61]


Epoch 32 Training Loss: 4.2566


Epoch 32 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.18it/s]


Epoch 32 Evaluation Loss: 5.2551


Epoch 33: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=4.24]


Epoch 33 Training Loss: 4.2160


Epoch 33 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.06it/s]


Epoch 33 Evaluation Loss: 5.2424


Epoch 34: 100%|██████████| 234/234 [00:22<00:00, 10.25it/s, loss=4.57]


Epoch 34 Training Loss: 4.1783


Epoch 34 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.16it/s]


Epoch 34 Evaluation Loss: 5.2457


Epoch 35: 100%|██████████| 234/234 [00:22<00:00, 10.29it/s, loss=4.24]


Epoch 35 Training Loss: 4.1406


Epoch 35 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.01it/s]


Epoch 35 Evaluation Loss: 5.2286


Epoch 36: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=4.34]


Epoch 36 Training Loss: 4.1045


Epoch 36 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.10it/s]


Epoch 36 Evaluation Loss: 5.2312


Epoch 37: 100%|██████████| 234/234 [00:22<00:00, 10.25it/s, loss=3.85]


Epoch 37 Training Loss: 4.0666


Epoch 37 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.98it/s]


Epoch 37 Evaluation Loss: 5.2328


Epoch 38: 100%|██████████| 234/234 [00:22<00:00, 10.21it/s, loss=3.89]


Epoch 38 Training Loss: 4.0316


Epoch 38 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.96it/s]


Epoch 38 Evaluation Loss: 5.2122


Epoch 39: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=4.09]


Epoch 39 Training Loss: 3.9988


Epoch 39 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.94it/s]


Epoch 39 Evaluation Loss: 5.2031


Epoch 40: 100%|██████████| 234/234 [00:22<00:00, 10.25it/s, loss=3.98]


Epoch 40 Training Loss: 3.9639


Epoch 40 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.17it/s]


Epoch 40 Evaluation Loss: 5.2098


Epoch 41: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=3.95]


Epoch 41 Training Loss: 3.9315


Epoch 41 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.13it/s]


Epoch 41 Evaluation Loss: 5.1949


Epoch 42: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=3.67]


Epoch 42 Training Loss: 3.9016


Epoch 42 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.97it/s]


Epoch 42 Evaluation Loss: 5.1944


Epoch 43: 100%|██████████| 234/234 [00:22<00:00, 10.21it/s, loss=3.76]


Epoch 43 Training Loss: 3.8675


Epoch 43 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.14it/s]


Epoch 43 Evaluation Loss: 5.1988


Epoch 44: 100%|██████████| 234/234 [00:22<00:00, 10.25it/s, loss=3.55]


Epoch 44 Training Loss: 3.8385


Epoch 44 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.93it/s]


Epoch 44 Evaluation Loss: 5.1940


Epoch 45: 100%|██████████| 234/234 [00:22<00:00, 10.26it/s, loss=4.02]


Epoch 45 Training Loss: 3.8095


Epoch 45 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.94it/s]


Epoch 45 Evaluation Loss: 5.1704


Epoch 46: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=3.98]


Epoch 46 Training Loss: 3.7822


Epoch 46 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.14it/s]


Epoch 46 Evaluation Loss: 5.1886


Epoch 47: 100%|██████████| 234/234 [00:22<00:00, 10.25it/s, loss=3.75]


Epoch 47 Training Loss: 3.7513


Epoch 47 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.11it/s]


Epoch 47 Evaluation Loss: 5.1775


Epoch 48: 100%|██████████| 234/234 [00:22<00:00, 10.25it/s, loss=3.74]


Epoch 48 Training Loss: 3.7238


Epoch 48 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.98it/s]


Epoch 48 Evaluation Loss: 5.1910


Epoch 49: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=3.62]


Epoch 49 Training Loss: 3.6953


Epoch 49 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.92it/s]


Epoch 49 Evaluation Loss: 5.1727


Epoch 50: 100%|██████████| 234/234 [00:22<00:00, 10.26it/s, loss=3.51]


Epoch 50 Training Loss: 3.6698


Epoch 50 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.97it/s]


Epoch 50 Evaluation Loss: 5.1836


Epoch 51: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=3.64]


Epoch 51 Training Loss: 3.6447


Epoch 51 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.22it/s]


Epoch 51 Evaluation Loss: 5.1910


Epoch 52: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=3.41]


Epoch 52 Training Loss: 3.6176


Epoch 52 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.94it/s]


Epoch 52 Evaluation Loss: 5.1925


Epoch 53: 100%|██████████| 234/234 [00:22<00:00, 10.19it/s, loss=3.47]


Epoch 53 Training Loss: 3.5931


Epoch 53 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.04it/s]


Epoch 53 Evaluation Loss: 5.1688


Epoch 54: 100%|██████████| 234/234 [00:22<00:00, 10.18it/s, loss=3.66]


Epoch 54 Training Loss: 3.5690


Epoch 54 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.12it/s]


Epoch 54 Evaluation Loss: 5.1737


Epoch 55: 100%|██████████| 234/234 [00:23<00:00, 10.15it/s, loss=3.46]


Epoch 55 Training Loss: 3.5471


Epoch 55 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.91it/s]


Epoch 55 Evaluation Loss: 5.1727


Epoch 56: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=3.62]


Epoch 56 Training Loss: 3.5243


Epoch 56 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.08it/s]


Epoch 56 Evaluation Loss: 5.1852


Epoch 57: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=3.82]


Epoch 57 Training Loss: 3.5015


Epoch 57 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.78it/s]


Epoch 57 Evaluation Loss: 5.1631


Epoch 58: 100%|██████████| 234/234 [00:22<00:00, 10.27it/s, loss=3.53]


Epoch 58 Training Loss: 3.4805


Epoch 58 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.91it/s]


Epoch 58 Evaluation Loss: 5.1656


Epoch 59: 100%|██████████| 234/234 [00:22<00:00, 10.21it/s, loss=3.39]


Epoch 59 Training Loss: 3.4607


Epoch 59 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.99it/s]


Epoch 59 Evaluation Loss: 5.1674


Epoch 60: 100%|██████████| 234/234 [00:22<00:00, 10.21it/s, loss=3.63]


Epoch 60 Training Loss: 3.4390


Epoch 60 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.13it/s]


Epoch 60 Evaluation Loss: 5.1593


Epoch 61: 100%|██████████| 234/234 [00:22<00:00, 10.27it/s, loss=3.42]


Epoch 61 Training Loss: 3.4186


Epoch 61 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.12it/s]


Epoch 61 Evaluation Loss: 5.1733


Epoch 62: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=3.42]


Epoch 62 Training Loss: 3.4004


Epoch 62 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.10it/s]


Epoch 62 Evaluation Loss: 5.1634


Epoch 63: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=3.18]


Epoch 63 Training Loss: 3.3793


Epoch 63 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.04it/s]


Epoch 63 Evaluation Loss: 5.1712


Epoch 64: 100%|██████████| 234/234 [00:22<00:00, 10.27it/s, loss=3.52]


Epoch 64 Training Loss: 3.3627


Epoch 64 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.14it/s]


Epoch 64 Evaluation Loss: 5.1577


Epoch 65: 100%|██████████| 234/234 [00:22<00:00, 10.26it/s, loss=3.43]


Epoch 65 Training Loss: 3.3447


Epoch 65 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.08it/s]


Epoch 65 Evaluation Loss: 5.1722


Epoch 66: 100%|██████████| 234/234 [00:22<00:00, 10.28it/s, loss=3.33]


Epoch 66 Training Loss: 3.3298


Epoch 66 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.05it/s]


Epoch 66 Evaluation Loss: 5.1710


Epoch 67: 100%|██████████| 234/234 [00:22<00:00, 10.20it/s, loss=3.42]


Epoch 67 Training Loss: 3.3092


Epoch 67 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.98it/s]


Epoch 67 Evaluation Loss: 5.1661


Epoch 68: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=3.24]


Epoch 68 Training Loss: 3.2936


Epoch 68 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.98it/s]


Epoch 68 Evaluation Loss: 5.1587


Epoch 69: 100%|██████████| 234/234 [00:23<00:00, 10.15it/s, loss=3.35]


Epoch 69 Training Loss: 3.2802


Epoch 69 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.10it/s]


Epoch 69 Evaluation Loss: 5.1718


Epoch 70: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=3.23]


Epoch 70 Training Loss: 3.2632


Epoch 70 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.02it/s]


Epoch 70 Evaluation Loss: 5.1686


Epoch 71: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=3.46]


Epoch 71 Training Loss: 3.2482


Epoch 71 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.11it/s]


Epoch 71 Evaluation Loss: 5.1655


Epoch 72: 100%|██████████| 234/234 [00:22<00:00, 10.19it/s, loss=3.38]


Epoch 72 Training Loss: 3.2328


Epoch 72 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.07it/s]


Epoch 72 Evaluation Loss: 5.1690


Epoch 73: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=3.1]


Epoch 73 Training Loss: 3.2189


Epoch 73 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.83it/s]


Epoch 73 Evaluation Loss: 5.1671


Epoch 74: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=3.45]


Epoch 74 Training Loss: 3.2068


Epoch 74 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.98it/s]


Epoch 74 Evaluation Loss: 5.1623


Epoch 75: 100%|██████████| 234/234 [00:22<00:00, 10.19it/s, loss=3.14]


Epoch 75 Training Loss: 3.1942


Epoch 75 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.78it/s]


Epoch 75 Evaluation Loss: 5.1664


Epoch 76: 100%|██████████| 234/234 [00:22<00:00, 10.18it/s, loss=3.17]


Epoch 76 Training Loss: 3.1809


Epoch 76 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.07it/s]


Epoch 76 Evaluation Loss: 5.1693


Epoch 77: 100%|██████████| 234/234 [00:22<00:00, 10.21it/s, loss=3.46]


Epoch 77 Training Loss: 3.1691


Epoch 77 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.06it/s]


Epoch 77 Evaluation Loss: 5.1713


Epoch 78: 100%|██████████| 234/234 [00:22<00:00, 10.21it/s, loss=3.31]


Epoch 78 Training Loss: 3.1577


Epoch 78 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.06it/s]


Epoch 78 Evaluation Loss: 5.1663


Epoch 79: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=3.19]


Epoch 79 Training Loss: 3.1465


Epoch 79 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.05it/s]


Epoch 79 Evaluation Loss: 5.1620


Epoch 80: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=2.98]


Epoch 80 Training Loss: 3.1365


Epoch 80 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.93it/s]


Epoch 80 Evaluation Loss: 5.1761


Epoch 81: 100%|██████████| 234/234 [00:22<00:00, 10.21it/s, loss=3.05]


Epoch 81 Training Loss: 3.1253


Epoch 81 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.15it/s]


Epoch 81 Evaluation Loss: 5.1677


Epoch 82: 100%|██████████| 234/234 [00:22<00:00, 10.23it/s, loss=3.33]


Epoch 82 Training Loss: 3.1171


Epoch 82 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.02it/s]


Epoch 82 Evaluation Loss: 5.1565


Epoch 83: 100%|██████████| 234/234 [00:22<00:00, 10.26it/s, loss=3.13]


Epoch 83 Training Loss: 3.1081


Epoch 83 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.95it/s]


Epoch 83 Evaluation Loss: 5.1695


Epoch 84: 100%|██████████| 234/234 [00:23<00:00, 10.16it/s, loss=3.03]


Epoch 84 Training Loss: 3.0994


Epoch 84 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.98it/s]


Epoch 84 Evaluation Loss: 5.1633


Epoch 85: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=2.91]


Epoch 85 Training Loss: 3.0908


Epoch 85 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.20it/s]


Epoch 85 Evaluation Loss: 5.1713


Epoch 86: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=3.12]


Epoch 86 Training Loss: 3.0833


Epoch 86 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.04it/s]


Epoch 86 Evaluation Loss: 5.1683


Epoch 87: 100%|██████████| 234/234 [00:22<00:00, 10.24it/s, loss=3.08]


Epoch 87 Training Loss: 3.0782


Epoch 87 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.99it/s]


Epoch 87 Evaluation Loss: 5.1661


Epoch 88: 100%|██████████| 234/234 [00:22<00:00, 10.25it/s, loss=2.92]


Epoch 88 Training Loss: 3.0715


Epoch 88 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.26it/s]


Epoch 88 Evaluation Loss: 5.1697


Epoch 89: 100%|██████████| 234/234 [00:22<00:00, 10.19it/s, loss=3.02]


Epoch 89 Training Loss: 3.0633


Epoch 89 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.11it/s]


Epoch 89 Evaluation Loss: 5.1616


Epoch 90: 100%|██████████| 234/234 [00:22<00:00, 10.27it/s, loss=2.99]


Epoch 90 Training Loss: 3.0584


Epoch 90 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.99it/s]


Epoch 90 Evaluation Loss: 5.1655


Epoch 91: 100%|██████████| 234/234 [00:22<00:00, 10.20it/s, loss=3.2]


Epoch 91 Training Loss: 3.0539


Epoch 91 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.02it/s]


Epoch 91 Evaluation Loss: 5.1696


Epoch 92: 100%|██████████| 234/234 [00:22<00:00, 10.19it/s, loss=2.85]


Epoch 92 Training Loss: 3.0492


Epoch 92 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.94it/s]


Epoch 92 Evaluation Loss: 5.1702


Epoch 93: 100%|██████████| 234/234 [00:23<00:00, 10.16it/s, loss=3.15]


Epoch 93 Training Loss: 3.0434


Epoch 93 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.78it/s]


Epoch 93 Evaluation Loss: 5.1682


Epoch 94: 100%|██████████| 234/234 [00:23<00:00, 10.17it/s, loss=2.7]


Epoch 94 Training Loss: 3.0395


Epoch 94 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.17it/s]


Epoch 94 Evaluation Loss: 5.1659


Epoch 95: 100%|██████████| 234/234 [00:23<00:00, 10.17it/s, loss=2.85]


Epoch 95 Training Loss: 3.0383


Epoch 95 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.00it/s]


Epoch 95 Evaluation Loss: 5.1678


Epoch 96: 100%|██████████| 234/234 [00:23<00:00, 10.16it/s, loss=3.07]


Epoch 96 Training Loss: 3.0343


Epoch 96 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.13it/s]


Epoch 96 Evaluation Loss: 5.1677


Epoch 97: 100%|██████████| 234/234 [00:23<00:00, 10.16it/s, loss=2.91]


Epoch 97 Training Loss: 3.0336


Epoch 97 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.15it/s]


Epoch 97 Evaluation Loss: 5.1663


Epoch 98: 100%|██████████| 234/234 [00:22<00:00, 10.27it/s, loss=3.07]


Epoch 98 Training Loss: 3.0275


Epoch 98 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.07it/s]


Epoch 98 Evaluation Loss: 5.1686


Epoch 99: 100%|██████████| 234/234 [00:22<00:00, 10.20it/s, loss=2.76]


Epoch 99 Training Loss: 3.0283


Epoch 99 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 32.12it/s]


Epoch 99 Evaluation Loss: 5.1674


Epoch 100: 100%|██████████| 234/234 [00:22<00:00, 10.22it/s, loss=2.84]


Epoch 100 Training Loss: 3.0294


Epoch 100 Evaluation: 100%|██████████| 42/42 [00:01<00:00, 31.92it/s]

Epoch 100 Evaluation Loss: 5.1663





In [7]:
# --- Example Inference ---
# Runs a single GSM8K test question through the trained model and prints the
# question, the reference answer, and the model's generated answer.
if __name__ == '__main__':
    # Index of the test-split example to show; change this to inspect another row.
    SAMPLE_INDEX = 10

    # Look the row up once and read both fields from it, instead of indexing
    # the dataset separately for the question and for the answer.
    sample = test_dataset[SAMPLE_INDEX]
    sample_question = sample['question']
    actual_answer = sample['answer']

    # translate_sentence is defined earlier in the notebook; its return value
    # is printed below as the predicted answer.
    translated_answer = translate_sentence(
        model, sample_question, word_to_index, index_to_word, MAX_LEN, device
    )

    print("\n--- Example Inference ---")
    print(f"Question: {sample_question}")
    print(f"Actual Answer: {actual_answer}")
    print(f"Predicted Answer: {translated_answer}")

    # Note: expect a poor prediction. In the training log of this notebook the
    # evaluation loss stays flat (about 5.17) over the final epochs while the
    # training loss keeps decreasing, so additional epochs with the same
    # setup are not improving held-out performance.


--- Example Inference ---
Question: A new program had 60 downloads in the first month. The number of downloads in the second month was three times as many as the downloads in the first month, but then reduced by 30% in the third month. How many downloads did the program have total over the three months?
Actual Answer: The number of downloads of the program in the second month increased to 3*60 = <<3*60=180>>180
In the first two months, the total number of downloads of the program was 180+60 = <<180+60=240>>240
In the third month, the number of downloads of the program reduced by 30/100*180 = <<30/100*180=54>>54
There were 180-54 = <<180-54=126>>126 downloads in the third month.
In the three months, the total number of downloads of the program was 126+240 = <<126+240=366>>366
#### 366
Predicted Answer: in the first year, the second year, the second year, was three times the second year, the second year, the second year, the second year, the second year, was reduced = <<2*500=1000>>1000