In [None]:
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, AutoTokenizer
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from glob import glob
from tqdm import tqdm
from collections import Counter
import re
import math
import matplotlib.pyplot as plt
import decimal
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import GradScaler, autocast
import matplotlib.pyplot as plt
from transformers import get_cosine_schedule_with_warmup

In [None]:
# Select the compute device: the CUDA GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# This function parses VTT files and extracts the start time, end time, and text.
def parse_vtt(vtt_path, video_id):
    """Parse a WebVTT subtitle file into a list of cue dictionaries.

    The file is split into blocks on blank lines. A block is treated as a cue
    when one of its first two lines contains "-->" (the first line may be an
    optional cue identifier) and at least one text line follows the timing line.
    Everything else (the WEBVTT header, NOTE/STYLE blocks, cues without text)
    is skipped.

    Args:
        vtt_path: Path of the .vtt file to read (decoded as UTF-8).
        video_id: Identifier stored in every returned entry so cues from
            different videos with overlapping times can be told apart.

    Returns:
        A list of dicts with the keys "start" and "end" (seconds, float),
        "text" (cue lines joined with single spaces) and "video_id".

    Raises:
        ValueError: If a timing line has no start or end timestamp, or a
            timestamp is not of the form [hh:]mm:ss[.fraction].
    """
    # Function to convert time string to seconds
    def time_str_to_seconds(time_str):
        fields = time_str.strip().split(":")
        # The hour field may be omitted ("mm:ss.ttt").
        if len(fields) == 2:
            fields.insert(0, "0")
        h, m, s = fields
        whole, _, frac = s.partition(".")
        # Scale by the number of fraction digits instead of assuming exactly three.
        fraction = int(frac) / 10 ** len(frac) if frac else 0.0
        return int(h) * 3600 + int(m) * 60 + int(whole) + fraction

    # Read the VTT file
    with open(vtt_path, "r", encoding="utf-8") as f:
        vtt_text = f.read()

    # Create a list to store entries
    entries = []

    # Blocks are separated by one or more blank lines
    for block in re.split(r'\n\n+', vtt_text.strip()):
        lines = block.strip().splitlines()

        # The timing line is either the first line or, when the cue carries an
        # identifier, the second one.
        timing_idx = next((i for i, line in enumerate(lines[:2]) if "-->" in line), None)
        if timing_idx is None:
            continue

        # Cues without any text line are ignored
        text_lines = lines[timing_idx + 1:]
        if not text_lines:
            continue

        start, _, remainder = lines[timing_idx].partition("-->")
        # Anything after the end timestamp is cue settings (e.g. "align:start"); drop it.
        end_fields = remainder.split()
        if not start.strip() or not end_fields:
            raise ValueError(f"Malformed cue timing line in {vtt_path}: {lines[timing_idx]!r}")

        entries.append({
            "start": time_str_to_seconds(start),
            "end": time_str_to_seconds(end_fields[0]),
            "text": " ".join(text_lines).strip(),
            "video_id": video_id
        })

    # Return the list of entries
    return entries

In [None]:
# This function extracts the video ID from the file path.
def extract_video_id(path):
    """Return the file name of ``path`` cut off at the first "_keypoints".

    When the file name does not contain "_keypoints" the whole file name is
    returned unchanged.
    """
    file_name = os.path.basename(path)
    video_id, _, _ = file_name.partition("_keypoints")
    return video_id

In [None]:
# Paths to the training keypoints and subtitles; collect the matching files
keypoints_path = "<keypoints_path_here>"
keypoints_files = glob(os.path.join(keypoints_path, "*_keypoints.pth"))
subtitles_path = "<subtitles_path_here>"
subtitles_files = glob(os.path.join(subtitles_path, "*.vtt"))

# Report how many keypoint files were found. (Globbing the directory path
# itself only ever yields 0 or 1, so count the collected file list instead.)
print(len(keypoints_files))

# Create a list to store keypoints and subtitles
keypoints = []
subtitles = []

# Iterate through the keypoints files and load them
for k in keypoints_files:
    print(k)
    # NOTE(review): torch.load unpickles the file; only load keypoint files from a trusted source.
    temp_keypoints = torch.load(k)
    # Append the (video ID, keypoints) pair to the list
    keypoints.append((extract_video_id(k), temp_keypoints))

counter = 0

# Iterate through the subtitles files and parse them
for s in subtitles_files:
    print(s)
    # The subtitle file name without its extension is used as the video ID
    base_name = os.path.basename(s).replace(".vtt", "")
    parsed_subs = parse_vtt(s, base_name)
    # Add every parsed cue of this file to the flat subtitle list
    subtitles.extend(parsed_subs)

print("finished")

In [None]:

# Load emotion tensor from JSON file
def load_emotion_tensor_from_json(json_path, video_id, num_emotions=7):
    """Load per-frame emotion values from a JSON file into one float32 tensor.

    The JSON document is expected to be a list with one entry per frame, each
    entry being a list of numbers. Frames that are empty or contain only
    zeros are replaced by a zero vector of length ``num_emotions``.

    Args:
        json_path: Path of the JSON file to read.
        video_id: Identifier returned alongside the tensor.
        num_emotions: Length of the zero vector used for empty/all-zero
            frames (and the width of the result for an empty file).

    Returns:
        A ``(video_id, tensor)`` tuple where ``tensor`` has one row per frame.
        An empty file yields a tensor of shape ``(0, num_emotions)`` instead
        of failing inside ``torch.stack``.
    """
    # Read the JSON file
    with open(json_path, 'r') as f:
        data = json.load(f)

    print(len(data))

    # Nothing to stack: return an empty (0, num_emotions) tensor
    if not data:
        return (video_id, torch.zeros((0, num_emotions), dtype=torch.float32))

    # Create a list to store the per-frame tensors
    normalized = []

    for frame in data:
        if not frame or all(v == 0 for v in frame):
            # No emotion data (or an all-zero frame): use a zero vector
            normalized.append(torch.zeros(num_emotions, dtype=torch.float32))
        else:
            # float() gives the same value as a Decimal round-trip; the result
            # is stored as float32 either way.
            normalized.append(torch.tensor([float(x) for x in frame], dtype=torch.float32))

    return (video_id, torch.stack(normalized))

In [None]:
# Set the precision for printing tensors
# (affects only how tensors are displayed: 10 digits, scientific notation)
torch.set_printoptions(precision=10, sci_mode=True)

# Path to the emotion tensors and create a list of files
# NOTE(review): the "*" pattern matches every entry in the directory, not only
# .json files — confirm the folder contains nothing else.
emotions_path = "<emotions_path_here>"
emotions_files = glob(os.path.join(emotions_path, "*"))

print(len(emotions_files))

# Create a list to store (video_id, emotion_tensor) pairs
emotions = []

# NOTE(review): `counter` is not used anywhere in this cell.
counter = 0

# Iterate through the emotion files and load them
for e in emotions_files:
  print(e)
  # The file name without ".json" is used as the video ID
  base_name = os.path.basename(e).replace(".json", "")
  video_id, emotion_tensor = load_emotion_tensor_from_json(e, base_name)
  # Append the video ID and emotion tensor to the list
  emotions.append((video_id, emotion_tensor))

In [None]:
# Paths to the test keypoints and subtitles; collect the matching files
keypoints_path = "<keypoints_path_here>"
keypoints_files = glob(os.path.join(keypoints_path, "*_keypoints.pth"))
subtitles_path = "<subtitles_path_here>"
subtitles_files = glob(os.path.join(subtitles_path, "*.vtt"))

# Report how many keypoint files were found. (Globbing the directory path
# itself only ever yields 0 or 1, so count the collected file list instead.)
print(len(keypoints_files))

# Create a list to store test keypoints and subtitles
keypoints_test = []
subtitles_test = []

# Iterate through the keypoints files and load them
for k in keypoints_files:
    print(k)
    # NOTE(review): torch.load unpickles the file; only load keypoint files from a trusted source.
    temp_keypoints = torch.load(k)
    # Append the (video ID, keypoints) pair to the list
    keypoints_test.append((extract_video_id(k), temp_keypoints))

# Iterate through the subtitles files and parse them
for s in subtitles_files:
    print(s)
    # The subtitle file name without its extension is used as the video ID
    base_name = os.path.basename(s).replace(".vtt", "")
    parsed_subs = parse_vtt(s, base_name)
    # Add every parsed cue of this file to the flat subtitle list
    subtitles_test.extend(parsed_subs)

print("finished")

In [None]:
# Path to the test emotion files and create a list of files
emotions_path = "<emotions_path_here>"
emotions_files = glob(os.path.join(emotions_path, "*"))

print(len(emotions_files))

# Create a list to store test (video_id, emotion_tensor) pairs
emotions_test = []

# Iterate through the emotion files and load them
for e in emotions_files:
    print(e)
    # The file name without ".json" is used as the video ID
    base_name = os.path.basename(e).replace(".json", "")
    # Load each file exactly once (it was previously read and parsed twice per file)
    emotions_test.append(load_emotion_tensor_from_json(e, base_name))

In [None]:
# Function to convert subtitle times to seconds
def time_to_float(time_str):
    """Convert a colon-separated "hours:minutes:seconds" string to seconds.

    Each of the three fields is parsed with ``float``, so the seconds field
    may carry a fractional part. Raises ``ValueError`` when the string does
    not consist of exactly three parseable fields.
    """
    hours, minutes, seconds = map(float, time_str.split(':'))
    return hours * 3600 + minutes * 60 + seconds

In [None]:
# Dataset class for keypoints, emotions and subtitles
class EmotionKeypointsAndSubtitlesDataset(Dataset):
    """Dataset of (keypoints, emotions, token ids) triples, one per subtitle cue.

    All samples are built eagerly in ``__init__``. For every subtitle entry the
    cue's start/end times are converted to frame indices with ``fps`` and the
    matching slices of the video's keypoints and emotions are paired with the
    tokenized cue text.

    Args:
        keypoints_data: Mapping video_id -> per-frame keypoints. Each frame is
            a sequence of persons; only the first person is used.
        subtitle_entries: Iterable of dicts with the keys "video_id", "start",
            "end" (seconds) and "text".
        emotion_data: Mapping video_id -> per-frame emotion vectors.
        fps: Frames per second used to turn cue times into frame indices.
        tokenizer: Callable tokenizer; when ``None`` the "bert-base-uncased"
            tokenizer is loaded.
        max_length: Fixed length the token ids are truncated/padded to.
        num_joints: Number of joints kept per frame; every frame is flattened
            to ``num_joints * 3`` values.
    """

    def __init__(self, keypoints_data, subtitle_entries, emotion_data, fps=25, tokenizer=None, max_length=80, num_joints=25):
        # Set the fps
        self.fps = fps
        # Set the maximum length for tokenization
        self.max_length = max_length
        # Set the number of joints for keypoints
        self.num_joints = num_joints
        # Set the tokenizer for text processing (only load the default when none was given)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.tokenizer = tokenizer
        # Build the list of (keypoints, emotions, token ids) samples
        self.samples = self.build_samples(keypoints_data, subtitle_entries, emotion_data)

    # Flatten one frame of keypoints into a fixed-size vector
    def _frame_to_tensor(self, frame):
        """Return the first person's keypoints as a ``num_joints * 3`` tensor.

        Frames without any person become a zero vector; shorter keypoint lists
        are zero-padded and longer ones are truncated.
        """
        size = self.num_joints * 3
        if len(frame) == 0:
            return torch.zeros(size)
        # Get the first person in the frame
        person = frame[0]
        flat_kps = [coord for part in person for joint in part for coord in joint]
        flat_kps = flat_kps[:size] + [0] * max(0, size - len(flat_kps))
        return torch.tensor(flat_kps, dtype=torch.float32)

    # Build samples from keypoints, subtitles, and emotions
    def build_samples(self, keypoints_data, subtitles, emotion_data):
        """Return a list of ``(keypoints [T, num_joints*3], emotions [T, E], token_ids [max_length])``.

        Cues are skipped when their video is missing from either mapping, when
        the cue ends past the last available frame, or when the frame range is
        empty.
        """
        samples = []

        for sub in subtitles:
            # Extract video ID and convert the cue times to frame indices
            video_id = sub["video_id"]
            start_frame = int(sub["start"] * self.fps)
            end_frame = int(sub["end"] * self.fps)

            # Check if the video ID exists in keypoints and emotion data
            if video_id not in keypoints_data or video_id not in emotion_data:
                continue

            video_kps = keypoints_data[video_id]
            video_emotions = emotion_data[video_id]

            # Skip cues that end past the last available frame
            if end_frame > len(video_kps) or end_frame > len(video_emotions):
                continue

            # Extract the keypoints and emotions for the cue's frame range
            keypoints_seq = video_kps[start_frame:end_frame]
            emotion_seq = video_emotions[start_frame:end_frame]

            # Check if the sequences are empty
            if len(keypoints_seq) == 0 or len(emotion_seq) == 0:
                continue

            processed_kps = [self._frame_to_tensor(frame) for frame in keypoints_seq]

            # as_tensor accepts both plain lists and rows that already are
            # tensors; unlike torch.tensor it does not emit the copy-construct
            # warning for the latter. torch.stack below creates the copy.
            processed_emo = [torch.as_tensor(emo, dtype=torch.float32) for emo in emotion_seq]

            # Check if the lengths of keypoints and emotions match
            if len(processed_emo) != len(processed_kps):
                continue

            # Tokenize the cue text to a fixed-length id vector
            text = sub["text"]
            tokenized = self.tokenizer(text, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")["input_ids"].squeeze(0)

            samples.append((torch.stack(processed_kps), torch.stack(processed_emo), tokenized))

        return samples

    # Get the length of the dataset
    def __len__(self):
        return len(self.samples)

    # Get a sample by index
    def __getitem__(self, idx):
        return self.samples[idx]

In [None]:
# Tokenizer shared by the training and test datasets
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Turn the (video_id, data) pair lists into lookup dictionaries
keypoints_dict, emotions_dict = dict(keypoints), dict(emotions)

# Build the training dataset
emotion_train_dataset = EmotionKeypointsAndSubtitlesDataset(
    keypoints_data=keypoints_dict,
    subtitle_entries=subtitles,
    emotion_data=emotions_dict,
    fps=25,
    tokenizer=tokenizer,
)

# Number of samples that were built
print("Dataset size:", len(emotion_train_dataset))

# Shapes of the first sample; samples are (keypoints, emotions, token ids)
k, e, s = emotion_train_dataset[0]
print("Keypoints Shape:", k.shape)
print("Subtitles Shape:", s.shape)
print("Emotions Shape: ", e.shape)

In [None]:
# Create the dictionary for keypoints and emotions for testing
keypoints_test_dict = dict(keypoints_test)
emotions_test_dict = dict(emotions_test)

# Create the dataset for testing
emotion_test_dataset = EmotionKeypointsAndSubtitlesDataset(keypoints_test_dict, subtitles_test, emotions_test_dict, fps=25, tokenizer=tokenizer)

# Print the number of samples in the test dataset
print("Dataset size:", len(emotion_test_dataset))

# Print the shape of the first sample in the test dataset.
# Samples are (keypoints, emotions, token ids); unpack in that order so the
# "Subtitles" and "Emotions" labels below match the tensors they print.
k, e, s = emotion_test_dataset[0]
print("Keypoints Shape:", k.shape)
print("Subtitles Shape:", s.shape)
print("Emotions Shape: ", e.shape)

In [None]:
# Function to collate and pad the samples into batches
def emotion_collate_fn(batch):
    """Collate (keypoints, emotions, token ids) samples into padded batch tensors.

    Each of the three fields is padded along its first dimension to the
    longest sequence in the batch (0.0 for keypoints and emotions, 0 for the
    token ids) and stacked batch-first.

    Returns:
        A ``(keypoints, emotions, subtitles)`` tuple of batch-first tensors.
    """
    keypoint_seqs, emotion_seqs, token_seqs = (list(column) for column in zip(*batch))

    return (
        pad_sequence(keypoint_seqs, batch_first=True, padding_value=0.0),
        pad_sequence(emotion_seqs, batch_first=True, padding_value=0.0),
        pad_sequence(token_seqs, batch_first=True, padding_value=0),
    )

In [None]:
# Training DataLoader: shuffled batches of 128 samples, padded per batch by emotion_collate_fn
emotion_train_loader = DataLoader(
    dataset=emotion_train_dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=emotion_collate_fn,
    pin_memory=True,
)

In [None]:
# Create the DataLoader for testing.
# Evaluation data is not shuffled: the metrics do not depend on the batch
# order, and a fixed order makes the first batch (used for the per-epoch
# "Sample decoded" print) the same clip every epoch, so outputs are comparable.
emotions_test_loader = DataLoader(emotion_test_dataset, batch_size=128, shuffle=False, collate_fn=emotion_collate_fn, pin_memory=True)

In [None]:
# Define the EmotionEncoderDecoderTransformer model
class EmotionEncoderDecoderTransformer(nn.Module):
    """Transformer encoder-decoder mapping keypoint + emotion sequences to token logits.

    Per-frame keypoints and emotions are each projected to ``hidden_size``,
    summed, given sinusoidal positional encodings and encoded. The decoder
    consumes embedded target tokens (plus positional encodings and a bias
    computed from the time-averaged emotions) and attends to the encoder
    output; a final linear layer produces vocabulary logits.

    Note: ``forward`` passes no padding masks to the encoder or decoder.
    """

    def __init__(self,
                 num_joints=25,
                 kp_input_dim=3,
                 emo_input_dim=7,
                 hidden_size=256,
                 num_layers=4,
                 nhead=8,
                 ff_dim=512,
                 dropout=0.1,
                 max_len=1024,
                 vocab_size=30522):
        super().__init__()

        # Settings shared by the encoder and decoder layers
        layer_kwargs = dict(d_model=hidden_size, nhead=nhead, dim_feedforward=ff_dim,
                            dropout=dropout, batch_first=True)

        # Input projections: flattened keypoints and emotions -> hidden size
        self.kp_fc = nn.Linear(num_joints * kp_input_dim, hidden_size)
        self.emotion_fc = nn.Linear(emo_input_dim, hidden_size)

        # Projects the time-averaged emotions to a bias added to the decoder input
        self.emo_to_bias = nn.Linear(emo_input_dim, hidden_size)

        # Sinusoidal positional encodings, stored as a buffer of shape [max_len, hidden_size]
        self.register_buffer("pos_encoder", self._build_positional_encoding(max_len, hidden_size))

        # Encoder stack
        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(**layer_kwargs), num_layers=num_layers)

        # Target token embedding
        self.token_embedding = nn.Embedding(vocab_size, hidden_size)

        # Decoder stack
        self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(**layer_kwargs), num_layers=num_layers)

        # Projection from hidden states to vocabulary logits
        self.output_fc = nn.Linear(hidden_size, vocab_size)

    # Build the positional encoding
    def _build_positional_encoding(self, max_len, hidden_size):
        """Return a ``[max_len, hidden_size]`` table with sin in even and cos in odd columns."""
        log_base = torch.log(torch.tensor(10000.0))
        frequencies = torch.exp(torch.arange(0, hidden_size, 2) * -(log_base / hidden_size))
        # Outer product position x frequency: [max_len, hidden_size / 2]
        angles = torch.arange(0, max_len).unsqueeze(1) * frequencies

        table = torch.zeros(max_len, hidden_size)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        return table

    def forward(self, keypoints, emotions, tgt, tgt_mask=None):
        """Compute vocabulary logits for every target position.

        Args:
            keypoints: Keypoint features, (B, T, num_joints * kp_input_dim).
            emotions: Emotion features, (B, T, emo_input_dim).
            tgt: Target token ids, (B, T').
            tgt_mask: Optional attention mask forwarded to the decoder.

        Returns:
            Logits of shape (B, T', vocab_size).
        """
        device = keypoints.device

        # Encoder input: projected keypoints + projected emotions + positions
        src = self.kp_fc(keypoints) + self.emotion_fc(emotions)          # (B, T, H)
        src = src + self.pos_encoder[:src.size(1)].to(device).unsqueeze(0)
        memory = self.encoder(src)

        # Decoder input: token embeddings + positions
        tgt_input = self.token_embedding(tgt).to(device)
        tgt_input = tgt_input + self.pos_encoder[:tgt.size(1)].to(device).unsqueeze(0)

        # One emotion bias per sequence (mean over T), broadcast over all target positions
        emotion_bias = self.emo_to_bias(emotions.mean(dim=1)).unsqueeze(1)  # (B, 1, H)
        tgt_input = tgt_input + emotion_bias

        # Decode against the encoder memory and project to the vocabulary
        hidden = self.decoder(tgt_input, memory, tgt_mask=tgt_mask)
        return self.output_fc(hidden)


In [None]:
# Initialize the model and move it to the device
# Only run this when this is the first run
# (builds the model with the constructor's default hyperparameters and fresh weights)
emotion_model = EmotionEncoderDecoderTransformer().to(device) 

In [None]:
# Only run this when loading an existing model
# Recreate the model and load the weights
# (the model is built with the constructor's default hyperparameters)
emotion_model = EmotionEncoderDecoderTransformer().to(device)

# Load the model weights, mapping the stored tensors onto the current device
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
emotion_model.load_state_dict(torch.load("<path_to_model>", map_location=device))

In [None]:
# Beam search decoding function
def sample_decode_beam(model, keypoints, emotions, tokenizer, beam_width=3, max_len=300, eos_token_id=102):
    """Decode one sequence with beam search and return the best hypothesis as text.

    Every hypothesis starts with ``tokenizer.cls_token_id`` and is scored by
    its accumulated negative log-probability (lower is better). Hypotheses
    ending in ``eos_token_id`` are kept as they are; the search stops after
    ``max_len`` steps or as soon as every kept hypothesis has ended.

    Args:
        model: Callable as ``model(keypoints, emotions, input_ids, tgt_mask=...)``
            returning logits of shape (1, L, vocab). It is switched to eval
            mode and left in eval mode.
        keypoints: Keypoint features of a single sample (no batch dimension).
        emotions: Emotion features of a single sample (no batch dimension).
        tokenizer: Provides ``cls_token_id`` and ``decode``.
        beam_width: Number of hypotheses kept per step.
        max_len: Maximum number of decoding steps.
        eos_token_id: Token id that ends a hypothesis.

    Returns:
        The decoded text of the best-scoring hypothesis, special tokens removed.
    """
    # Set the model to evaluation mode
    model.eval()
    device = keypoints.device

    # The source sequences are the same for every hypothesis; add the batch dimension once
    src_keypoints = keypoints.unsqueeze(0)
    src_emotions = emotions.unsqueeze(0)

    # Each hypothesis is a (token ids, accumulated negative log-probability) pair
    generated = [(torch.tensor([tokenizer.cls_token_id], device=device), 0.0)]

    with torch.no_grad():
        for _ in range(max_len):
            # Once every hypothesis has ended, further steps cannot change the beam
            if all(seq[-1].item() == eos_token_id for seq, _ in generated):
                break

            all_candidates = []
            for seq, score in generated:
                # Finished hypotheses are carried over unchanged
                if seq[-1].item() == eos_token_id:
                    all_candidates.append((seq, score))
                    continue

                input_ids = seq.unsqueeze(0)
                tgt_mask = nn.Transformer.generate_square_subsequent_mask(input_ids.size(1)).to(device)
                logits = model(src_keypoints, src_emotions, input_ids, tgt_mask=tgt_mask)

                # Probabilities of the next token given the hypothesis so far
                probs = torch.softmax(logits[0, -1, :], dim=-1)
                # Never request more candidates than there are vocabulary entries
                top_probs, top_indices = probs.topk(min(beam_width, probs.size(-1)))

                for prob, token in zip(top_probs, top_indices):
                    candidate = torch.cat([seq, token.unsqueeze(0)])
                    all_candidates.append((candidate, score - torch.log(prob + 1e-12)))

            # Keep the best-scoring hypotheses; float() lets the initial Python
            # float score and the later tensor scores be compared uniformly.
            ordered = sorted(all_candidates, key=lambda cand: float(cand[1]))
            generated = ordered[:beam_width]

    # Get the best candidate
    return tokenizer.decode(generated[0][0], skip_special_tokens=True)

In [None]:
# Helpers used by train_model_emotion
def _count_correct_tokens(logits, targets, pad_token_id):
    """Return ``(correct, total)`` counted over the non-padding target positions."""
    valid = targets != pad_token_id
    correct = ((logits.argmax(dim=-1) == targets) & valid).sum().item()
    return correct, valid.sum().item()


def _plot_training_curves(train_losses, val_losses, train_accuracies, val_accuracies):
    """Plot accuracy (left) and loss (right) per epoch for training and validation."""
    epochs_range = range(1, len(train_losses) + 1)
    plt.figure(figsize=(12, 5))

    # Accuracy Plot
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, train_accuracies, label='Train Accuracy')
    plt.plot(epochs_range, val_accuracies, label='Validation Accuracy')
    plt.title('Accuracy Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.grid(True)

    # Loss Plot
    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, train_losses, label='Train Loss')
    plt.plot(epochs_range, val_losses, label='Validation Loss')
    plt.title('Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()


# Function to train the model
def train_model_emotion(model, train_loader, val_loader, optimizer, criterion, scheduler, device, tokenizer, num_epochs=500, eos_token_id=102, pad_token_id=0, patience=15, checkpoint_path="best_emotion_model_new.pt"):
    """Train ``model`` with teacher forcing, validate every epoch and plot the curves.

    Each batch is ``(keypoints, emotions, targets)``. The decoder is fed
    ``targets[:, :-1]`` and trained to predict ``targets[:, 1:]`` under a
    causal mask, using mixed precision (``autocast`` + ``GradScaler``). The
    scheduler is stepped once per batch. After every epoch the model is
    evaluated on ``val_loader``, one validation sample is decoded with
    ``sample_decode_beam`` and printed, and the weights are saved to
    ``checkpoint_path`` whenever the validation loss improves. Training stops
    early after ``patience`` epochs without improvement.

    Args:
        model: Model called as ``model(keypoints, emotions, decoder_input, tgt_mask=...)``.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        scheduler: Learning-rate scheduler, stepped after every batch.
        device: Device the batches are moved to.
        tokenizer: Tokenizer passed to ``sample_decode_beam``.
        num_epochs: Maximum number of epochs.
        eos_token_id: End-of-sequence token id passed to ``sample_decode_beam``.
        pad_token_id: Padding token id; padded positions are excluded from accuracy.
        patience: Epochs without validation-loss improvement before stopping.
        checkpoint_path: File the best model's ``state_dict`` is written to.

    Returns:
        None.
    """
    best_val_loss = float('inf')
    train_losses, test_losses = [], []
    train_accuracies, test_accuracies = [], []
    patience_counter = 0
    # One scaler for the whole run: re-creating it every epoch would discard
    # the loss scale it has adapted so far.
    scaler = GradScaler()

    for epoch in range(num_epochs):
        # ---- Training ----
        model.train()
        train_loss, train_correct, train_total = 0, 0, 0

        for keypoints, emotions, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
            keypoints, emotions, targets = keypoints.to(device), emotions.to(device), targets.to(device)
            # Teacher forcing: predict token t+1 from the tokens up to t
            decoder_input = targets[:, :-1]
            decoder_target = targets[:, 1:]
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(decoder_input.size(1)).to(device)

            optimizer.zero_grad()
            with autocast():
                logits = model(keypoints, emotions, decoder_input, tgt_mask=tgt_mask)
                loss = criterion(logits.view(-1, logits.size(-1)), decoder_target.reshape(-1))

            # Backward pass and optimization
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            # Accuracy uses the same mask as validation (all non-padding
            # tokens) so the two numbers are directly comparable.
            correct, total = _count_correct_tokens(logits, decoder_target, pad_token_id)
            train_correct += correct
            train_total += total
            train_loss += loss.item()

        # ---- Validation ----
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for keypoints, emotions, targets in tqdm(val_loader, desc="Validating"):
                keypoints, emotions, targets = keypoints.to(device), emotions.to(device), targets.to(device)
                decoder_input = targets[:, :-1]
                decoder_target = targets[:, 1:]
                tgt_mask = nn.Transformer.generate_square_subsequent_mask(decoder_input.size(1)).to(device)

                with autocast():
                    logits = model(keypoints, emotions, decoder_input, tgt_mask=tgt_mask)
                    loss = criterion(logits.view(-1, logits.size(-1)), decoder_target.reshape(-1))

                correct, total = _count_correct_tokens(logits, decoder_target, pad_token_id)
                val_correct += correct
                val_total += total
                val_loss += loss.item()

        # Average loss per batch and accuracy in percent; the guards keep an
        # empty loader from raising ZeroDivisionError.
        train_acc = train_correct / train_total * 100 if train_total else 0.0
        val_acc = val_correct / val_total * 100 if val_total else 0.0
        train_loss /= max(len(train_loader), 1)
        val_loss /= max(len(val_loader), 1)

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)
        test_losses.append(val_loss)
        test_accuracies.append(val_acc)

        # Print the summary for the epoch
        print(f"Epoch {epoch+1} Summary:\nTrain Loss: {train_loss:.4f}, Acc: {train_acc:.2f}% | Val Loss: {val_loss:.4f}, Acc: {val_acc:.2f}%")

        # Decode the first sample of the first validation batch as a qualitative check
        sample_batch = next(iter(val_loader), None)
        if sample_batch is not None:
            sample_kps, sample_emo, _ = sample_batch
            print("Sample decoded:", sample_decode_beam(model, sample_kps[0].to(device), sample_emo[0].to(device), tokenizer, eos_token_id=eos_token_id))

        # Checkpointing and early stopping on the validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            print(f"No improvement in validation loss. {patience_counter} / {patience}")
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    # Graph the training and validation losses and accuracies
    _plot_training_curves(train_losses, test_losses, train_accuracies, test_accuracies)

In [None]:
# Loss: token-level cross entropy; targets equal to 0 are ignored, no label smoothing
criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0)

# Optimizer: AdamW over all model parameters
optimizer = optim.AdamW(emotion_model.parameters(), lr=5e-5, weight_decay=0.01)

# Scheduler: cosine decay with warmup over the first 5% of the steps.
# The step budget assumes 500 epochs with one scheduler step per training batch.
total_steps = 500 * len(emotion_train_loader)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.05 * total_steps),
    num_training_steps=total_steps,
)


In [None]:
# Clear the cache and collect garbage
# NOTE(review): this import sits outside the notebook's top import cell.
import gc

# Run a full garbage collection pass
gc.collect()
# Ask PyTorch to release the cached GPU memory it is holding but not using
torch.cuda.empty_cache()

In [None]:
# Run training: the test loader doubles as the validation set
train_model_emotion(
    model=emotion_model,
    train_loader=emotion_train_loader,
    val_loader=emotions_test_loader,
    optimizer=optimizer,
    criterion=criterion,
    scheduler=scheduler,
    device=device,
    tokenizer=tokenizer,
)

Example output:
Epoch 24/500 [Train]: 100%|██████████| 36/36 [00:02<00:00, 15.41it/s]
Validating: 100%|██████████| 8/8 [00:00<00:00, 37.90it/s]
Epoch 24 Summary:
Train Loss: 5.1715, Acc: 14.17% | Val Loss: 5.7470, Acc: 20.38%
Sample decoded: it ' s.
Epoch 25/500 [Train]: 100%|██████████| 36/36 [00:02<00:00, 15.26it/s]
Validating: 100%|██████████| 8/8 [00:00<00:00, 37.88it/s]
Epoch 25 Summary:
Train Loss: 5.1139, Acc: 14.86% | Val Loss: 5.7463, Acc: 21.83%
Sample decoded: the ' s the ' s the ' s the ' s the ' s, i ' s, i ' s the ' s the ' s, i ' s the ' s the ' s the ' s, i ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s ' s ' s the ' s ' s ' s ' s ' s the ' s the ' s the ' s the ' s the ' s the ' s ' s ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the, the, i ' s the ' s the ' s the ' s the ' s the ' s the, i ' s the ' s the ' s i ' s the ' s the, i ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the ' s the,
Epoch 26/500 [Train]: 100%|██████████| 36/36 [00:02<00:00, 15.38it/s]
Validating: 100%|██████████| 8/8 [00:00<00:00, 39.33it/s]
Epoch 26 Summary:
Train Loss: 5.0595, Acc: 15.62% | Val Loss: 5.7328, Acc: 22.00%
Sample decoded: it ' s the ' s the ' s the ' s the ' s the.
Epoch 27/500 [Train]: 100%|██████████| 36/36 [00:02<00:00, 15.70it/s]
Validating: 100%|██████████| 8/8 [00:00<00:00, 37.97it/s]
Epoch 27 Summary:
Train Loss: 5.0064, Acc: 16.05% | Val Loss: 5.6538, Acc: 21.63%
Sample decoded: it ' s.