In [1]:
import math
import os
import time
import random
import copy

from torchvision.datasets import CocoCaptions
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR, CosineAnnealingWarmRestarts
import torch.nn.init as init
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [2]:
image_transform = Compose([
    Resize((224, 224)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406],
              std=[0.229, 0.224, 0.225])
])

In [3]:
class CaptionPreprocessor:
    def __init__(self, captions, tokenizer, max_caption_length=20):
        self.tokenizer = tokenizer
        self.max_caption_length = max_caption_length
        self.captions_tokenized = self.tokenize_captions(captions)

    def preprocess(self, caption):
        tokens = self.tokenizer.tokenize(caption)
        caption_indices = self.tokenizer.convert_tokens_to_ids(tokens)

        if len(caption_indices) < self.max_caption_length:
            caption_indices += [self.tokenizer.pad_token_id] * (self.max_caption_length - len(caption_indices))

        return caption_indices[:self.max_caption_length]

    def tokenize_captions(self, captions):
        return [self.preprocess(caption) for caption in captions]

In [4]:
class CustomCocoDataset(Dataset):
    def __init__(self, coco_dataset, caption_preprocessor, num_captions=5):
        self.coco_dataset = coco_dataset
        self.caption_preprocessor = caption_preprocessor
        self.num_captions = num_captions

    def __len__(self):
        return len(self.coco_dataset)

    def __getitem__(self, idx):
        img, caption_list = self.coco_dataset[idx]
        selected_caption = random.choice(caption_list[:self.num_captions])
        preprocessed_caption = torch.tensor(self.caption_preprocessor.preprocess(selected_caption))
        return img, preprocessed_caption

In [5]:
class PatchEmbedding(nn.Module):
    def __init__(self, patch_size, in_channels, embed_dim):
        super().__init__()
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
        init.xavier_uniform_(self.proj.weight)

    def forward(self, x):
        x = self.proj(x)
        x = x.flatten(2).transpose(1, 2)
        return x


class VisionTransformer(nn.Module):
    def __init__(self, in_channels, patch_size, embed_dim, num_layers, num_heads, mlp_dim, num_classes):
        super().__init__()
        self.patch_embed = PatchEmbedding(patch_size, in_channels, embed_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, (224 // patch_size) * (224 // patch_size) + 1, embed_dim))

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embed_dim, num_heads, mlp_dim)
            for _ in range(num_layers)
        ])

        self.classification_head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.patch_embed(x)
        x = x + self.positional_encoding[:, :-1]
        for layer in self.transformer_layers:
            x = layer(x)

        return x

In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.encoding = nn.Parameter(torch.zeros(1, max_len, d_model), requires_grad=False)

        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        self.encoding[:, :, 0::2] = torch.sin(pos * div_term)
        self.encoding[:, :, 1::2] = torch.cos(pos * div_term)

    def forward(self, x):
        x = x + self.encoding[:, :x.size(1), :]
        return x


class TransformerCaptionDecoder(nn.Module):
    def __init__(self, auto_model, d_model, num_layers, num_heads, mlp_dim, max_len=128):
        super().__init__()

        self.auto_model = auto_model
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.transformer_layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model, num_heads, mlp_dim)
            for _ in range(num_layers)
        ])
        self.output_layer = nn.Linear(d_model, self.auto_model.config.vocab_size)
        init.xavier_uniform_(self.output_layer.weight)

    def forward(self, captions, memory):
        captions = self.auto_model.embeddings(captions)
        captions = self.positional_encoding(captions)

        for layer in self.transformer_layers:
            captions = layer(captions, memory)

        logits = self.output_layer(captions)
        return logits

In [7]:
class ImageCaptioningModel(nn.Module):
    def __init__(self, image_encoder, caption_decoder):
        super(ImageCaptioningModel, self).__init__()
        self.image_encoder = image_encoder
        self.caption_decoder = caption_decoder
        self.start_token_index = caption_decoder.auto_model.config.bos_token_id or 0
        self.embedding_size = caption_decoder.auto_model.config.hidden_size
        self.image_feature_linear = nn.Linear(768, self.embedding_size)

    def forward(self, images, captions):
        image_features = self.image_encoder(images)
        num_patches = (224 // 16) * (224 // 16)
        # image_features_flattened = image_features.permute(1, 0, 2).reshape(-1, num_patches, self.embedding_size)

        start_token_tensor = torch.tensor([self.start_token_index], dtype=torch.long, device=images.device)
        start_token_embeddings = self.caption_decoder.auto_model.embeddings(start_token_tensor).repeat(image_features.shape[0], 1, 1) # getting start token embedding and repeating it for batch size
        image_features_summed = image_features.sum(dim=1).unsqueeze(1)
        image_features_summed = self.image_feature_linear(image_features_summed)
        memory = torch.cat([start_token_embeddings, image_features_summed], dim=1) # Concatenate the start token embeddings with the flattened image features

        memory = memory.transpose(0, 1)
        captions = captions.transpose(0, 1)

        output = self.caption_decoder(captions, memory)
        return output

In [8]:
class NoamScheduler:
    def __init__(self, optimizer, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.current_step = 0

    def step(self):
        self.current_step += 1
        lr = self.learning_rate()
        for param_group in self.optimizer.param_groups:
            if param_group['lr'] != lr:
                print(f"Learning rate changed: {param_group['lr']} -> {lr}")
            param_group['lr'] = lr

    def learning_rate(self):
        arg1 = self.current_step ** -0.5
        arg2 = min(self.current_step * self.warmup_steps ** -1.5, 1)
        return (self.d_model ** -0.5) * min(arg1, arg2)

In [9]:
def plot_and_save(train_losses, val_losses, learning_rates, max_min_loss_diffs):
    plt.style.use('classic')

    fig, ax = plt.subplots(figsize=(15, 6))
    ax.plot(train_losses, label='Train Loss')
    ax.plot(val_losses, label='Validation Loss')
    ax.set_xlabel('Epochs', fontsize=14)
    ax.set_ylabel('Loss', fontsize=14)
    ax.set_title('Training and Validation Losses', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.grid()
    ax.legend(fontsize=12)
    fig.savefig('losses.png')

    fig, ax = plt.subplots(figsize=(15, 6))
    ax.plot(learning_rates, label='Learning Rate')
    ax.set_xlabel('Epochs', fontsize=14)
    ax.set_ylabel('Learning Rate', fontsize=14)
    ax.set_title('Learning Rate Schedule', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.grid()
    ax.legend(fontsize=12)
    fig.savefig('learning_rates.png')

    fig_ax = plt.subplots(figsize=(15, 6))
    ax.plot(max_min_loss_diffs, label='Loss Difference')
    ax.set_xlabel('Epochs', fontsize=14)
    ax.set_ylabel('Loss Difference', fontsize=14)
    ax.set_title('Difference Between Max and Min Loss per Epoch', fontsize=16)
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.grid()
    ax.legend(fontsize=12)
    fig.savefig('loss_differences.png')


In [10]:
tokenizer_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

train_dataset = CocoCaptions(root='./coco/images',
                       annFile='./coco/annotations/captions_train2014.json',
                       transform=image_transform)
val_dataset = CocoCaptions(root='./coco/images',
                           annFile='./coco/annotations/captions_val2014.json',
                           transform=image_transform)
train_captions = [entry['caption'] for entry in train_dataset.coco.anns.values()]
val_captions = [entry['caption'] for entry in val_dataset.coco.anns.values()]

caption_preprocessor = CaptionPreprocessor(train_captions + val_captions, tokenizer)

max_caption_length_train = max([len(tokenized_caption) for tokenized_caption in caption_preprocessor.tokenize_captions(train_captions)])
max_caption_length_val = max([len(tokenized_caption) for tokenized_caption in caption_preprocessor.tokenize_captions(val_captions)])
max_caption_length = max(max_caption_length_train, max_caption_length_val)
print('Maximum caption length (without <start>, <end>, and <pad> tokens):', max_caption_length)

custom_train_dataset = CustomCocoDataset(train_dataset, caption_preprocessor, num_captions=5)
custom_val_dataset = CustomCocoDataset(val_dataset, caption_preprocessor, num_captions=5)

batch_size = 64
train_data_loader = DataLoader(custom_train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
val_data_loader = DataLoader(custom_val_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)

loading annotations into memory...
Done (t=1.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.44s)
creating index...
index created!
Maximum caption length (without <start>, <end>, and <pad> tokens): 20


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

image_encoder = VisionTransformer(in_channels=3,
                                  patch_size=16,
                                  embed_dim=768,
                                  num_layers=16,
                                  num_heads=16,
                                  mlp_dim=1024,
                                  num_classes=768).to(device)

auto_model = AutoModel.from_pretrained(tokenizer_name).to(device)
caption_decoder = TransformerCaptionDecoder(auto_model=auto_model,
                                            d_model=768,
                                            num_layers=16,
                                            num_heads=16,
                                            mlp_dim=1024).to(device)

model = ImageCaptioningModel(image_encoder, caption_decoder).to(device)

useTwoGPUs = True
if torch.cuda.device_count() > 1 and useTwoGPUs:
    print(f'Using {torch.cuda.device_count()} GPUs')
    model = nn.DataParallel(model)

num_epochs = 300

total_samples = len(train_data_loader.dataset)
batch_size = train_data_loader.batch_size
max_iterations = math.ceil(total_samples / batch_size)

criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=2, verbose=True)
# scheduler = NoamScheduler(optimizer, d_model=1600, warmup_steps=4000)
# scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
# scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=int(num_epochs / 5), eta_min=1e-6)

best_val_loss = float('inf')

train_losses = []
val_losses = []
learning_rates = []
max_min_loss_diffs = []
save_name = ''

def train_one_epoch(model, dataloader, criterion, optimizer, device, epoch, avg_every):
    model.train()
    train_loss = 0
    last_x_losses = []
    for i, (images, captions) in enumerate(tqdm(dataloader, desc='Training')):
    # for i, (images, captions) in enumerate(dataloader):
        images = images.to(device)
        captions_input = captions[:, :-1].to(device)
        captions_target = captions[:, 1:].to(device)

        optimizer.zero_grad()
        output = model(images, captions_input)

        loss = criterion(output.reshape(-1, 30522), captions_target.view(-1))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        last_x_losses.append(loss.item())

        if i % avg_every == 0:
            avg_loss = sum(last_x_losses) / len(last_x_losses)
            print(f'Epoch: {epoch+1}, Iteration: {i}, Loss (last {avg_every} iterations: {avg_loss}')
    return train_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for images, captions in tqdm(dataloader, desc='Validating'):
        # for images, captions in dataloader:
            image = images.to(device)
            captions_input = captions[:, :-1].to(device)
            captions_target = captions[:, 1:].to(device)

            output = model(images, captions_input)
            loss = criterion(output.reshape(-1, 30522), captions_target.view(-1))

            val_loss += loss.item()
    return val_loss / len(dataloader)


print('**********STARTING TRAINING**********')
training_start = time.time()
for epoch in range(num_epochs):
    epoch_start = time.time()

    epoch_max_loss = float('-inf')
    epoch_min_loss = float('inf')

    print(f'Total samples: {total_samples}, Batch size: {batch_size}, Maximum iterations: {max_iterations}')

    avg_every = 25
    train_loss = train_one_epoch(model, train_data_loader, criterion, optimizer, device, epoch, avg_every)
    val_loss = evaluate(model, val_data_loader, criterion, device)

    epoch_end = time.time()
    print(f'Epoch {epoch+1} total time: {epoch_end - epoch_start}')

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    if val_loss < best_val_loss:
        best_val_loss = val_loss

        save_name = f'best_loss_model_{epoch}.pt'
        torch.save(model.state_dict(), save_name)

    scheduler.step(val_loss)

training_end = time.time()
print(f'Total training time: {training_end - training_start}')

plot_and_save(train_losses, val_losses, learning_rates, max_min_loss_diffs)


cuda


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable

Using 2 GPUs
**********STARTING TRAINING**********
Total samples: 82783, Batch size: 64, Maximum iterations: 1294


Training:   0%|          | 0/1293 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training:   0%|          | 1/1293 [00:08<3:06:05,  8.64s/it]

Epoch: 1, Iteration: 0, Loss (last 25 iterations: 10.325166702270508


Training:   2%|▏         | 26/1293 [00:29<16:31,  1.28it/s] 

Epoch: 1, Iteration: 25, Loss (last 25 iterations: 6.803201216917771


Training:   4%|▍         | 51/1293 [00:48<16:30,  1.25it/s]

Epoch: 1, Iteration: 50, Loss (last 25 iterations: 6.416073976778517


Training:   6%|▌         | 76/1293 [01:08<16:30,  1.23it/s]

Epoch: 1, Iteration: 75, Loss (last 25 iterations: 6.273479599701731


Training:   8%|▊         | 101/1293 [01:28<15:47,  1.26it/s]

Epoch: 1, Iteration: 100, Loss (last 25 iterations: 6.190937552121606


Training:  10%|▉         | 126/1293 [01:48<15:28,  1.26it/s]

Epoch: 1, Iteration: 125, Loss (last 25 iterations: 6.145129888776749


Training:  12%|█▏        | 151/1293 [02:08<15:09,  1.26it/s]

Epoch: 1, Iteration: 150, Loss (last 25 iterations: 6.111671078284055


Training:  14%|█▎        | 176/1293 [02:28<14:48,  1.26it/s]

Epoch: 1, Iteration: 175, Loss (last 25 iterations: 6.079453842206434


Training:  16%|█▌        | 201/1293 [02:48<14:55,  1.22it/s]

Epoch: 1, Iteration: 200, Loss (last 25 iterations: 6.0606396233857565


Training:  17%|█▋        | 226/1293 [03:08<14:10,  1.25it/s]

Epoch: 1, Iteration: 225, Loss (last 25 iterations: 6.046436328803543


Training:  19%|█▉        | 251/1293 [03:28<13:50,  1.25it/s]

Epoch: 1, Iteration: 250, Loss (last 25 iterations: 6.029408445396271


Training:  21%|██▏       | 276/1293 [03:49<13:31,  1.25it/s]

Epoch: 1, Iteration: 275, Loss (last 25 iterations: 6.015607823496279


Training:  23%|██▎       | 301/1293 [04:09<13:12,  1.25it/s]

Epoch: 1, Iteration: 300, Loss (last 25 iterations: 6.006954096480461


Training:  25%|██▌       | 326/1293 [04:29<13:04,  1.23it/s]

Epoch: 1, Iteration: 325, Loss (last 25 iterations: 5.996802621092533


Training:  27%|██▋       | 351/1293 [04:49<12:33,  1.25it/s]

Epoch: 1, Iteration: 350, Loss (last 25 iterations: 5.989074253288769


Training:  29%|██▉       | 376/1293 [05:09<12:11,  1.25it/s]

Epoch: 1, Iteration: 375, Loss (last 25 iterations: 5.983686061615639


Training:  31%|███       | 401/1293 [05:29<11:58,  1.24it/s]

Epoch: 1, Iteration: 400, Loss (last 25 iterations: 5.97716648620263


Training:  33%|███▎      | 426/1293 [05:49<11:32,  1.25it/s]

Epoch: 1, Iteration: 425, Loss (last 25 iterations: 5.97098171990802


Training:  35%|███▍      | 451/1293 [06:10<11:23,  1.23it/s]

Epoch: 1, Iteration: 450, Loss (last 25 iterations: 5.964980954341508


Training:  37%|███▋      | 476/1293 [06:29<10:52,  1.25it/s]

Epoch: 1, Iteration: 475, Loss (last 25 iterations: 5.962292891590535


Training:  39%|███▊      | 501/1293 [06:49<10:31,  1.25it/s]

Epoch: 1, Iteration: 500, Loss (last 25 iterations: 5.956690281926991


Training:  41%|████      | 526/1293 [07:10<10:11,  1.25it/s]

Epoch: 1, Iteration: 525, Loss (last 25 iterations: 5.951172574844651


Training:  43%|████▎     | 551/1293 [07:30<09:54,  1.25it/s]

Epoch: 1, Iteration: 550, Loss (last 25 iterations: 5.946928570361406


Training:  45%|████▍     | 576/1293 [07:50<09:38,  1.24it/s]

Epoch: 1, Iteration: 575, Loss (last 25 iterations: 5.941401449342568


Training:  46%|████▋     | 601/1293 [08:10<09:12,  1.25it/s]

Epoch: 1, Iteration: 600, Loss (last 25 iterations: 5.9383809364179205


Training:  48%|████▊     | 626/1293 [08:30<08:53,  1.25it/s]

Epoch: 1, Iteration: 625, Loss (last 25 iterations: 5.934746167149407


Training:  50%|█████     | 651/1293 [08:50<08:36,  1.24it/s]

Epoch: 1, Iteration: 650, Loss (last 25 iterations: 5.931481723961193


Training:  52%|█████▏    | 676/1293 [09:10<08:12,  1.25it/s]

Epoch: 1, Iteration: 675, Loss (last 25 iterations: 5.9284918117805345


Training:  54%|█████▍    | 701/1293 [09:31<07:57,  1.24it/s]

Epoch: 1, Iteration: 700, Loss (last 25 iterations: 5.9253848507128835


Training:  56%|█████▌    | 726/1293 [09:51<07:33,  1.25it/s]

Epoch: 1, Iteration: 725, Loss (last 25 iterations: 5.923208065926207


Training:  58%|█████▊    | 751/1293 [10:11<07:13,  1.25it/s]

Epoch: 1, Iteration: 750, Loss (last 25 iterations: 5.919688994335271


Training:  60%|██████    | 776/1293 [10:31<06:53,  1.25it/s]

Epoch: 1, Iteration: 775, Loss (last 25 iterations: 5.915811761752846


Training:  62%|██████▏   | 801/1293 [10:51<06:33,  1.25it/s]

Epoch: 1, Iteration: 800, Loss (last 25 iterations: 5.913299864151058


Training:  64%|██████▍   | 826/1293 [11:11<06:17,  1.24it/s]

Epoch: 1, Iteration: 825, Loss (last 25 iterations: 5.910734591126153


Training:  66%|██████▌   | 851/1293 [11:31<05:52,  1.26it/s]

Epoch: 1, Iteration: 850, Loss (last 25 iterations: 5.908861529532947


Training:  68%|██████▊   | 876/1293 [11:51<05:33,  1.25it/s]

Epoch: 1, Iteration: 875, Loss (last 25 iterations: 5.907161130752738


Training:  70%|██████▉   | 901/1293 [12:12<05:14,  1.25it/s]

Epoch: 1, Iteration: 900, Loss (last 25 iterations: 5.905238712535185


Training:  72%|███████▏  | 926/1293 [12:32<04:52,  1.25it/s]

Epoch: 1, Iteration: 925, Loss (last 25 iterations: 5.903741049200104


Training:  74%|███████▎  | 951/1293 [12:52<04:35,  1.24it/s]

Epoch: 1, Iteration: 950, Loss (last 25 iterations: 5.902740184190271


Training:  75%|███████▌  | 976/1293 [13:12<04:12,  1.25it/s]

Epoch: 1, Iteration: 975, Loss (last 25 iterations: 5.901541073302754


Training:  77%|███████▋  | 1001/1293 [13:32<03:52,  1.25it/s]

Epoch: 1, Iteration: 1000, Loss (last 25 iterations: 5.900335427645322


Training:  79%|███████▉  | 1026/1293 [13:52<03:33,  1.25it/s]

Epoch: 1, Iteration: 1025, Loss (last 25 iterations: 5.898841606478477


Training:  81%|████████▏ | 1051/1293 [14:12<03:13,  1.25it/s]

Epoch: 1, Iteration: 1050, Loss (last 25 iterations: 5.897455783938363


Training:  83%|████████▎ | 1076/1293 [14:32<02:54,  1.24it/s]

Epoch: 1, Iteration: 1075, Loss (last 25 iterations: 5.8961613470736935


Training:  85%|████████▌ | 1101/1293 [14:53<02:33,  1.25it/s]

Epoch: 1, Iteration: 1100, Loss (last 25 iterations: 5.895112850577262


Training:  87%|████████▋ | 1126/1293 [15:13<02:13,  1.25it/s]

Epoch: 1, Iteration: 1125, Loss (last 25 iterations: 5.8935130735061945


Training:  89%|████████▉ | 1151/1293 [15:33<01:53,  1.25it/s]

Epoch: 1, Iteration: 1150, Loss (last 25 iterations: 5.892392571546221


Training:  91%|█████████ | 1176/1293 [15:53<01:33,  1.25it/s]

Epoch: 1, Iteration: 1175, Loss (last 25 iterations: 5.892017703478028


Training:  93%|█████████▎| 1201/1293 [16:13<01:13,  1.25it/s]

Epoch: 1, Iteration: 1200, Loss (last 25 iterations: 5.890774253206785


Training:  95%|█████████▍| 1226/1293 [16:33<00:53,  1.25it/s]

Epoch: 1, Iteration: 1225, Loss (last 25 iterations: 5.8901653394808


Training:  97%|█████████▋| 1251/1293 [16:53<00:33,  1.25it/s]

Epoch: 1, Iteration: 1250, Loss (last 25 iterations: 5.889544274309556


Training:  99%|█████████▊| 1276/1293 [17:13<00:13,  1.25it/s]

Epoch: 1, Iteration: 1275, Loss (last 25 iterations: 5.888377959078008


Training: 100%|██████████| 1293/1293 [17:27<00:00,  1.26it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training: 100%|██████████| 1293/1293 [17:27<00:00,  1.23it/s]
Validating:   0%|          | 0/632 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validating: 100%|██████████| 632/632 [03:14<00:00,  3.25it/s]


Epoch 1 total time: 1242.019146680832
Total samples: 82783, Batch size: 64, Maximum iterations: 1294


Training:   0%|          | 0/1293 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training:   0%|          | 1/1293 [00:02<50:19,  2.34s/it]

Epoch: 2, Iteration: 0, Loss (last 25 iterations: 5.904211044311523


Training:   2%|▏         | 26/1293 [00:22<16:52,  1.25it/s]

Epoch: 2, Iteration: 25, Loss (last 25 iterations: 5.846445028598492


Training:   4%|▍         | 51/1293 [00:42<16:30,  1.25it/s]

Epoch: 2, Iteration: 50, Loss (last 25 iterations: 5.841522001752667


Training:   6%|▌         | 76/1293 [01:02<16:10,  1.25it/s]

Epoch: 2, Iteration: 75, Loss (last 25 iterations: 5.843715812030592


Training:   8%|▊         | 101/1293 [01:22<15:52,  1.25it/s]

Epoch: 2, Iteration: 100, Loss (last 25 iterations: 5.834309913144253


Training:  10%|▉         | 126/1293 [01:43<15:43,  1.24it/s]

Epoch: 2, Iteration: 125, Loss (last 25 iterations: 5.834685473215012


Training:  12%|█▏        | 151/1293 [02:03<15:15,  1.25it/s]

Epoch: 2, Iteration: 150, Loss (last 25 iterations: 5.834682133024102


Training:  14%|█▎        | 176/1293 [02:23<14:54,  1.25it/s]

Epoch: 2, Iteration: 175, Loss (last 25 iterations: 5.837395044890317


Training:  16%|█▌        | 201/1293 [02:43<14:39,  1.24it/s]

Epoch: 2, Iteration: 200, Loss (last 25 iterations: 5.839643736976889


Training:  17%|█▋        | 226/1293 [03:03<14:14,  1.25it/s]

Epoch: 2, Iteration: 225, Loss (last 25 iterations: 5.83822950219686


Training:  19%|█▉        | 251/1293 [03:23<14:01,  1.24it/s]

Epoch: 2, Iteration: 250, Loss (last 25 iterations: 5.835040907460855


Training:  21%|██▏       | 276/1293 [03:43<13:33,  1.25it/s]

Epoch: 2, Iteration: 275, Loss (last 25 iterations: 5.833148337792659


Training:  23%|██▎       | 301/1293 [04:03<13:10,  1.25it/s]

Epoch: 2, Iteration: 300, Loss (last 25 iterations: 5.833812957586244


Training:  25%|██▌       | 326/1293 [04:24<12:50,  1.26it/s]

Epoch: 2, Iteration: 325, Loss (last 25 iterations: 5.835760985415406


Training:  27%|██▋       | 351/1293 [04:44<12:34,  1.25it/s]

Epoch: 2, Iteration: 350, Loss (last 25 iterations: 5.838027066994257


Training:  29%|██▉       | 376/1293 [05:04<12:17,  1.24it/s]

Epoch: 2, Iteration: 375, Loss (last 25 iterations: 5.838830443138772


Training:  31%|███       | 401/1293 [05:24<11:53,  1.25it/s]

Epoch: 2, Iteration: 400, Loss (last 25 iterations: 5.83895943230227


Training:  33%|███▎      | 426/1293 [05:44<11:33,  1.25it/s]

Epoch: 2, Iteration: 425, Loss (last 25 iterations: 5.840618992075674


Training:  35%|███▍      | 451/1293 [06:04<11:12,  1.25it/s]

Epoch: 2, Iteration: 450, Loss (last 25 iterations: 5.837665506054186


Training:  37%|███▋      | 476/1293 [06:24<10:52,  1.25it/s]

Epoch: 2, Iteration: 475, Loss (last 25 iterations: 5.836724000818589


Training:  37%|███▋      | 483/1293 [06:31<10:56,  1.23it/s]


KeyboardInterrupt: 

In [None]:

    # epoch_train_start = time.time()
    # for i, (images, captions) in enumerate(train_data_loader):
    #     images = images.to(device)
    #     captions_input = captions[:, :-1].to(device)
    #     captions_target = captions[:, 1:].to(device)
    #
    #     optimizer.zero_grad()
    #     output = model(images, captions_input)
    #
    #     loss = criterion(output.reshape(-1, 28796), captions_target.view(-1))
    #     loss.backward()
    #     optimizer.step()
    #
    #     train_loss += loss.item()
    #
    #     if loss.item() > epoch_max_loss:
    #         epoch_max_loss = loss.item()
    #         print(f'Max loss set to: {epoch_max_loss}')
    #     if loss.item() < epoch_min_loss:
    #         epoch_min_loss = loss.item()
    #         print(f'Min loss set to: {epoch_min_loss}')
    #
    #     if i % 50 == 0:
    #         print(f'Epoch: {epoch+1}/{num_epochs}, Iteration: {i}, Loss: {loss.item()}')
    #
    # epoch_train_end = time.time()
    # epoch_train_time = epoch_train_end - epoch_train_start
    # print(f'Epoch {epoch+1} training time: {epoch_train_time}')
    #
    # epoch_max_min_diff = epoch_max_loss - epoch_min_loss
    # if epoch + 1 != 1:
    #     max_min_loss_diffs.append(epoch_max_min_diff)
    # print(f'Difference between max and min loss in epoch {epoch+1}: {epoch_max_min_diff}')
    #
    # train_loss /= len(train_data_loader)
    #
    # model.eval()
    # val_loss = 0

#     epoch_val_start = time.time()
#     with torch.no_grad():
#         for images, captions in val_data_loader:
#             images = images.to(device)
#             captions_input = captions[:, :-1].to(device)
#             captions_target = captions[:, 1:].to(device)
#
#             output = model(images, captions_input)
#             loss = criterion(output.reshape(-1, 28796), captions_target.view(-1))
#
#             val_loss += loss.item()
#
#     epoch_val_end = time.time()
#     epoch_val_time = epoch_val_end - epoch_val_start
#     print(f'Epoch {epoch+1} validation time: {epoch_val_time}')
#
#     epoch_end = time.time()
#     epoch_time = epoch_end - epoch_start
#     print(f'Epoch {epoch+1} total time: {epoch_time}')
#
#     val_loss /= len(val_data_loader)
#     print(f'Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')
#     train_losses.append(train_loss)
#     val_losses.append(val_loss)
#     learning_rates.append(optimizer.param_groups[0]['lr'])
#
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#
#         if os.path.exists(save_name):
#             os.remove(save_name)
#
#         save_name = f'best_loss_model_{epoch}.pth'
#         torch.save(model.state_dict(), save_name)
#
#     scheduler.step()
#
# training_end = time.time()
# training_time = training_end - training_start
# print(f'Total training time: {training_time}')
#
# plot_and_save(train_losses, val_losses, learning_rates, max_min_loss_diffs)