In [1]:
import os
import json
import logging
import sys
import datetime
import time
import pandas as pd  # for lookup in annotation file
import spacy
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
import torchvision.transforms as T
from torch.nn.utils.rnn import pad_sequence  # pad batch
from torch.utils.data import DataLoader, Dataset
from PIL import Image  # Load img
from nltk.translate.bleu_score import corpus_bleu


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd  # for lookup in annotation file


In [2]:
import glob

def is_today(date_str):
    today = datetime.today().strftime('%Y-%m-%d')
    return date_str == today

def get_newest_folder_name(directory, pattern='*-*-*'):
    folders = glob.glob(os.path.join(directory, pattern), recursive=True)
    
    newest_folder = max(folders, key=os.path.getmtime)
    return newest_folder

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)
NUM_WORKERS = 0

INPUT_SIZE = (384, 384)
EMBED_SIZE = 256
HIDDEN_SIZE = 256
BATCH_SIZE = 16
BASE_LR = 0.1
MAX_LR = 1.0
MAX_EPOCHS = 200
SAVE_INTERVAL = 10

EARLY_STOP_THRESHOLD = 20
N_SPLITS = 2

CHECKPOINT = None


cuda


In [4]:
root_dir = f'{os.getcwd()}/data'

FOLDER_PATH = get_newest_folder_name(root_dir)

IMAGES_DIR = os.path.join(
    FOLDER_PATH,
    'images'
)

CAPTIONS_DIR = os.path.join(
    FOLDER_PATH,
    'captions.csv'
)

OUTPUT_DIR = f'{os.path.dirname(os.getcwd())}/models'

print(IMAGES_DIR)
print(CAPTIONS_DIR)
print(OUTPUT_DIR)

/home/jiggle/personal/mle/project/project-1/notebooks/data/2024-02-06/images
/home/jiggle/personal/mle/project/project-1/notebooks/data/2024-02-06/captions.csv
/home/jiggle/personal/mle/project/project-1/models


## Utils

### Free GPU Utils

In [5]:
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from numba import cuda
import gc
import torch

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"

def free_gpu_cache(*args):
    gc.collect()
    for param in args:
        del param
        
def free_cuda_cache():
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()
        
#         cuda.select_device(0)
#         cuda.close()
#         cuda.select_device(0)


def wait_until_enough_gpu_memory(min_memory_available=2 * 1024 * 1024 * 1024, max_retries=10, sleep_time=5): #2GB
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(torch.cuda.current_device())

    for _ in range(max_retries):
        free_cuda_cache()
        info = nvmlDeviceGetMemoryInfo(handle)
        if info.free >= min_memory_available:
            break
        print(f"Waiting for {min_memory_available} bytes of free GPU memory. Retrying in {sleep_time} seconds...")
        time.sleep(sleep_time)
    else:
        raise RuntimeError(f"Failed to acquire {min_memory_available} bytes of free GPU memory after {max_retries} retries.")


### Create Custom Logger

In [6]:
import logging

class LogCustomFormatter(logging.Formatter):
    grey = "\x1b[0;37m"
    green = "\x1b[1;32m"
    bold_red = "\x1b[31;1m"
    yellow = "\x1b[1;33m"
    red = "\x1b[1;31m"
    purple = "\x1b[1;35m"
    blue = "\x1b[1;34m"
    light_blue = "\x1b[1;36m"
    reset = "\x1b[0m"
    blink_red = "\x1b[5m\x1b[1;31m"

    format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)'

    FORMATS = {
        logging.DEBUG: grey + format + reset,
        logging.INFO: grey + format + reset,
        logging.WARNING: yellow + format + reset,
        logging.ERROR: red + format + reset,
        logging.CRITICAL: bold_red + format + reset
    }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)
        return formatter.format(record)
    
def setup_logger(logger_name, output_dir = None):
    # Remove existing handlers if the logger already exists
    logger = logging.getLogger(logger_name)
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    logger.setLevel(logging.DEBUG)
    
    # create console handler with a higher log level
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    # create formatter and add it to the handlers
    formatter = LogCustomFormatter()
    ch.setFormatter(formatter)
    # add the handlers to logger
    logger.addHandler(ch)
    
    if output_dir:
        # create file handler which logs even debug messages
        fh = logging.FileHandler(os.path.join(output_dir, 'training_log.log'))
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)
    return logger

def logging_hyperparameters(logger):
    logger.info("==========Hyperparameters==========")
    logger.info(f"Device: {DEVICE}")
    # logger.info(f"Architecture: {ARCH}")
    # logger.info(f"Encoder: {ENCODER_NAME}")
    # logger.info(f"Encoder weight: imagenet")
    logger.info(f"Input size: {INPUT_SIZE}")
    logger.info(f"Batch size: {BATCH_SIZE}")
    logger.info(f"Base learning rate: {BASE_LR}")
    logger.info(f"Max epochs: {MAX_EPOCHS}")
    logger.info(f"Weight decay: {1e-5}")
    logger.info("===================================")


In [7]:
def init_path(model_name):
    weight_dir = os.path.join(OUTPUT_DIR, model_name)
    os.makedirs(weight_dir, exist_ok=True)
    log_dir = weight_dir
    logger_name = model_name

    return weight_dir, log_dir, logger_name

### Create Random Split function

In [8]:
from sklearn.model_selection import StratifiedKFold

def split_dataset(csv_path, n_splits = 2, vocab_freq_threshold = 5):
    df1 = pd.read_csv(csv_path, delimiter='|')
    folder = csv_path.split('/')[-2]
    df1['image'] = f'{folder}/' + df1['image']
    df2 = pd.read_csv(f'{root_dir}/flickr8k/captions.txt', delimiter=',')
    df2['image'] = 'flickr8k/images/' + df2['image']
    df = pd.concat([df1, df2], axis=0)
    
    # df = pd.read_csv(csv_path, delimiter='|')
    # folder = csv_path.split('/')[-2]
    # df['image'] = f'{folder}/' + df['image']
    
    df.reset_index(drop=True)

    X = df[['image', 'caption']]
    y = df['caption']
    
    vocab = Vocabulary(vocab_freq_threshold)
    vocab.build_vocabulary(y.tolist())
    
    # Initialize StratifiedKFold
    stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True)
    folds = []
    
    for train_idx, val_idx in stratified_kfold.split(X, y):
        train_set = df.iloc[train_idx]
        val_set = df.iloc[val_idx]
        folds.append((train_set, val_set))
    
    return folds, vocab
        

## Setup Data

### Build Vocabulary

In [9]:
class Vocabulary:
    def __init__(self, freq_threshold=5):
        self.spacy_eng = spacy.load("en_core_web_sm")
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
    
    def __len__(self):
        return len(self.itos)
    
    def tokenizer_eng(self, text):
        return [tok.text.lower() for tok in self.spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1

                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
        self._save_vocab_dict()
    
    def _save_vocab_dict(self):
        vocab_dict = {
            'itos': self.itos,
            'stoi': self.stoi,
            'freq_threshold': self.freq_threshold
        }

        with open(f'{OUTPUT_DIR}/vocab.json', 'w', encoding='utf8', newline='') as f:
            json.dump(vocab_dict, f)

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]


### Create Custom Dataset

In [10]:
class SpaceDataset(Dataset):
    def __init__(self, root_dir, data_set, vocab, transform=None, freq_threshold=5):
        self.root_dir = root_dir
        self.transform = transform
        
        self.imgs = data_set['image']
        self.captions = data_set['caption']
        
        self.vocab = vocab
        
    def __len__(self):
        return len(self.imgs)
    
    def __getitem__(self, idx):
        caption = self.captions.iloc[idx]
        img_path = self.imgs.iloc[idx]

        img = Image.open(os.path.join(self.root_dir, img_path)).convert("RGB")
        
        if self.transform:
            img = self.transform(img)
        
        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        return img, torch.tensor(numericalized_caption)
        

In [11]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch]
        imgs = torch.cat(imgs, dim=0)
        targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)

        return imgs, targets

### Retrieve Loader from Dataset

In [12]:
def get_loader(
    root_folder,
    fold,
    vocab,
    transform,
    batch_size=32,
    num_workers=8,
    shuffle=True,
    pin_memory=True,
):
    train_images, val_images = fold
    train_set = SpaceDataset(root_folder, train_images, vocab, transform=transform)
    val_set = SpaceDataset(root_folder, val_images, vocab, transform=transform)

    pad_idx = vocab.stoi["<PAD>"]

    train_loader = DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx),
    )
    
    val_loader = DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx),
    )
    
    return train_loader, val_loader

## Setup Models

In [13]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size, train_CNN = False):
        super(EncoderCNN, self).__init__()
        self.train_CNN = train_CNN
        self.inception = models.inception_v3(pretrained=True, aux_logits=True)
        self.inception.fc = nn.Linear(self.inception.fc.in_features, embed_size)
        self.relu = nn.ReLU()
        self.times = []
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, images):
        features = self.inception(images)
        
        return self.dropout(self.relu(features[0]))

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, self.hidden_size, self.num_layers)
        self.linear = nn.Linear(self.hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, features, captions):
        features = features.unsqueeze(0)
  
        embeddings = self.dropout(self.embed(captions))

        embeddings = torch.cat((features, embeddings), dim=0)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs

class CNNtoRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers = 1):
        super(CNNtoRNN, self).__init__()
        self.encoderCNN = EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
    
    def forward(self, images, captions):
        features = self.encoderCNN(images)
        outputs = self.decoderRNN(features, captions)
        return outputs
    
    def caption_image(self, image, vocabulary, max_length=50):
        result_caption = []
        
        with torch.no_grad():
            x = self.encoderCNN(image).unsqueeze(0)
            states = None
            
            for _ in range(max_length):                
                hiddens, states = self.decoderRNN.lstm(x, states)
                output = self.decoderRNN.linear(hiddens.squeeze(0))

                predicted = output.argmax()
                
                result_caption.append(predicted.item())
                x = self.decoderRNN.embed(predicted).unsqueeze(0)
                
                if vocabulary.itos[predicted.item()] == '<EOS>':
                    break
                
        return [vocabulary.itos[idx] for idx in result_caption][1:-1]

## Setup Trainer

In [14]:
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [15]:
class Metrics:
    def __init__(self):
        self.global_step = 0
        self.train_loss = AverageMeter()
        self.val_loss = AverageMeter()
        
        self.bleu_score = 0
        self.references = []
        self.hypotheses = []
        
    def reset(self):
        self.train_loss.reset()
        self.val_loss.reset()
        self.bleu_score = 0
        self.references = []
        self.hypotheses = []


### Implement Train Helper

In [16]:
from tqdm import tqdm

class TrainHelper:
    def __init__(
        self,
        model: nn.Module,
        vocab: Vocabulary,
        train_loader: DataLoader,
        val_loader: DataLoader,
        optimizer: torch.optim,
        metrics: Metrics,
        # tensorboard_writer = None,
        current_epoch: int = 1,
        epochs: int = 1,
        current_fold: int = 1,
        folds: int = 1,
        loss_criteria: torch.nn = None
    ):
        self.model = model.to(DEVICE)
        self.vocab = vocab
        # self.tensorboard_writer = tensorboard_writer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = optimizer
        self.metrics = metrics
        self.current_epoch = current_epoch
        self.epochs = epochs
        self.current_fold = current_fold
        self.folds = folds
        self.loss_criteria = loss_criteria
    
    def train(self):
        self.model.train()
        length = len(self.train_loader)
        self.metrics.reset()
        
        with tqdm(self.train_loader, total=length, unit="batch") as tepoch:
            for batch, (image, caption) in enumerate(tepoch):
                tepoch.set_description(f'Fold[{self.current_fold}/{self.folds}] Epoch[{self.current_epoch}/{self.epochs}] Train')
                
                n = image.shape[0]
                self.optimizer.zero_grad()
                
                image = image.to(DEVICE)
                caption = caption.to(DEVICE)
                
                outputs = self.model(image, caption[:-1])
                loss = self.loss_criteria(outputs.reshape(-1, outputs.shape[2]), caption.reshape(-1))
                
                # self.tensorboard_writer.add_scalar("Training loss", loss.item(), global_step = self.metrics.global_step)
                loss.backward()
                
                self.metrics.train_loss.update(loss.item(), n)
                
                self.metrics.global_step += 1
                
                self.optimizer.step()
                free_gpu_cache(image, caption)
                free_cuda_cache()

                string = self._train_log()
                tepoch.set_postfix_str(string)
        return self.metrics
    
    def eval(self):
        self.model.eval()
        length = len(self.val_loader)
        
        with torch.no_grad():
            with tqdm(self.val_loader, total=length, unit="batch") as tepoch:
                for (images, captions) in tepoch:
                    tepoch.set_description(f'Fold[{self.current_fold}/{self.folds}] Epoch[{self.current_epoch}/{self.epochs}] Eval')
                    n = images.shape[0]
                    
                    images = images.to(DEVICE)
                    captions = captions.to(DEVICE)                  
 
                    for image, caption in zip(images, captions):
                        # Generate caption for each image
                        predicted_caption = model.caption_image(image.unsqueeze(0), self.vocab)

                        # Convert reference to words
                        reference = [self.vocab.itos[idx.item()] for idx in caption if idx.item() != 0]
                        reference = [' '.join(reference)]
                        hypothese = ' '.join(predicted_caption)

                        self.metrics.references.append(reference)
                        self.metrics.hypotheses.append(hypothese)
                    
                    self.metrics.bleu_score = corpus_bleu(self.metrics.references, self.metrics.hypotheses)  
                        
                    free_gpu_cache(images, captions)
                    free_cuda_cache()
                    string = self._val_log()
                    tepoch.set_postfix_str(string)
        return self.metrics
    
    def _train_log(self):
        return f"Loss: {self.metrics.train_loss.avg:.3f}"

    def _val_log(self):
        return f"BLEU Score: {self.metrics.bleu_score:.3f}"
                
        

### Implement Trainer

In [17]:
class Trainer:
    def __init__(
        self,
        folds,
        vocab,
        epochs,
        model: nn.Module,
        optimizer: torch.optim,
        loss_criteria: torch.nn = None
    ):
        # torch.backends.cudnn.benchmark = True
        # for tensorboard
        self.folds = folds
        self.vocab = vocab
        self.epochs = epochs
        self.model = model
        self.optimizer = optimizer
        self.loss_criteria = loss_criteria
        
        self.weight_dir, log_dir, logger_name = init_path(model.__class__.__name__)
        self.logger = setup_logger(logger_name, log_dir)
        self.stale = 0
        self.best_overall = 0
        self.start_time = datetime.datetime.now()
    
    def run(self):
        logging_hyperparameters(self.logger)
        transform = T.Compose(
            [
                T.Resize(INPUT_SIZE),
                T.ToTensor(),
                T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ]
        )
        
        
        for idx, fold in enumerate(self.folds):
            wait_until_enough_gpu_memory()

            # train_loader, val_loader = get_loader(FOLDER_PATH, fold, self.vocab, transform, 
            #                                       batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True, pin_memory=True)

            train_loader, val_loader = get_loader(root_dir, fold, self.vocab, transform, 
                                                  batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True, pin_memory=True)


            self._loops(idx, train_loader, val_loader)
            
            free_gpu_cache(train_loader, val_loader)
    
    def _loops(self, idx, train_loader, val_loader):
        N_FOLDS = len(self.folds)
        metrics = Metrics()
        
        if CHECKPOINT is not None:
            if os.path.exists(CHECKPOINT):
                checkpoint = torch.load(CHECKPOINT)
                self.model.load_state_dict(checkpoint['model_state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                start_epoch = checkpoint['epoch']
                self.best_overall = checkpoint['best_f1']
                self.logger.info(f"Resume training from epoch {start_epoch}")
            else:
                self.logger.info(f"Checkpoint not found, start training from epoch 1")
        
        scheduler = optim.lr_scheduler.CyclicLR(self.optimizer, base_lr=BASE_LR, max_lr=MAX_LR, cycle_momentum=False)
        
        for epoch in range(1, self.epochs + 1):
            trainer_helper = TrainHelper(self.model, self.vocab, train_loader, val_loader, self.optimizer, metrics, epoch, self.epochs, idx + 1, N_FOLDS, self.loss_criteria)

            trainer_helper.train()

            trainer_helper.eval()

            self._save_best(idx, epoch, metrics)
            
            scheduler.step()
            
            if self._is_early_stop():
                break
    
    def _save_best(self, fold, epoch, metrics):
        #Save best model
        to_save = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'best_f1': self.best_overall,
        }
        file_name = self._best_model_file_name(fold, epoch)
        if metrics.bleu_score > self.best_overall:
            self.logger.info(f"Best model found at fold {fold + 1} - epoch {epoch}, saving model")
            self.best_overall = metrics.bleu_score
            torch.save(to_save, os.path.join(self.weight_dir, file_name)) # only save best to prevent output memory exceed error

            self.stale = 0
        else:
            self.stale += 1  
                
        if epoch % SAVE_INTERVAL == 0 or epoch == self.epochs:
            self.logger.info(f"Save model at fold {fold + 1} - epoch {epoch}, saving model")
            torch.save(to_save, os.path.join(self.weight_dir, file_name))
            
    def _best_model_file_name(self, fold, epoch):
        end_time = datetime.datetime.now()
        name = f"{self.start_time}_{end_time}_fold-{fold + 1}_epoch-{epoch}_{INPUT_SIZE[0]}_BS-{BATCH_SIZE}_bleu-score-{self.best_overall:.3f}.pth"
        return name
    
    def _is_early_stop(self) -> bool:
        if self.stale > EARLY_STOP_THRESHOLD:
            self.logger.info(f"No improvement {EARLY_STOP_THRESHOLD} consecutive epochs, early stopping")
            return True
        return False
        

## Train model

In [18]:
folds, vocab = split_dataset(CAPTIONS_DIR, N_SPLITS)



In [19]:
model = CNNtoRNN(EMBED_SIZE, HIDDEN_SIZE, len(vocab), num_layers = 1).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=BASE_LR)

# Only finetune the CNN
for name, param in model.encoderCNN.inception.named_parameters():
    if "fc.weight" in name or "fc.bias" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False
        
train = Trainer(folds, vocab, MAX_EPOCHS, model, optimizer, criterion)
train.run()

[0;37m2024-02-08 09:26:50,852 - CNNtoRNN - INFO - Device: cuda (1707100198.py:57)[0m
[0;37m2024-02-08 09:26:50,853 - CNNtoRNN - INFO - Input size: (384, 384) (1707100198.py:61)[0m
[0;37m2024-02-08 09:26:50,853 - CNNtoRNN - INFO - Batch size: 16 (1707100198.py:62)[0m
[0;37m2024-02-08 09:26:50,854 - CNNtoRNN - INFO - Base learning rate: 0.1 (1707100198.py:63)[0m
[0;37m2024-02-08 09:26:50,854 - CNNtoRNN - INFO - Max epochs: 200 (1707100198.py:64)[0m
[0;37m2024-02-08 09:26:50,855 - CNNtoRNN - INFO - Weight decay: 1e-05 (1707100198.py:65)[0m
  return F.conv2d(input, weight, bias, self.stride,
Fold[1/2] Epoch[1/200] Train: 100%|██████████| 1289/1289 [12:18<00:00,  1.75batch/s, Loss: 5.197]
Fold[1/2] Epoch[1/200] Eval: 100%|██████████| 1289/1289 [42:03<00:00,  1.96s/batch, BLEU Score: 0.025]
[0;37m2024-02-08 10:21:13,306 - CNNtoRNN - INFO - Best model found at fold 1 - epoch 1, saving model (330939360.py:92)[0m
Fold[1/2] Epoch[2/200] Train: 100%|██████████| 1289/1289 [12:30<00:0