In [None]:
import os
import numpy as np
import random
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from nltk.tokenize import word_tokenize
from PIL import Image
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.optim import Adam
import torch.nn as nn
import pandas as pd

from  customDatasetFromCSV import CustomDatasetFromCSV

CREATE DATASET


In [None]:
# Locations of the training images and of the caption / vocabulary tables.
data_dir = "./imagesTrainVal/train2017/"
captions_csv = './filesCSV/captions.csv'
vocab_csv = './filesCSV/vocab.csv'

# Image preprocessing: fixed 224x224 size, tensor conversion, then per-channel
# standardisation with the mean/std values torchvision documents for its
# ImageNet-pretrained models (the encoder in this notebook is a pretrained ResNet-50).
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

# NOTE(review): `percentage` is presumably the share of the data to load —
# confirm against customDatasetFromCSV.
dataset = CustomDatasetFromCSV(
    data_dir,
    captions_csv,
    vocab_csv,
    transform=transform,
    percentage=100,
)

CHECK DEVICE

In [None]:
import torch

# Report CUDA availability and choose the device used by the rest of the notebook.
cuda_available = torch.cuda.is_available()
print("CUDA disponible:", cuda_available)

if cuda_available:
    # GPU name and compute capability of device 0, then select it.
    print("Nombre de la GPU:", torch.cuda.get_device_name(0))
    print("Capacidad de la GPU:", torch.cuda.get_device_capability(0))
    device = torch.device("cuda:0")
    print("Using GPU:", torch.cuda.get_device_name(device))
else:
    # Without a GPU everything runs on the CPU.
    device = torch.device("cpu")
    print("No GPU available. Using CPU.")

TEST IF THE DATASET IS CORRECT AND IF THE METHODS ARE WORKING

In [None]:

# Quick sanity checks of the dataset object: size, vocabulary, caption helpers, plotting.
print(len(dataset))
print("Vocab: ", dataset.vocab)
print(f'all captions are the same size: {dataset.allCaptionsSameSize()}')

# Element [1] of a dataset item is the caption (the loaders later unpack items
# as `images, captions`).
train_item_caption = dataset[43][1]
print(f'We get a random caption of the train:´{train_item_caption} ')

# NOTE(review): the range starts at 1, so index 0 is not among the ten printed.
print('Print first ten captions')
for caption_index in range(1, 11):
    print(dataset.get_caption(caption_index))
    print(dataset.get_text_caption(caption_index))

dataset.print_image_with_caption(45677)

CREATE THE ENCODER-DECODER ARCHITECTURE


In [None]:
class EncoderCNN(nn.Module):
    """Image encoder: pretrained ResNet-50 without its classifier, plus a linear projection.

    Args:
        embed_size: Size of the feature vector produced for each image.
        device: Device the sub-modules are moved to at construction time.
        train_backbone: When True (the default, matching the previous
            behaviour) the ResNet weights receive gradients and are
            fine-tuned. Pass False to freeze the backbone and train only the
            projection layer.
    """

    def __init__(self, embed_size, device, train_backbone=True):
        super(EncoderCNN, self).__init__()
        from torchvision.models.resnet import resnet50
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True`; kept as-is so the same weights load.
        resnet = resnet50(pretrained=True)
        self.device = device

        # Enable or disable gradient tracking for every backbone parameter.
        for param in resnet.parameters():
            param.requires_grad_(train_backbone)

        # Drop the final fully connected layer; what remains ends in the
        # pooling stage, so the output can be flattened per image.
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules).to(self.device)
        # Projects the flattened backbone features to `embed_size`.
        self.embed = nn.Linear(resnet.fc.in_features, embed_size).to(self.device)

    def forward(self, images):
        """Return a (batch, embed_size) tensor of image features.

        Args:
            images: Batch of images; the first dimension is the batch.
        """
        features = self.resnet(images)
        # Collapse everything after the batch dimension into one vector.
        features = features.flatten(1)
        return self.embed(features)
# --------- Decoder ----------


class DecoderRNN(nn.Module):
    """LSTM caption decoder that sees the image features at every time step.

    Each step concatenates the image feature vector with the embedding of the
    previous token, runs one LSTM step and maps the result to vocabulary
    scores. Decoding always runs for exactly `caption_lenght` steps.

    Args:
        embed_size: Size of the token embeddings; the image features must have
            the same size because the LSTM input is `embed_size * 2` wide.
        hidden_size: Size of the LSTM hidden and cell states.
        vocab_size: Number of token ids.
        caption_lenght: Number of decoding steps (spelling kept for callers).
        num_layers: Number of stacked LSTM layers.
        device: Device the layers and the initial states are created on.
    """

    def __init__(self, embed_size, hidden_size, vocab_size,caption_lenght ,num_layers,device):
        super(DecoderRNN, self).__init__()
        self.device = device
        self.num_layers = num_layers
        self.caption_lenght = caption_lenght
        self.hidden = None
        self.hidden_dim = hidden_size
        self.embed_size = embed_size
        # Token id -> embedding vector.
        self.embed = nn.Embedding(vocab_size, embed_size).to(self.device)
        # Consumes [image features ; token embedding], hence embed_size * 2.
        self.lstm = nn.LSTM(embed_size*2, hidden_size, num_layers, batch_first=True).to(self.device)
        # Hidden state -> scores over the vocabulary.
        self.linear = nn.Linear(hidden_size, vocab_size).to(self.device)

    def init_hidden(self, batch_size,device):
        """Return zero-filled (hidden, cell) states of shape (num_layers, batch_size, hidden_dim)."""
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        return (
            torch.zeros(*state_shape, device=device),
            torch.zeros(*state_shape, device=device),
        )

    def _first_input(self, batch_size):
        """Return a (batch_size, 1) tensor of id 0, the token fed at the first step."""
        return torch.tensor([0], device=self.device).expand(batch_size).unsqueeze(1)

    def _step(self, image_step, token_ids, state):
        """Run one decoding step; return ((batch, 1, vocab_size) scores, new LSTM state)."""
        token_embedding = self.embed(token_ids)
        lstm_input = torch.cat((image_step, token_embedding), dim=2)
        lstm_out, state = self.lstm(lstm_input, state)
        return self.linear(lstm_out), state

    def forward(self, features, captions):
        """Decode with teacher forcing.

        Step 0 receives id 0; step t+1 receives the ground-truth token
        `captions[:, t]`.

        Args:
            features: (batch, embed_size) image features.
            captions: (batch, >= caption_lenght) ground-truth token ids.

        Returns:
            (batch, caption_lenght, vocab_size) unnormalised token scores.
        """
        batch_size = features.size(0)
        step_input = self._first_input(batch_size)
        state = self.init_hidden(batch_size, self.device)
        image_step = features.unsqueeze(1)

        step_scores = []
        for step in range(self.caption_lenght):
            scores, state = self._step(image_step, step_input, state)
            # The ground-truth token becomes the input of the next step.
            step_input = captions[:, step].unsqueeze(1).to(self.device)
            step_scores.append(scores)

        return torch.cat(step_scores, dim=1)

    def generate(self, features):
        """Decode greedily, feeding each step's argmax token into the next step.

        Args:
            features: (batch, embed_size) image features from the encoder.

        Returns:
            (batch, caption_lenght, vocab_size) unnormalised token scores.
        """
        batch_size = features.size(0)
        step_input = self._first_input(batch_size).to(self.device)
        state = self.init_hidden(batch_size, self.device)
        image_step = features.unsqueeze(1)

        step_scores = []
        for _ in range(self.caption_lenght):
            scores, state = self._step(image_step, step_input, state)
            predicted = scores.argmax(2).to(self.device)
            # A predicted id 2 is fed back as id 0.
            # NOTE(review): presumably 2 is the end token and 0 the padding
            # token; decoding still continues after it — confirm intent.
            step_input = predicted.masked_fill(predicted == 2, 0)
            step_scores.append(scores)

        return torch.cat(step_scores, dim=1)

class Model1(nn.Module):
    """Captioning model: an EncoderCNN feeding a DecoderRNN.

    Args:
        embed_size: Size of the image features and token embeddings.
        hidden_size: Size of the decoder LSTM state.
        vocab_size: Number of token ids.
        caption_length: Number of decoding steps.
        num_layers: Number of decoder LSTM layers.
        device: Device both sub-modules are placed on.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, caption_length, num_layers,device):
        super().__init__()
        # `.to` returns the module itself, so building and moving can be chained.
        self.encoder = EncoderCNN(embed_size, device).to(device)
        self.decoder = DecoderRNN(
            embed_size, hidden_size, vocab_size, caption_length, num_layers, device
        ).to(device)

    def forward(self, images, captions):
        """Encode `images` and decode with `captions` as teacher-forcing input."""
        return self.decoder(self.encoder(images), captions)

    def generate(self,image):
        """Encode `image` and return the decoder's `generate` output for it."""
        return self.decoder.generate(self.encoder(image))

HYPERPARAMETERS

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F



# Model hyperparameters.
embed_size = 256
hidden_size = 512
num_layers = 2
vocab = dataset.vocab
vocab_size = len(vocab)

# Build the model; one `.to(device)` on the top-level module also moves the
# encoder and the decoder.
model = Model1(embed_size, hidden_size, vocab_size, dataset.maxCaptionLength, num_layers, device)
model.to(device)

# Training-loop settings.
learning_rate = 0.001
num_epochs = 3

# Cross-entropy that skips targets equal to 0, and an Adam optimizer over all
# model parameters.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Batch size for training and validation (the original note says larger
# values were too much for the author's machine).
batch_size = 64

# The dataset is indexed as [train | val | test]; each loader samples randomly
# from its own contiguous index range.
train_end = len(dataset.train_data)
val_end = train_end + len(dataset.val_data)

train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=SubsetRandomSampler(range(train_end)),
)
val_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=SubsetRandomSampler(range(train_end, val_end)),
)
# The test loader uses batches six times larger.
test_loader = DataLoader(
    dataset,
    batch_size=batch_size*6,
    sampler=SubsetRandomSampler(range(val_end, len(dataset))),
)

# Number of training batches per epoch (shown as the cell output).
len(train_loader)

NUMBER OF TRAINABLE PARAMETERS

In [None]:

# Function to count the number of parameters
def count_parameters(model):
    """Return how many scalar values the model's trainable parameters hold.

    Only parameters with `requires_grad` set are counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Move the model to the selected device (`.to` returns the module itself) and
# report how many parameter values have requires_grad set.
total_params = count_parameters(model.to(device))
print(f'Total number of parameters: {total_params}')

CHECK CONSISTENCY

In [None]:
def check_device_consistency(model):
    """Return True when every parameter and buffer of `model` is on one device.

    Objects without a `device` attribute count as device `None`. A model with
    no parameters and no buffers is reported as consistent.
    """
    tensors = [*model.parameters(), *model.buffers()]
    devices = [getattr(tensor, "device", None) for tensor in tensors]
    if not devices:
        return True

    # Compare every entry against the first one.
    reference = devices[0]
    for current in devices:
        if not (current == reference):
            return False
    return True

# Run the device check on the model and report the outcome.
is_device_consistent = check_device_consistency(model)

print(
    "All parameters and buffers are on the same device."
    if is_device_consistent
    else "Parameters and buffers are on different devices."
)



TRAINING LOOP

In [None]:
import os
def _mean_or_nan(values):
    """Return the arithmetic mean of `values`, or NaN when the list is empty."""
    return sum(values) / len(values) if values else float('nan')


def _ids_to_caption(token_ids, unknown_word):
    """Join the words for `token_ids` with spaces, dropping id 0 and using `unknown_word` for unmapped ids."""
    words = [dataset.id_to_token.get(int(token_id), unknown_word) for token_id in token_ids if token_id != 0]
    return " ".join(words)


def _caption_loss(outputs, captions):
    """Return the token-level loss of `outputs` against `captions` using the notebook's `criterion`.

    `criterion` is the CrossEntropyLoss created with ignore_index=0, so
    positions whose target id is 0 do not contribute to the loss.
    """
    return criterion(outputs.reshape(-1, vocab_size), captions.reshape(-1))


def _log_training_samples(epoch, batch_counter, total_batches, loss_value, images, captions, outputs):
    """Print teacher-forced and free-running captions for the first sample of the batch and show its image."""
    with torch.no_grad():
        real_caption_str = _ids_to_caption(captions[0].tolist(), '<UNK>')

        print("################### TRAINING METHOD ################### ")
        generated_caption_str = _ids_to_caption(outputs[0].argmax(dim=1).tolist(), "UNK")
        print(f"Epoch [{epoch + 1}/{num_epochs}] - Batch [{batch_counter}/{total_batches}] - Loss: {loss_value:.4f}")
        print(f"Generated Output: {generated_caption_str}")
        print(f"Real Caption: {real_caption_str}")

        print("################### GENERATE METHOD ################### ")
        image = np.transpose(images[0].cpu().numpy(), (1, 2, 0))
        # Undo the per-channel standardisation so the picture shows with
        # natural colours. Assumes the batch went through the Normalize
        # transform configured for `dataset` — TODO confirm.
        channel_mean = np.array([0.485, 0.456, 0.406])
        channel_std = np.array([0.229, 0.224, 0.225])
        image = np.clip(image * channel_std + channel_mean, 0.0, 1.0)
        plt.imshow(image)
        plt.axis('off')
        plt.show()

        # Generate in eval mode: in train mode the BatchNorm layers would
        # update their running statistics during this extra forward pass.
        # Only the displayed sample is decoded.
        model.eval()
        try:
            generated_outputs = model.generate(images[:1])
        finally:
            model.train()
        generated_caption_str = _ids_to_caption(generated_outputs.argmax(dim=2)[0].tolist(), "UNK")
        print(f"Generated Output (Generated): {generated_caption_str}")
        print(f"Real Caption: {real_caption_str}")


def _average_validation_loss():
    """Return the mean per-batch loss over `val_loader` with the model in eval mode."""
    model.eval()
    val_losses = []
    with torch.no_grad():
        for images, captions in val_loader:
            images = images.to(device)
            captions = captions.to(device)
            outputs = model(images, captions)
            val_losses.append(_caption_loss(outputs, captions).item())
    return _mean_or_nan(val_losses)


def train_model(search_for_existing_model=False):
    """Train the global `model` for `num_epochs` epochs and save a checkpoint after each one.

    Uses the notebook globals `model`, `optimizer`, `criterion`,
    `train_loader`, `val_loader`, `dataset`, `device`, `vocab_size` and
    `num_epochs`. Every 100 training batches a sample caption is printed.
    After each epoch the validation loss is computed and the state dict is
    written to `SAVED_MODELS_2024/model5`.

    The loss is computed with `criterion`, so targets equal to 0 are ignored;
    reported losses are therefore not comparable with runs that counted them.

    Args:
        search_for_existing_model: Accepted for compatibility; currently not
            used (training always starts from epoch 0).
    """
    save_dir = 'SAVED_MODELS_2024/model5'
    os.makedirs(save_dir, exist_ok=True)

    start_epoch = 0

    for epoch in range(start_epoch, num_epochs):
        model.train()
        total_batches = len(train_loader)
        train_losses = []

        progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
        for batch_counter, (images, captions) in enumerate(progress, start=1):
            images = images.to(device)
            captions = captions.to(device)

            optimizer.zero_grad()
            outputs = model(images, captions)
            loss = _caption_loss(outputs, captions)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

            # Show sample captions every 100 batches.
            if batch_counter % 100 == 0:
                _log_training_samples(
                    epoch, batch_counter, total_batches, loss.item(), images, captions, outputs
                )

        avg_train_loss = _mean_or_nan(train_losses)
        avg_val_loss = _average_validation_loss()

        # Save the model after each epoch; the validation loss is part of the file name.
        model_path = os.path.join(save_dir, f'model_epoch_{epoch + 1}_11_06_2024_val_loss_{avg_val_loss}.pth')
        torch.save(model.state_dict(), model_path)
        print(f"Epoch [{epoch + 1}/{num_epochs}] - Training Loss: {avg_train_loss:.4f} - Validation Loss: {avg_val_loss:.4f}")

    print("Training completed.")

# Start training. NOTE(review): train_model accepts search_for_existing_model
# but does not read it, so the flag has no effect on the run.
train_model(search_for_existing_model=True)    

CHOOSING THE TRAINED MODEL TO TEST

In [None]:
# Rebuild the model with the hyperparameters used for training so this cell
# can run without the training cell.
embed_size = 256
hidden_size = 512
num_layers = 2
vocab = dataset.vocab
vocab_size = len(vocab)
model = Model1(embed_size, hidden_size, vocab_size, dataset.maxCaptionLength, num_layers, device)
model.to(device)

# The directory name matches the training save_dir ('SAVED_MODELS_2024/model5');
# the previous 'Model5' spelling only resolves on case-insensitive file systems.
model_path = "./SAVED_MODELS_2024/model5/model_epoch_1_11_06_2024_val_loss_0.5446846502536052.pth"

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
# NOTE(review): strict=False tolerates missing/unexpected keys; the returned
# key report is the cell output, check that it lists none.
model.load_state_dict(torch.load(model_path, map_location=device), strict=False)

NOW TESTING

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import csv




# CSV file that receives one (real caption, generated caption) row per test
# sample. The path is the one the metrics cell reads: directory 'Model5' and a
# '.csv' extension (it previously pointed at 'ModeL5/... .pth').
csv_file = './Results2024/Model5/model_epoch_1_11_06_2024_val_loss_0.5446846502536052.csv'
os.makedirs(os.path.dirname(csv_file), exist_ok=True)


def _decode_caption(token_ids, unknown_word):
    """Join the words for `token_ids` with spaces, dropping id 0 and using `unknown_word` for unmapped ids."""
    words = [dataset.id_to_token.get(int(token_id), unknown_word) for token_id in token_ids if token_id != 0]
    return " ".join(words)


# Testing loop on the test dataset.
model.eval()

# The file is opened once for the whole run; UTF-8 keeps non-ASCII words
# readable by pandas regardless of the platform default encoding.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file, torch.no_grad():
    writer = csv.writer(file)
    writer.writerow(['Real Caption', 'Generated Caption'])

    for batch_counter, (images, captions) in enumerate(tqdm(test_loader, desc='Testing')):
        images = images.to(device)
        captions = captions.to(device)

        # Greedy decoding; argmax over the vocabulary axis gives the token ids.
        outputs = model.generate(images)
        generated_ids_batch = outputs.argmax(dim=2).tolist()
        real_ids_batch = captions.tolist()

        for sample_index, (real_ids, generated_ids) in enumerate(zip(real_ids_batch, generated_ids_batch)):
            generated_caption_str = _decode_caption(generated_ids, "UNK")
            real_caption_str = _decode_caption(real_ids, '<UNK>')

            # Print the first sample of every 20th batch as a progress check.
            if batch_counter % 20 == 0 and sample_index == 0:
                print(f"Generated Output: {generated_caption_str}")
                print(f"Real Caption: {real_caption_str}")

            writer.writerow([real_caption_str, generated_caption_str])

print("Testing completed. Real and generated captions saved.")


METRICS CALCULATION

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Download NLTK data
nltk.download('punkt')

# Load the CSV file. dtype=str with keep_default_na=False keeps every caption
# a string, so an empty generated caption is read as "" rather than NaN.
csv_file = './Results2024/Model5/model_epoch_1_11_06_2024_val_loss_0.5446846502536052.csv'
df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

# Lists to store metric scores
bleu_scores = []
rouge_l_scores = []

# Smoothing function for BLEU score
smoother = SmoothingFunction().method1
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

for real_text, generated_text in zip(df['Real Caption'], df['Generated Caption']):
    real_caption_tokenized = word_tokenize(real_text)
    generated_caption_tokenized = word_tokenize(generated_text)

    # BLEU Score: sentence_bleu expects token lists (a list of reference token
    # lists and one hypothesis token list). Passing plain strings would score
    # character n-grams instead of word n-grams.
    bleu = sentence_bleu([real_caption_tokenized], generated_caption_tokenized, smoothing_function=smoother)
    bleu_scores.append(bleu)

    # ROUGE-L Score: the scorer takes strings, so the tokens are re-joined.
    rouge_scores = scorer.score(" ".join(real_caption_tokenized), " ".join(generated_caption_tokenized))
    rouge_l_scores.append(rouge_scores['rougeL'].fmeasure)

# Add metric columns to the DataFrame
df['BLEU Score'] = bleu_scores
df['ROUGE-L Score'] = rouge_l_scores

# Calculate mean scores (NaN when the CSV has no rows)
mean_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else float('nan')
mean_rouge_l_score = sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else float('nan')

# Print the DataFrame with scores and mean scores
print(df)
print("\nMean Metrics:")
print(f"Mean BLEU Score: {mean_bleu_score:.4f}")
print(f"Mean ROUGE-L Score: {mean_rouge_l_score:.4f}")


In [None]:
from PIL import Image
from torchvision import transforms
import torch
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
import cv2

model_path = "./SAVED_MODELS/model1/best_model_epoch_5.pth"

# Hyperparameters of the checkpoint being loaded.
# NOTE(review): num_layers is 1 here while the training cell uses 2 —
# presumably this checkpoint comes from an earlier run; confirm it matches.
embed_size = 256
hidden_size = 512
num_layers = 1
vocab = dataset.vocab
vocab_size = len(vocab)

# Initialize the model; `.to(device)` on the top-level module also moves the
# encoder and the decoder.
model = Model1(embed_size, hidden_size, vocab_size, dataset.maxCaptionLength, num_layers, device)
model.to(device)

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))



def load_and_preprocess_image_cv2(image_path, normalize=False):
    """Load an image with OpenCV and return it as a (1, 3, 224, 224) float tensor.

    The image is converted from BGR to RGB, resized to 224x224 and scaled to
    the range [0, 1].

    Args:
        image_path: Path of the image file.
        normalize: When True, additionally standardise each channel with the
            mean/std used by `load_and_preprocess_image`. Defaults to False,
            which keeps the previous output.

    Returns:
        A float32 tensor with a leading batch dimension of 1.

    Raises:
        OSError: If OpenCV cannot read an image from `image_path`.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise OSError(f"cv2.imread could not read an image from {image_path!r}")

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    image = cv2.resize(image, (224, 224))  # Resize to (224, 224)
    image = image / 255.0  # Scale pixel values to the range [0, 1]
    tensor = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1)  # HWC -> CHW

    if normalize:
        channel_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        channel_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
        tensor = (tensor - channel_mean) / channel_std

    return tensor.unsqueeze(0)  # Add batch dimension
# Function to load and preprocess the image
def load_and_preprocess_image(image_path):
    """Open an image with PIL and return it as a normalised tensor with a batch dimension of 1.

    The image is converted to RGB, resized to 224x224, turned into a tensor
    and standardised per channel.
    """
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
    rgb_image = Image.open(image_path).convert('RGB')
    # unsqueeze(0) adds the batch dimension in front.
    return preprocess(rgb_image).unsqueeze(0)
def generate_caption(model, image_path, id_to_token, real_description=None):
    """Show an image, print the caption the model generates for it and return that caption.

    Args:
        model: Captioning model exposing `generate(image)`; it is switched to
            eval mode.
        image_path: Path of the image file.
        id_to_token: Mapping from token id to word. Ids missing from it are
            rendered as "UNK"; id 0 is dropped.
        real_description: Optional reference description printed next to the
            generated one.

    Returns:
        The generated caption as a space-separated string.
    """
    # Preprocess with the PIL-based loader.
    image = load_and_preprocess_image(image_path)

    # Put the image on whatever device the model already lives on.
    device = next(model.parameters()).device
    image = image.to(device)
    model.eval()

    # Display the image. cv2.imread returns None when it cannot read the
    # file; fall back to PIL, which has already opened it successfully.
    preview = cv2.imread(image_path)
    if preview is not None:
        plt.imshow(cv2.cvtColor(preview, cv2.COLOR_BGR2RGB))
    else:
        plt.imshow(Image.open(image_path).convert('RGB'))
    plt.axis('off')
    plt.show()

    with torch.no_grad():
        output = model.generate(image)

    # argmax over the vocabulary axis gives one id per step; [0] selects the
    # single image in the batch and stays a list even for one-step captions.
    generated_output_ids = output.argmax(dim=2)[0].tolist()
    generated_words = [id_to_token.get(token_id, "UNK") for token_id in generated_output_ids if token_id != 0]
    generated_caption_str = " ".join(generated_words)

    print("Generated description by the model: ", "###  ", generated_caption_str, "  ###")
    if real_description:
        print(f"Real description made by hand by the tester: ###  {real_description}  ###")

    return generated_caption_str

# Example usage: two test photos, each with a hand-written reference description.
example_images = [
    ('./ImagenesPrueba/20220511_155217.jpg', "A man taking an image at a bathroom"),
    ('./ImagenesPrueba/20230413_031752.jpg', " A man taking an image at a bathroom"),
]
for example_path, example_description in example_images:
    generate_caption(model, example_path, dataset.id_to_token, real_description=example_description)



# Caption every image file in the folder.
folder_path = './ImagenesPrueba/'
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff', '.webp')

# sorted() makes the order reproducible; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    image_path = os.path.join(folder_path, filename)

    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints) and files that
    # are not images; they cannot be opened as pictures.
    if not os.path.isfile(image_path) or not filename.lower().endswith(image_extensions):
        continue

    # Print the image filename
    print(f"Image: {filename}")

    # Use the generate_caption function
    generate_caption(model, image_path, dataset.id_to_token)
    print("\n")