In [None]:
!pip install transformers torch nltk datasets Pillow




In [None]:
import os
import re

import torch
from datasets import load_dataset
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import corpus_bleu
from PIL import Image
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import VisionEncoderDecoderModel,ViTFeatureExtractor, ViTImageProcessor, AutoTokenizer,  BertTokenizer, BertForSequenceClassification


# Generation constants
max_length = 15
num_beams = 6

# The same checkpoint supplies the encoder-decoder weights, the image
# preprocessor and the tokenizer.
checkpoint_name = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(checkpoint_name)
feature_extractor = ViTImageProcessor.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)


dataset = load_dataset("biglam/dating-historical-color-images")

# Class index -> decade text that is appended to each caption.
decade_labels = dict(enumerate(['1930', '1940', '1950', '1960', '1970']))


class HistoricalImageDataset(Dataset):
    """Pair every image with its caption, extended by the image's decade.

    Parameters:
    dataset: Indexable collection whose items are mappings with an 'image'
        entry (an object exposing ``convert("RGB")``) and a 'label' entry.
    captions: Sequence of pre-generated captions, aligned by index with
        ``dataset``.
    transform: Optional callable applied to the RGB image.
    label_names: Optional mapping from ``item['label']`` to the text placed
        after "during". When omitted, the module-level ``decade_labels``
        mapping is looked up at item-access time.

    Raises:
    ValueError: If there are fewer captions than dataset items.
    """

    def __init__(self, dataset, captions, transform=None, label_names=None):
        # Fail fast: a short caption list would otherwise surface as an
        # IndexError somewhere in the middle of an epoch.
        if len(captions) < len(dataset):
            raise ValueError(
                f"Expected at least {len(dataset)} captions, got {len(captions)}"
            )
        self.dataset = dataset
        self.captions = captions
        self.transform = transform
        self.label_names = label_names

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, caption)`` where caption ends with ' during <decade>'."""
        item = self.dataset[idx]
        image = item['image'].convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Fall back to the notebook-wide mapping only when none was supplied.
        names = self.label_names if self.label_names is not None else decade_labels

        # Append the date to the pre-generated caption
        date = names[item['label']]
        full_caption = self.captions[idx].rstrip() + f' during {date}'

        return image, full_caption




# Preprocessing and augmentation applied to every training image
augmentations = [
    transforms.Resize(size=(224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(0.5, 0.5, 0.5, 0.5),  # brightness, contrast, saturation, hue
    transforms.ToTensor(),
]
transform = transforms.Compose(augmentations)

# Training set: images from the 'train' split paired with their captions.
# NOTE(review): `captions` is not assigned in any cell shown above; it has to
# exist already (presumably one caption per item of dataset['train']) - confirm.
train_dataset = HistoricalImageDataset(dataset['train'], captions, transform)

# Shuffled mini-batches of two samples
data_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)




In [None]:

optimizer = AdamW(model.parameters(), lr=4e-5)
num_epochs = 5
print_interval = 100

# Appended to every caption so that the end of the caption is an explicit
# training target (empty when the tokenizer defines no EOS token).
eos_suffix = tokenizer.eos_token or ""

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # The batch is bound to `batch_captions` so the loop does not overwrite the
    # notebook-level `captions` list the dataset was built from.
    for i, (images, batch_captions) in enumerate(data_loader):
        # No max_length here: without truncation it has no effect, and
        # truncating would cut off the trailing " during <decade>".
        texts = [caption + eos_suffix for caption in batch_captions]
        encoded = tokenizer(texts, padding=True, return_tensors="pt")

        # Labels are the token ids themselves. The model shifts them to the
        # right internally to build the decoder inputs, so rolling them here
        # would misalign every target. Padding positions are set to -100 so
        # they do not contribute to the loss.
        labels = encoded.input_ids.masked_fill(encoded.attention_mask == 0, -100)
        labels = labels.to(device)

        # The transform pipeline ends in ToTensor(), so pixel values are
        # already in [0, 1]; do_rescale=False prevents a second division by 255.
        pixel_values = feature_extractor(
            images=images, return_tensors="pt", do_rescale=False
        ).pixel_values.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values=pixel_values, labels=labels)

        # With labels supplied the model returns the cross-entropy loss itself.
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (i + 1) % print_interval == 0:
            # Print caption every 'print_interval' iterations
            print("Caption:", batch_captions)
            print(f"Iteration [{i + 1}/{len(data_loader)}], Epoch {epoch + 1}/{num_epochs}, Total Loss: {total_loss}")


    print(f"Epoch {epoch + 1}/{num_epochs}, Total Loss: {total_loss}")




Caption: ('a mountain range with a river and mountains during 1950', 'a large building with a clock on top of it during 1940')
Iteration [100/663], Epoch 1/5, Total Loss: 219.33620190620422
Caption: ('a man standing on top of a hill holding a stick during 1930', 'a double decker bus parked in front of a brick building during 1970')
Iteration [200/663], Epoch 1/5, Total Loss: 387.8956495523453
Caption: ('a beach filled with lots of sand and water during 1970', 'a large group of people standing in front of a sign during 1960')
Iteration [300/663], Epoch 1/5, Total Loss: 542.3875263929367
Caption: ('two little girls standing next to each other during 1950', 'a man standing in front of a truck during 1940')
Iteration [400/663], Epoch 1/5, Total Loss: 692.8588055968285
Caption: ('a train on a train track during 1930', 'a large building with a clock on the front of it during 1950')
Iteration [500/663], Epoch 1/5, Total Loss: 836.9095251560211
Caption: ('a man standing next to an old fashione

In [None]:
from PIL import Image
import requests
from io import BytesIO
from nltk.translate.bleu_score import corpus_bleu
import nltk
# Download the 'punkt' tokenizer data; nltk.word_tokenize is called further
# down in this cell.
nltk.download('punkt')
# Progress marker printed before the evaluation code runs.
print("Evaluating metrics...")


def generate_caption_for_url_image(model, feature_extractor, url, max_length=16, num_beams=4,
                                   caption_tokenizer=None, timeout=30):
    """
    Generate a caption for an image obtained from a URL.

    Parameters:
    model: The loaded VisionEncoderDecoderModel.
    feature_extractor: The loaded feature extractor for the model.
    url (str): URL of the image.
    max_length (int): Maximum length of the generated caption.
    num_beams (int): Number of beams for beam search.
    caption_tokenizer: Tokenizer used to decode the generated ids. Defaults to
        the notebook-level ``tokenizer``.
    timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
    str: Generated caption for the image, without surrounding whitespace.

    Raises:
    requests.RequestException: If the download fails, times out, or the
        server answers with an error status.

    Side effects:
    Puts ``model`` into evaluation mode.
    """
    # Download the image from the URL; fail loudly on HTTP errors instead of
    # handing an error page to PIL.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Force three channels, matching the .convert("RGB") used for training
    # images (grayscale or RGBA downloads would otherwise differ in shape).
    image = Image.open(BytesIO(response.content)).convert("RGB")

    # Process the image
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(model.device)

    # Generate caption
    model.eval()
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)

    # Decode the generated caption
    text_tokenizer = tokenizer if caption_tokenizer is None else caption_tokenizer
    caption = text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()

url = 'https://i.insider.com/4dd3dedbcadcbb92571b0000?width=1000&format=jpeg&auto=webp'
generated_caption = generate_caption_for_url_image(model, feature_extractor, url)
real_caption = 'a group of woman pose for a picture in the 1940s'
print(generated_caption)
real_tokens = nltk.word_tokenize(real_caption.lower())
generated_tokens = nltk.word_tokenize(generated_caption.lower())

# Convert the individual tokens into lists of sentences
real_caption_list = [real_tokens]
generated_caption_list = [generated_tokens]


# "saved_images" is listed as a directory by the comparison cell that follows,
# so it is created as a directory here and each caption pair gets its own file
# (line 1: real caption, line 2: generated caption).
output_dir = "saved_images"
os.makedirs(output_dir, exist_ok=True)

for index, (real_sent, generated_sent) in enumerate(zip(real_caption_list, generated_caption_list)):
    real_sent_str = " ".join(real_sent)
    generated_sent_str = " ".join(generated_sent)

    file_path = os.path.join(output_dir, f"captions_{index}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"Real Caption: {real_sent_str}\n")
        file.write(f"Generated Caption: {generated_sent_str}\n")

print(f"Captions saved to {output_dir}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


 large group of people standing in front of a building during 1950a
BLEU score: 9.788429383461836e-232


In [None]:
def numbers_match(string1, string2):
    """
    Check if any whole number present in string1 is also present in string2.

    A number is a maximal run of digits, so "1940s" contributes "1940".
    Numbers are compared as complete runs: "1940" does not match "1950",
    and "19" does not match "1940".

    Args:
    string1 (str): The first string to be analyzed.
    string2 (str): The second string to be analyzed.

    Returns:
    bool: True if any number in string1 is found in string2, False otherwise
    (including when string1 contains no number at all).
    """
    # Extracting whole numbers (not single digits) from the first string
    numbers_in_string1 = set(re.findall(r"\d+", string1))
    if not numbers_in_string1:
        return False

    # Checking if any number from string1 exists as a number in string2
    numbers_in_string2 = set(re.findall(r"\d+", string2))
    return not numbers_in_string1.isdisjoint(numbers_in_string2)

import os

directory_path = "saved_images"

# Accept both layouts: a directory holding one caption file per pair, or a
# single caption file stored directly at `directory_path`.
if os.path.isdir(directory_path):
    caption_files = sorted(
        os.path.join(directory_path, filename)
        for filename in os.listdir(directory_path)
    )
    caption_files = [path for path in caption_files if os.path.isfile(path)]
else:
    caption_files = [directory_path]

count = 0

for file_path in caption_files:
    # Read into dedicated names so the notebook-level `real_caption` and
    # `generated_caption` (used again by the BLEU cell) are not overwritten.
    with open(file_path, "r") as file:
        real_line = file.readline().strip()
        generated_line = file.readline().strip()

    if numbers_match(real_line, generated_line):
        print("Success!")
        count += 1  # only genuine matches are counted
    else:
        print("No Match!")

print(f"Total matches found: {count}")

In [None]:


from typing import List

def evaluate_caption(generated_caption: str, reference_caption: str) -> float:
    """
    Evaluate a generated caption against a single reference caption using the BLEU score.

    Both captions are lowercased and split on whitespace. The score is
    smoothed: a single short caption rarely shares any 4-gram with its
    reference, and unsmoothed BLEU then collapses to (almost) zero no matter
    how many words agree.

    Parameters:
    generated_caption (str): The caption generated by the model.
    reference_caption (str): A single reference caption.

    Returns:
    float: The smoothed BLEU score for the generated caption; 0.0 when either
    caption contains no tokens.
    """

    # Tokenize the captions into words
    tokenized_generated_caption = generated_caption.lower().split()
    tokenized_reference_caption = reference_caption.lower().split()

    # Nothing to compare against
    if not tokenized_generated_caption or not tokenized_reference_caption:
        return 0.0

    # Format the generated caption for BLEU evaluation (list of lists)
    candidate = [tokenized_generated_caption]

    # Format the reference caption for BLEU evaluation (list of list of lists)
    references = [[tokenized_reference_caption]]

    # Compute the BLEU score
    bleu_score = corpus_bleu(
        references, candidate, smoothing_function=SmoothingFunction().method1
    )
    return bleu_score
score = evaluate_caption(generated_caption, real_caption)
print(f"BLEU score: {score}")

# Decade accuracy: generate a caption for every image the loader yields and
# check whether the decade closing the reference caption appears in it.
# NOTE(review): `data_loader` serves the shuffled, augmented training images,
# so this is a training-set figure, not a held-out one.

# Reference captions are a base caption plus " during <decade>"; leave more
# room than `max_length` so the decade is not cut off - TODO tune.
eval_max_length = 32

correct = 0
total = 0
model.eval()
with torch.no_grad():
    for images, batch_captions in data_loader:
        # Loader images are already scaled to [0, 1] by ToTensor().
        pixel_values = feature_extractor(
            images=images, return_tensors="pt", do_rescale=False
        ).pixel_values.to(device)
        output_ids = model.generate(pixel_values, max_length=eval_max_length, num_beams=num_beams)
        predictions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        for reference, predicted in zip(batch_captions, predictions):
            # Every reference caption ends with its decade.
            true_decade = reference.split()[-1]
            total += 1
            correct += int(true_decade in predicted)

accuracy = 100 * correct / total if total else 0.0
print(f'Accuracy: {accuracy:.2f}%')

Evaluating metrics...


NameError: ignored

In [None]:
# Calculate the F1 Score

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

decade_names = ["1930", "1940", "1950", "1960", "1970"]
# Extra class for generated captions that mention none of the decades.
class_names = decade_names + ["unknown"]


def decade_in_caption(caption):
    """Return the first entry of `decade_names` occurring in caption, else "unknown"."""
    for decade in decade_names:
        if decade in caption:
            return decade
    return "unknown"


# Collect one true and one predicted decade per image served by the loader.
# NOTE(review): `data_loader` yields shuffled, augmented training images.
# Generation budget mirrors the reasoning "base caption + ' during <decade>'"
# - TODO tune.
f1_max_length = 32
true_decades = []
predicted_decades = []
model.eval()
with torch.no_grad():
    for images, batch_captions in data_loader:
        # Loader images are already scaled to [0, 1] by ToTensor().
        pixel_values = feature_extractor(
            images=images, return_tensors="pt", do_rescale=False
        ).pixel_values.to(device)
        output_ids = model.generate(pixel_values, max_length=f1_max_length, num_beams=num_beams)
        decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        true_decades.extend(decade_in_caption(text) for text in batch_captions)
        predicted_decades.extend(decade_in_caption(text) for text in decoded)

# Compute the confusion matrix (rows: true decade, columns: predicted decade).
# Stored under its own name so the imported function is not overwritten.
decade_confusion = confusion_matrix(true_decades, predicted_decades, labels=class_names)

# Calculate F1 score for each class; classes without samples score 0
true_positives = np.diag(decade_confusion).astype(float)
predicted_totals = decade_confusion.sum(axis=0)  # column sums
actual_totals = decade_confusion.sum(axis=1)     # row sums
precision = np.divide(true_positives, predicted_totals,
                      out=np.zeros_like(true_positives), where=predicted_totals > 0)
recall = np.divide(true_positives, actual_totals,
                   out=np.zeros_like(true_positives), where=actual_totals > 0)
precision_plus_recall = precision + recall
f1_score = np.divide(2 * precision * recall, precision_plus_recall,
                     out=np.zeros_like(true_positives), where=precision_plus_recall > 0)

# Print F1 scores for each class
for i, date_range in enumerate(["1930s", "1940s", "1950s", "1960s", "1970s"]):
    print(f"F1 Score ({date_range}): {f1_score[i]:.2f}")

# Calculate weighted average F1 score (weights: share of true samples per class)
total_samples = decade_confusion.sum()
if total_samples:
    weights = actual_totals / total_samples
else:
    weights = np.zeros_like(true_positives)
weighted_average_f1_score = np.sum(f1_score * weights)

print(f"Weighted Average F1 Score: {weighted_average_f1_score:.2f}")
