In [95]:
import numpy as np
from PIL import Image
import os
import string
import tensorflow as tf
import keras
import re
from pickle import dump, load
# from tf.keras.applications.xception import Xception, preprocess_input
# from tensorflow.keras.preprocessing.image import load_img, img_to_array
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tf.keras.utils import to_categorical
# from tf.keras.utils import to_categorical
# from tf.keras.layers import add, Input, Dense, LSTM, Embedding, Dropout
# from tf.keras.models import Model, load_model
from tqdm.notebook import tqdm
# Register tqdm's pandas integration through the class itself. Calling it on a
# freshly built tqdm() instance also creates an empty progress bar that is never
# closed, which is what printed the stray "0it [00:00, ?it/s]" line.
tqdm.pandas()

0it [00:00, ?it/s]

In [96]:
# Load, process/clean, and save image text dataset (id + captions)

def load_file(filepath: str) -> str:
    """
    Read a whole text file into memory.

    @param filepath path of the file to be loaded
    @return the contents of the file
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filepath, "r") as file:
        return file.read()

def get_descriptions(dataset_text_filepath: str) -> dict[str, list[str]]:
    """
    Parse the caption file into a mapping from image id to its captions.

    @param dataset_text_filepath path of the file containing image ids and captions
        >>> file contents
        1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .\n
        1000268201_693b08cb0e.jpg#1\tA girl going into a wooden building .\n
        ...
    @return a dictionary (key: image id -> value: array of image captions)
    @raise ValueError if a non-blank line has no tab separating the id from the caption
    """
    text = load_file(dataset_text_filepath)

    descriptions = {}
    for entry in text.split("\n"):
        # Skip blank (or whitespace-only) lines such as the one after the final newline
        if not entry.strip():
            continue

        # Split on the FIRST tab only, so a caption that itself contains a tab stays intact
        img_ref, separator, caption = entry.partition("\t")
        if not separator:
            raise ValueError(f"Malformed caption entry (no tab separator): {entry!r}")

        # Strip the caption index off the id (ie 1000268201_693b08cb0e.jpg#0 -> 1000268201_693b08cb0e.jpg).
        # Cutting at the last "#" also handles multi-digit indexes (#10), which a fixed
        # two-character slice would turn into "....jpg#".
        img_id = img_ref.rsplit("#", 1)[0]

        descriptions.setdefault(img_id, []).append(caption)

    return descriptions

def clean_descriptions(descriptions: dict[str, list[str]]) -> None:
    """
    Clean the entries in the descriptions dictionary in-place.
    Converts all letters to lowercase, removes punctuation, removes hanging "s" and "a"s,
    removes words containing numbers, and removes duplicate whitespace

    @param descriptions a dictionary (key: image id -> value: array of image captions)
    """
    # Built once up front; the table deletes every character in string.punctuation
    punctuation_table = str.maketrans("", "", string.punctuation)
    hanging_words = {"s", "a"}

    for captions in descriptions.values():
        for i, caption in enumerate(captions):
            # Convert to lowercase and remove punctuation
            caption = caption.lower().translate(punctuation_table)

            # Remove words containing numbers
            caption = re.sub(r"\w*\d\w*", "", caption)

            # Remove hanging "s" and "a"s word-by-word. Matching on the substrings
            # " s " / " a " missed them at the start or end of a caption and when
            # two occurred back to back.
            words = [word for word in caption.split() if word not in hanging_words]

            # Re-joining with single spaces also removes duplicate whitespace
            captions[i] = " ".join(words)

def get_vocab(descriptions: dict[str, list[str]]) -> set[str]:
    """
    Collect every distinct whitespace-separated word that appears in any caption.

    @param descriptions a dictionary (key: image id -> value: array of image captions)
    @return set of all words used in captions
    """
    vocab = set()
    # Only the caption lists matter here; the image ids are not part of the vocabulary
    for captions in descriptions.values():
        for caption in captions:
            vocab.update(caption.split())
    return vocab

def save_descriptions(filename: str, descriptions: dict[str, list[str]]) -> None:
    """
    Write the descriptions back to a file, one "<image id>\\t<caption>" pair per line.
    Lines are separated by "\\n" and no newline is written after the last one.

    @param filename the name of the file to write the descriptions to
    @param descriptions a dictionary (key: image id -> value: array of image captions)
    """
    lines = [
        img_id + "\t" + caption
        for img_id, captions in descriptions.items()
        for caption in captions
    ]

    # The context manager closes (and flushes) the file even if write() raises
    with open(filename, "w") as file:
        file.write("\n".join(lines))

In [None]:
# Process image dataset

def extract_features(filepath):
    """
    Placeholder for the image-processing step of the notebook.

    NOTE(review): the original cell held only the signature, without a colon or a body,
    so it could not be executed. The intended behaviour is not shown anywhere in the
    notebook yet.

    @param filepath presumably the location of the image data to process - TODO confirm
    @raise NotImplementedError always, until the function is implemented
    """
    raise NotImplementedError("extract_features is not implemented yet")

In [97]:
# Directory and file names of the dataset, relative to the notebook's working directory.
# dataset_image_dirname is not used in this cell.
dataset_text_dirname = "Flicker8k_text"
dataset_image_dirname = "Flicker8k_dataset"
text_filename = "Flickr8k.token.txt"
text_path = f"{dataset_text_dirname}/{text_filename}"

# Parse the caption file, then clean the captions in place
descriptions = get_descriptions(text_path)
clean_descriptions(descriptions)

# Set of distinct words across the cleaned captions
vocab = get_vocab(descriptions)

# Write the cleaned captions to disk
save_descriptions("descriptions.txt", descriptions)