In [1]:
import pandas as pd
import numpy as np

from PIL import Image
import glob
import h5py
import string

import torch
from torchvision import transforms

import nltk
from nltk.tokenize import RegexpTokenizer

import json

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# torch.cuda.get_device_name(0)

In [2]:
# nltk.download('popular')

In [3]:
# Read the Flickr8k caption/split metadata.
with open("./caption_datasets/dataset_flickr8k.json", "r") as split_json:
    cap_data = json.load(split_json)

# One set of image filenames per split label.
train, test, val = set(), set(), set()
split_buckets = {'train': train, 'test': test, 'val': val}

# Lazily pair each image's filename with its split label.
file_split = ((entry['filename'], entry['split']) for entry in cap_data['images'])

for filename, split_label in file_split:
    # Entries whose label matches none of the three buckets are ignored.
    for label, bucket in split_buckets.items():
        if split_label == label:
            bucket.add(filename)

print(len(train), len(test), len(val))

6000 1000 1000


In [4]:
def createHDF5(paths, split):
    """Preprocess the images at ``paths`` and store them in one HDF5 file.

    Each image is converted to RGB, turned into a tensor, resized to
    256x256 and converted to ``torch.float``. The results are stacked and
    written to ``./dataset/image_dataset_<split>.hdf5`` under the dataset
    name ``"default"``, in the same order as ``paths``.

    Args:
        paths: Iterable of image file paths that make up one split.
        split: Label inserted into the output file name (e.g. ``"train"``).

    Raises:
        ValueError: If ``paths`` contains no entries.
    """
    import os  # function-scope import; only needed to create the output dir

    # Materialise once so generators/sets can be both checked and iterated.
    paths = list(paths)
    if not paths:
        raise ValueError("createHDF5 requires at least one image path")

    # Build the preprocessing pipeline once instead of once per image.
    tensor_img = transforms.Compose([
        transforms.PILToTensor(),
        transforms.Resize([256, 256]),
        transforms.ConvertImageDtype(torch.float),
    ])

    # Accumulate in a list local to this call. The previous version appended
    # to a notebook-level list and wrote a notebook-level tensor, so the file
    # never reflected `paths`.
    split_tensors = []
    for path in paths:
        # The context manager closes the file once the pixels have been read.
        with Image.open(path) as img:
            # Force three channels so torch.stack sees uniform shapes.
            split_tensors.append(tensor_img(img.convert("RGB")))

    split_images = torch.stack(split_tensors)

    # h5py does not create missing parent directories.
    os.makedirs("./dataset", exist_ok=True)
    with h5py.File("./dataset/image_dataset_" + split + ".hdf5", "w") as f:
        f.create_dataset("default", data=split_images.numpy())

In [5]:
imageDir = "./Images/"

# glob returns matches in arbitrary order; sorting makes the row order of the
# stacked tensor (and of the HDF5 file written from it) reproducible.
img_paths = sorted(glob.glob(imageDir + "*"))

# Build the preprocessing pipeline once rather than once per image:
# PIL image -> tensor -> 256x256 -> float.
tensor_img = transforms.Compose([
    transforms.PILToTensor(),
    transforms.Resize([256, 256]),
    transforms.ConvertImageDtype(torch.float),
])

image_tensor_list = []

for path in img_paths:
    # The context manager closes each file as soon as its pixels are read,
    # instead of leaving every image file open.
    with Image.open(path) as img:
        image_tensor_list.append(tensor_img(img))

# Visual sanity check, if needed:
#   transforms.ToPILImage()(image_tensor_list[0]).show()

# One tensor for the whole image set; first axis follows img_paths order.
images = torch.stack(image_tensor_list)

images.shape

torch.Size([8091, 3, 256, 256])

In [6]:
# Persist the stacked image tensor as a single HDF5 dataset named "default".
with h5py.File("./dataset/imageDataset.hdf5", mode="w") as hdf_out:
    dset = hdf_out.create_dataset(name="default", data=images)

In [7]:
captions_path = "./captions.txt"

# Load the caption table; only its 'caption' column is used below.
img_caption_list = pd.read_csv(captions_path)

# \w+ keeps runs of word characters, so punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')

# Lower-case every caption, then split it into a list of word tokens.
captions = img_caption_list['caption'].map(
    lambda text: tokenizer.tokenize(text.lower())
)

# One JSON array entry (a token list) per caption row.
captions.to_json("./dataset/captions.json", orient='records')

In [8]:
# Number of tokens in each caption (before any <sos>/<eos>/<pad> is added).
caption_length = captions.map(len)
caption_length.to_json("./dataset/caption_lengths.json", orient='records')

In [9]:
# Tally how often each token occurs across all captions.
wordCount = {}

for caption_tokens in captions:
    for token in caption_tokens:
        if token in wordCount:
            wordCount[token] += 1
        else:
            wordCount[token] = 1

# Most frequent first. The sort is stable, so equally frequent words keep
# their first-seen order.
sortedWords = sorted(wordCount.items(), key=lambda pair: pair[1], reverse=True)

# The k most frequent words receive ids 0..k-1 (fewer if the corpus has
# fewer than k distinct words).
k = 5000
word_to_index = {
    word: rank for rank, (word, _count) in enumerate(sortedWords[:k])
}

# Special tokens take ids k+1..k+4; id k itself is left unassigned.
tokens = dict(zip(("<sos>", "<eos>", "<pad>", "<unk>"), range(k + 1, k + 5)))

word_to_index = {**word_to_index, **tokens}

# Reverse lookup: id -> word.
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))

with open("./dataset/word_to_index_map.json", "w") as w2i_file:
    json.dump(word_to_index, w2i_file)

# JSON object keys are always strings, so the integer ids are written as
# strings in this file.
with open("./dataset/index_to_word.json", "w") as i2w_file:
    json.dump(index_to_word, i2w_file)



In [10]:
# Longest tokenized caption plus two extra slots (padEachCaption adds
# "<sos>" and "<eos>" around every caption).
baseCaptionLen = 2 + max(len(token_list) for token_list in captions)
baseCaptionLen

39

In [11]:
def padEachCaption(caption, maxlen):
    """Wrap a token list in "<sos>"/"<eos>" and right-pad it with "<pad>".

    Args:
        caption: List of word tokens.
        maxlen: Target length of the result, counting the two markers.

    Returns:
        A new list: "<sos>", the caption tokens, "<eos>", then enough
        "<pad>" entries to reach ``maxlen``. When the caption plus markers
        already meets or exceeds ``maxlen``, no padding is added and the
        result is not truncated.
    """
    pad_count = maxlen - 2 - len(caption)

    framed = ["<sos>"] + caption
    framed.append("<eos>")
    # A non-positive pad_count multiplies out to an empty list.
    framed.extend(["<pad>"] * pad_count)
    return framed

# Bring every caption to the common length baseCaptionLen.
padded_captions = [
    padEachCaption(token_list, baseCaptionLen) for token_list in captions
]


# Store the padded token lists as a JSON array of arrays.
with open("./dataset/tokenized_captions.json", "w") as out_file:
    json.dump(padded_captions, out_file)