In [1]:
#convert text to numerical values
# need a vocab mapping each word to a index
# Pytorch dataset to load the data
# set up a padding of every batch. 

# set up a dataloader. 

In [1]:
import os
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import torchvision.transforms as transforms

In [2]:
class Vocabulary:
    def __init__(self, freq_threshold):
        self.freq_threshold = freq_threshold
        self.stoi = {'<PAD>':0, '<SOS>':1, '<EOS>':2, '<UNK>':3}
        self.itos = {0:'<PAD>', 1: '<SOS>', 2:'<EOS>', 3:'<UNK>'}
    
    def __len__(self):
        return len(self.itos)
    
    def build_vocabulary(self):
        hindi_alphabets = [chr(alpha) for alpha in range(2304, 2432)]
        hindi_alphabet_size = len(hindi_alphabets)
        for index, alpha in enumerate(hindi_alphabets):
            if alpha not in self.stoi:
                self.stoi[alpha] = index+1
        return self.stoi

    def numericalized(self, word):
        gt_rep = torch.zeros([len(word), 1], dtype=torch.long)
        for letter_index, letter in enumerate(word):
            print(letter)
            pos = self.stoi[letter]
            gt_rep[letter_index][0] = pos
        #gt_rep[letter_index+1][0] = self.stoi[pad_char]
        return gt_rep



In [3]:
vb = Vocabulary(2)
l2i = vb.build_vocabulary()
vb.numericalized('बाहर')

ब
ा
ह
र


tensor([[45],
        [63],
        [58],
        [49]])

In [36]:
class FlickerDataset(Dataset):
    def __init__(self, root_dir, captions_file, transform=None, freq_threshold=5):
        self.root_dir = root_dir
        self.df = pd.read_excel(captions_file)
        print(self.df.head())
        self.caption_file = captions_file
        self.transform = transform

        self.imgs = self.df["filename"]
        self.captions = self.df["word"]

        #Initialize vocab and build vocab

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary()
        

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        caption = self.caption[index]
        img_id = self.imgs[index]
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)
        
        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalized(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        return img, torch.tensor(numericalized_caption)
    

In [37]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
    
    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch]
        imgs = torch.cat(imgs, dim=0)
        targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)

        return imgs, targets
    

In [42]:

def get_loader(
        root_folder,
        annotation_file,
        transform,
        batch_size=32,
        num_workers=8,
        shuffle=True,
        pin_memory=True
):
    dataset = FlickerDataset(root_folder, annotation_file, transform=transform)
    pad_idx = dataset.vocab.stoi["<PAD>"]
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx)
    )
    return loader




In [43]:
def main():
    data_transform = transforms.Compose(
        [
            transforms.Resize((224,224)),
            transforms.ToTensor(),
        ]
    )
    dataloader = get_loader("Data/Images/", annotation_file="Data/path_to_output_excel_file.xlsx", transform=data_transform)
    for idx, (imgs, captions) in enumerate(dataloader):
        print(imgs.shape)
        print(captions.shape)


if __name__ == "__main__":
    main()



          x1         x2         x3          x4          y1          y2  \
0  233.63464  271.41425  267.73364  229.954040    3.323353    6.229999   
1  192.47371  254.37694  254.17207  192.268840  260.995420  261.262630   
2  107.56752  166.65314  165.53746  106.451836    3.107311    5.439358   
3  385.29205  421.92517  399.32580  362.692700  115.051160  149.435040   
4  499.93628  569.57090  564.70090  495.066280   27.582123   36.568360   

           y3          y4  word       filename  
0   54.069360   51.162712   लोग   3644_लोग.jpg  
1  308.725280  308.458070  उनके  1053_उनके.jpg  
2   33.706670   31.374622    यह    1735_यह.jpg  
3  173.512770  139.128890   जेल   3122_जेल.jpg  
4   74.306305   65.320070    भर    2228_भर.jpg  


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'FlickerDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 