In [1]:
import torch
import os
import csv
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import PIL
from PIL import Image
from transformers import CLIPTokenizer


In [2]:
from ldm.dpm.ddpm import DDPM
from ldm.vae.encoder import Encoder
from ldm.vae.decoder import Decoder
from  ldm.clip.cliper import CLIP

In [17]:
class ImageDataset(Dataset):
    def __init__(self, root_dir, csv_file):
        self.root_dir = root_dir
        self.csv_file = csv_file
        gen=torch.Generator()
        self.diff=DDPM(gen)
        self.tokenizer = CLIPTokenizer("utils/vocab.json", merges_file="utils/merges.txt")

        self.txt_encoder=CLIP()
        
        self.time_steps=int(self.diff.num_train_timesteps)
        self.encoder=Encoder()
        self.transform= transforms.Compose([
            transforms.Lambda(lambda t: (t/255)),
            transforms.Lambda(lambda t: (t * 2) - 1),
            transforms.Lambda(lambda t: t.permute([2,0,1])), # Scale data between [-1, 1] 
])



        with open(self.csv_file, "r") as f:
            reader = csv.reader(f)
            self.image_paths = []
            self.txt=[]
            
            for row in reader:
                image,txt = row[1], row[3]
                self.image_paths.append(os.path.join(self.root_dir,image))
                self.txt.append(txt)
                
                
              

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        image = self.image_paths[index]

        
        t = torch.randint(0, self.time_steps,(1,))

        image = torch.from_numpy(np.array((Image.open(image))))
        image=(self.transform(image)).unsqueeze(0)
        print(image.shape)
        

        with torch.no_grad():
        
            noisy_image=self.diff.add_noise(image,t)
            noisy_image=self.encoder(noisy_image)
            text_guiance=self.txt_encoder(self.tokenizer.batch_encode_plus([self.txt[index]],padding="max_length", max_length=77).input_ids)
            

        
   

        return image,self.encoder(noisy_image),t,text_guiance




In [18]:
bloc=ImageDataset("/home/essey/datastore/ViT-512","/home/essey/datastore/csv-files/joined-data-512-2.csv")

In [19]:
bloc[1]

torch.Size([1, 3, 512, 512])


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 4 is not equal to len(dims) = 3

In [4]:
tokenizer = CLIPTokenizer("utils/vocab.json", merges_file="utils/merges.txt")

In [9]:
with torch.no_grad():
    bloc1=Encoder()
    #bloc1=torch.load('data/encoder_params.pt')
    v=torch.randn(1,3,512,512)
    v=bloc1(v)
    

In [10]:
v.shape

torch.Size([1, 4, 64, 64])

In [11]:
prompt="the quick brown fox jumps over the lazy hound"

In [12]:
token_prompt=tokenizer.batch_encode_plus(
                [prompt], padding="max_length", max_length=77
            ).input_ids

In [13]:
token_prompt

[[49406,
  518,
  3712,
  2866,
  3240,
  18911,
  962,
  518,
  10753,
  13561,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407,
  49407]]