In [None]:
!pip install omegaconf, einops, pytorch_lightning

In [1]:
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config
import torch
from safetensors import safe_open
from einops import rearrange

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def load_model_from_config(config, sd):
    """Instantiate the model described by ``config`` and load weights from ``sd``.

    Args:
        config: Model config passed to ``instantiate_from_config``.
        sd: State dict (parameter name -> tensor) given to ``load_state_dict``.

    Returns:
        The instantiated model, after ``.cuda()`` and ``.eval()`` were called on it.
    """
    model = instantiate_from_config(config)
    # strict=False tolerates key mismatches, so surface them instead of
    # discarding the report: a checkpoint whose names do not match would
    # otherwise leave the model at its initial weights without any sign.
    report = model.load_state_dict(sd, strict=False)
    missing = list(getattr(report, "missing_keys", ()))
    unexpected = list(getattr(report, "unexpected_keys", ()))
    if missing:
        print(f"load_state_dict: {len(missing)} missing keys, e.g. {missing[:5]}")
    if unexpected:
        print(f"load_state_dict: {len(unexpected)} unexpected keys, e.g. {unexpected[:5]}")
    model.cuda()
    model.eval()
    return model

def load_safetensors(config_file='configs/autoencoder/ae_only.yaml', ckpt='../stable-diffusion-models/vae/diffusion_pytorch_model.safetensors'):
    """Build the model from ``config_file`` with weights read from a safetensors file.

    Every tensor stored in ``ckpt`` is read onto device 0; the resulting
    name -> tensor mapping and the ``model`` section of the config are then
    handed to ``load_model_from_config``.
    """
    cfg = OmegaConf.load(config_file)
    with safe_open(ckpt, framework="pt", device=0) as handle:
        weights = {name: handle.get_tensor(name) for name in handle.keys()}
    return load_model_from_config(cfg.model, weights)

def load_model(config_file='configs/autoencoder/ae_only.yaml', ckpt='../stable-diffusion-models/vae/diffusion_pytorch_model.safetensors'):
    """Build the model from ``config_file`` and load weights from ``ckpt``.

    Args:
        config_file: Path of the OmegaConf file; its ``model`` section is
            instantiated.
        ckpt: Checkpoint path. A ``.safetensors`` path (the default) is
            delegated to ``load_safetensors`` because ``torch.load`` cannot
            read that format. Any other path is read with ``torch.load``.
            A falsy value skips weight loading.

    Returns:
        The model returned by ``load_model_from_config``.
    """
    if ckpt and str(ckpt).endswith(".safetensors"):
        return load_safetensors(config_file=config_file, ckpt=ckpt)

    config = OmegaConf.load(config_file)
    if ckpt:
        print(f"Loading model from {ckpt}")
        # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
        checkpoint = torch.load(ckpt, map_location="cpu")
        # Checkpoints may wrap the weights under a "state_dict" key (the layout
        # the no-checkpoint fallback used to mimic); unwrap it so the actual
        # parameter names reach load_state_dict. A bare state dict is used as-is.
        if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
            sd = checkpoint["state_dict"]
        else:
            sd = checkpoint
    else:
        # No checkpoint: load nothing and keep the freshly initialised weights.
        sd = {}
    return load_model_from_config(config.model, sd)

def get_input(batch):
    """Turn a channels-last batch into a channels-first tensor on the GPU.

    A 3-D input first gets a trailing singleton axis, so ``(b, h, w)`` is
    treated as ``(b, h, w, 1)``. The axes are then reordered from
    ``b h w c`` to ``b c h w`` and the result is moved to CUDA.
    """
    images = batch[..., None] if len(batch.shape) == 3 else batch
    channels_first = rearrange(images, 'b h w c -> b c h w')
    return channels_first.cuda()

# Both loaders default to the config file 'configs/autoencoder/ae_only.yaml'.



In [7]:
# Build the model from the default config file, reading the weights from the
# safetensors checkpoint two directories up.
model = load_safetensors(ckpt='../../stable-diffusion-models/vae/diffusion_pytorch_model.safetensors')

# Usage: model.encode(get_input(x), return_hidden=True) returns a pair;
# index [1] selects its second element.



making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
making attention of type 'vanilla' with 512 in_channels


In [5]:
import numpy as np
import torch

In [1]:
from ldm.util import load_safetensors, get_input

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the model with load_safetensors' default config and checkpoint paths.
model = load_safetensors()



making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
making attention of type 'vanilla' with 512 in_channels


In [17]:
# All-zero float32 dummy batch of shape (1, 64, 64, 3), created directly on the GPU.
inp = torch.zeros((1, 64, 64, 3), dtype=torch.float32, device="cuda")

In [18]:
# Bind the rearranged tensor to a new name instead of overwriting `inp`:
# rebinding inp = get_input(inp) made this cell non-idempotent, because a
# second run would permute the already channels-first tensor again.
inp_bchw = get_input(inp)
latent, outputs = model.encode(inp_bchw, return_hidden=True)

In [21]:
# Display the second hidden output as a 1-D tensor.
outputs[1].flatten()

tensor([-0.2858, -0.4730, -0.5461,  ..., -0.1655, -0.3258,  0.0375],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)

In [None]:
def get_image_features(inp, layer=-1):
    """Encode ``inp`` with the global ``model`` and return features as a NumPy array.

    Args:
        inp: Batch passed through ``get_input`` before encoding. Per the
            original note it needs 3 channels.
        layer: Which encoder result to return.
            ``-1`` (default): a sample from the distribution object returned
            by ``model.encode``.
            ``>= len(outputs)``: all hidden outputs, each flattened, then
            concatenated into one vector.
            Any other value: ``outputs[layer]``, flattened.

    Returns:
        numpy.ndarray holding the selected features on the host.
    """
    inp = get_input(inp)
    # Feature extraction needs no autograd graph; skipping it saves memory.
    with torch.no_grad():
        gd, outputs = model.encode(inp, return_hidden=True)

    # One if/elif/else chain: with two separate `if` statements the
    # "all layers" result was always overwritten by outputs[layer], which
    # raised IndexError for exactly those out-of-range indices.
    if layer == -1:
        # NOTE(review): sample() presumably draws random noise, so repeated
        # calls may differ; the original left `gd.mean()` as a commented
        # alternative -- confirm which one is intended.
        result = gd.sample()
    elif layer >= len(outputs):
        result = torch.cat([torch.flatten(out) for out in outputs])
    else:
        result = torch.flatten(outputs[layer])
    # detach() is required: NumPy conversion fails on tensors that require grad.
    return np.asarray(result.detach().cpu())

def get_features(inp, **args):
    """Alias for ``get_image_features``: forwards ``inp`` and all keyword arguments unchanged."""
    features = get_image_features(inp, **args)
    return features