### 🚀 For an interactive experience, head over to our [demo platform](https://var.vision/demo) and dive right in! 🌟

In [1]:
################## 1. Download checkpoints and build models
import os
import os.path as osp
import torch, torchvision
import random
import numpy as np
import PIL.Image as PImage, PIL.ImageDraw as PImageDraw
# Replace `reset_parameters` with a no-op on the Linear and LayerNorm classes themselves.
# The patch is class-wide and stays in force for the rest of the kernel session, so it
# applies to every nn.Linear / nn.LayerNorm constructed afterwards, not only to the
# models built in this cell.
# NOTE(review): presumably safe here because the weights are overwritten by the
# checkpoints loaded further down in this cell — confirm before reusing the patch elsewhere.
setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)     # disable default parameter init for faster speed
setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)  # disable default parameter init for faster speed
from models import VQVAE, build_vae_var

MODEL_DEPTH = 16    # TODO: =====> please specify MODEL_DEPTH <=====
assert MODEL_DEPTH in {16, 20, 24, 30}, f'unsupported MODEL_DEPTH={MODEL_DEPTH}; choose one of 16, 20, 24, 30'


# download checkpoint
# Both files are fetched from the Hugging Face repo below unless a file with the same
# name already exists in the working directory.
hf_home = 'https://huggingface.co/FoundationVision/var/resolve/main'
vae_ckpt, var_ckpt = 'vae_ch160v4096z32.pth', f'var_d{MODEL_DEPTH}.pth'
for ckpt_name in (vae_ckpt, var_ckpt):
    if not osp.exists(ckpt_name):
        # torch.hub.download_url_to_file raises if the download fails, instead of
        # silently continuing the way an unchecked `os.system('wget ...')` does, and it
        # does not depend on an external `wget` binary being installed.
        torch.hub.download_url_to_file(f'{hf_home}/{ckpt_name}', ckpt_name)

# build vae, var
patch_nums = (1, 2, 3, 4, 5, 6, 8, 10, 13, 16)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Construct the pair only when it is not already in the kernel namespace, so that
# re-running this cell reuses the existing objects.
if not ('vae' in globals() and 'var' in globals()):
    vae, var = build_vae_var(
        V=4096, Cvae=32, ch=160, share_quant_resi=4,    # hard-coded VQVAE hyperparameters
        device=device, patch_nums=patch_nums,
        num_classes=1000, depth=MODEL_DEPTH, shared_aln=False,
    )

# load checkpoints, switch to eval mode and freeze every parameter
# NOTE(review): torch.load unpickles the file; only use checkpoints from a trusted source.
for net, ckpt_path in ((vae, vae_ckpt), (var, var_ckpt)):
    net.load_state_dict(torch.load(ckpt_path, map_location='cpu'), strict=True)
    net.eval()
    net.requires_grad_(False)   # same as calling requires_grad_(False) on each parameter
print('prepare finished.')


[constructor]  ==== flash_if_available=True (0/16), fused_if_available=True (fusing_add_ln=0/16, fusing_mlp=0/16) ==== 
    [VAR config ] embed_dim=1024, num_heads=16, depth=16, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0666667 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0178, 0.0222, 0.0267, 0.0311, 0.0356,
        0.0400, 0.0444, 0.0489, 0.0533, 0.0578, 0.0622, 0.0667]))



  from .autonotebook import tqdm as notebook_tqdm


[init_weights] VAR with init_std=0.0180422
prepare finished.


In [2]:
############################# 2. Sample with classifier-free guidance

# set args
seed = 0 #@param {type:"number"}
# NOTE(review): num_sampling_steps is not read by any code in this cell.
num_sampling_steps = 250 #@param {type:"slider", min:0, max:1000, step:1}
cfg = 4 #@param {type:"slider", min:1, max:10, step:0.1}
class_labels = (980, 980, 437, 437, 22, 22, 562, 562)  #@param {type:"raw"}
more_smooth = False # True for more smooth output

# seed every RNG once so that sampling is repeatable
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# run faster
tf32 = True
torch.backends.cudnn.allow_tf32 = bool(tf32)
torch.backends.cuda.matmul.allow_tf32 = bool(tf32)
torch.set_float32_matmul_precision('high' if tf32 else 'highest')

# sample
B = len(class_labels)
label_B: torch.LongTensor = torch.tensor(class_labels, device=device)
# Only turn CUDA autocast on when the models actually live on a CUDA device.
use_cuda_autocast = str(device).startswith('cuda')
with torch.inference_mode():
    with torch.autocast('cuda', enabled=use_cuda_autocast, dtype=torch.float16, cache_enabled=True):    # using bfloat16 can be faster
        recon_B3HW = var.autoregressive_infer_cfg(B=B, label_B=label_B, cfg=cfg, top_k=900, top_p=0.95, g_seed=seed, more_smooth=more_smooth)

# Tile the batch into one image (up to 8 per row, no padding between tiles).
grid_chw = torchvision.utils.make_grid(recon_B3HW, nrow=8, padding=0, pad_value=1.0)
# Scale out of place: make_grid can hand back a view of recon_B3HW (e.g. for a single
# image), and in-place updates of inference-mode tensors are rejected outside
# torch.inference_mode().
grid_hwc = grid_chw.permute(1, 2, 0).mul(255).cpu().numpy()
# `chw` ends up holding the final PIL image, as before.
chw = PImage.fromarray(grid_hwc.astype(np.uint8))
chw.show()




In [3]:
# Re-load the VAR weights from disk into the existing `var` model.
# The file name follows MODEL_DEPTH so it always matches the depth `var` was built with;
# a hard-coded 'var_d16.pth' would fail the strict load for any other depth.
var_checkpoint_path = f'var_d{MODEL_DEPTH}.pth'

# The checkpoint file is a plain state dict, so it is loaded directly.
# NOTE(review): torch.load unpickles the file; only use checkpoints from a trusted source.
state_dict = torch.load(var_checkpoint_path, map_location='cpu')
var.load_state_dict(state_dict, strict=True)
print("Successfully loaded weights into the VAR model.")

# --- Prepare for inference ---
# Set the model to evaluation mode to disable dropout, etc.
var.eval()

# The model's parameters should not require gradients for inference
for p in var.parameters():
    p.requires_grad_(False)

print("Model is ready for inference.")

Successfully loaded weights into the VAR model.
Model is ready for inference.


In [24]:
# Dump the full module tree of the VAR model (the output is long).
print(var)

VAR(
  drop_path_rate=0.0666667
  (word_embed): Linear(in_features=32, out_features=1024, bias=True)
  (class_emb): Embedding(1001, 1024)
  (lvl_embed): Embedding(10, 1024)
  (shared_ada_lin): Identity()
  (blocks): ModuleList(
    (0): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): Identity()
      (attn): SelfAttention(
        using_flash=False, using_xform=False, attn_l2_norm=True
        (mat_qkv): Linear(in_features=1024, out_features=3072, bias=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1024,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_featur

In [25]:
# Dump the full module tree of the VQVAE model (the output is long).
print(vae)

VQVAE(
  (encoder): Encoder(
    (conv_in): Conv2d(3, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (down): ModuleList(
      (0-1): 2 x Module(
        (block): ModuleList(
          (0-1): 2 x ResnetBlock(
            (norm1): GroupNorm(32, 160, eps=1e-06, affine=True)
            (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (norm2): GroupNorm(32, 160, eps=1e-06, affine=True)
            (dropout): Identity()
            (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (nin_shortcut): Identity()
          )
        )
        (attn): ModuleList()
        (downsample): Downsample2x(
          (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2))
        )
      )
      (2): Module(
        (block): ModuleList(
          (0): ResnetBlock(
            (norm1): GroupNorm(32, 160, eps=1e-06, affine=True)
            (conv1): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1,

In [5]:
# Print the class of every module inside `var`: the model itself first, then its descendants.
for submodule in var.modules():
    print(f"Layer Type: {type(submodule)}")


Layer Type: <class 'models.var.VAR'>
Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Type: <class 'torch.nn.modules.sparse.Embedding'>
Layer Type: <class 'torch.nn.modules.sparse.Embedding'>
Layer Type: <class 'torch.nn.modules.linear.Identity'>
Layer Type: <class 'torch.nn.modules.container.ModuleList'>
Layer Type: <class 'models.basic_var.AdaLNSelfAttn'>
Layer Type: <class 'torch.nn.modules.linear.Identity'>
Layer Type: <class 'models.basic_var.SelfAttention'>
Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Type: <class 'torch.nn.modules.linear.Identity'>
Layer Type: <class 'models.basic_var.FFN'>
Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Type: <class 'torch.nn.modules.activation.GELU'>
Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Type: <class 'torch.nn.modules.linear.Identity'>
Layer Type: <class 'torch.nn.modules.normalization.LayerNorm'>
Layer Type: <class 'torch.nn.modules.co

In [6]:
# Print the dotted name and the shape of each parameter tensor registered on `var`.
for name, param in var.named_parameters():
    print(f"Parameter Name: {name}, Parameter Shape: {param.shape}")

Parameter Name: pos_start, Parameter Shape: torch.Size([1, 1, 1024])
Parameter Name: pos_1LC, Parameter Shape: torch.Size([1, 680, 1024])
Parameter Name: word_embed.weight, Parameter Shape: torch.Size([1024, 32])
Parameter Name: word_embed.bias, Parameter Shape: torch.Size([1024])
Parameter Name: class_emb.weight, Parameter Shape: torch.Size([1001, 1024])
Parameter Name: lvl_embed.weight, Parameter Shape: torch.Size([10, 1024])
Parameter Name: blocks.0.attn.scale_mul_1H11, Parameter Shape: torch.Size([1, 16, 1, 1])
Parameter Name: blocks.0.attn.q_bias, Parameter Shape: torch.Size([1024])
Parameter Name: blocks.0.attn.v_bias, Parameter Shape: torch.Size([1024])
Parameter Name: blocks.0.attn.mat_qkv.weight, Parameter Shape: torch.Size([3072, 1024])
Parameter Name: blocks.0.attn.proj.weight, Parameter Shape: torch.Size([1024, 1024])
Parameter Name: blocks.0.attn.proj.bias, Parameter Shape: torch.Size([1024])
Parameter Name: blocks.0.ffn.fc1.weight, Parameter Shape: torch.Size([4096, 1024]

In [10]:

import torch
import torch.nn as nn

# Define a custom initialization function
def custom_init(layer):
    """Re-initialise a 2-D convolution in place; ignore every other layer type.

    Written to be passed to ``nn.Module.apply``. For an ``nn.Conv2d`` the weight is
    filled with Xavier-uniform values and the bias, when the layer has one, is set
    to zero. Any other module is left untouched.

    Args:
        layer: the module to (possibly) re-initialise.

    Returns:
        None. The layer is modified in place.
    """
    if not isinstance(layer, nn.Conv2d):
        return
    nn.init.xavier_uniform_(layer.weight)
    conv_bias = layer.bias
    if conv_bias is not None:
        nn.init.zeros_(conv_bias)

# Example neural network model with nested structures
class NestedCNN(nn.Module):
    """Toy CNN built from nested ``nn.Sequential`` blocks.

    Two conv -> ReLU -> 2x2 max-pool stages (1 -> 32 -> 64 channels) are followed by a
    two-layer classifier with 10 outputs. The classifier input width is 64*7*7, which
    is what the two pooling stages produce for a 1-channel 28x28 image.
    """

    # Number of features fed to the classifier: 64 channels on a 7x7 map.
    _FLAT_FEATURES = 64 * 7 * 7

    def __init__(self):
        super().__init__()
        # Conv2d positional arguments: in_channels, out_channels, kernel_size, stride, padding.
        # MaxPool2d positional arguments: kernel_size, stride.
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(32, 64, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.fc_block = nn.Sequential(
            nn.Linear(self._FLAT_FEATURES, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        )

    def forward(self, x):
        """Run both conv stages, flatten to rows of 64*7*7 features and classify."""
        features = self.conv_block2(self.conv_block1(x))
        flat = features.view(-1, self._FLAT_FEATURES)
        return self.fc_block(flat)

# Alias the VAR model as `model` and run custom_init over it; Module.apply calls the
# function on every submodule and on the model itself.
# NOTE(review): custom_init only acts on nn.Conv2d. The module listing printed for `var`
# in this notebook shows no Conv2d in its visible part, so this is presumably a no-op —
# confirm. If `var` did contain Conv2d layers, this would overwrite their loaded weights.
model = var
model.apply(custom_init)

# Function to recursively iterate over all layers
def iterate_layers(module):
    """Print the name and class of every descendant of ``module``, depth first.

    Each child is reported before its own children (pre-order), using the child's
    local name rather than a dotted path. ``module`` itself is not printed.

    Args:
        module: the ``nn.Module`` whose sub-tree is listed.

    Returns:
        None. The listing goes to stdout.
    """
    # One iterator per tree level currently being walked; the last one is the deepest.
    levels = [iter(module.named_children())]
    while levels:
        entry = next(levels[-1], None)
        if entry is None:
            # This level is exhausted: resume with the siblings one level up.
            levels.pop()
            continue
        child_name, child = entry
        print(f"Layer Name: {child_name}, Layer Type: {type(child)}")
        levels.append(iter(child.named_children()))  # descend before visiting siblings
# List every nested layer of `model` (name and class, one line per layer).
iterate_layers(model)


Layer Name: word_embed, Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Name: class_emb, Layer Type: <class 'torch.nn.modules.sparse.Embedding'>
Layer Name: lvl_embed, Layer Type: <class 'torch.nn.modules.sparse.Embedding'>
Layer Name: shared_ada_lin, Layer Type: <class 'torch.nn.modules.linear.Identity'>
Layer Name: blocks, Layer Type: <class 'torch.nn.modules.container.ModuleList'>
Layer Name: 0, Layer Type: <class 'models.basic_var.AdaLNSelfAttn'>
Layer Name: drop_path, Layer Type: <class 'torch.nn.modules.linear.Identity'>
Layer Name: attn, Layer Type: <class 'models.basic_var.SelfAttention'>
Layer Name: mat_qkv, Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Name: proj, Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Name: proj_drop, Layer Type: <class 'torch.nn.modules.linear.Identity'>
Layer Name: ffn, Layer Type: <class 'models.basic_var.FFN'>
Layer Name: fc1, Layer Type: <class 'torch.nn.modules.linear.Linear'>
Layer Name: act, Layer Type: <cl

In [22]:
### UNIT MEM MEASUREMENT
# Purpose: average the activations captured under the key 'feat1' (layer 'head' of `var`)
# over random augmentations of a few CIFAR-10 training images, turn them into a per-unit
# selectivity score, and save the scores plus the top-10% unit indices as .mat files.
#
# NOTE(review): the recorded output of this cell is a RuntimeError raised by an embedding
# lookup that received a FloatTensor. Presumably IntermediateLayerGetter pushes the image
# tensor through var's child modules one after another and reaches an nn.Embedding —
# confirm, and drive the model through its real forward/inference entry point instead
# (e.g. with a forward hook on the layer of interest).
datapath = './data'
import torchvision.transforms as transforms
import torchvision
import torch
from torchvision.transforms import ToTensor, Normalize
import scipy.io
from tqdm import tqdm
import numpy as np
s = 1  # strength multiplier applied to every colour-jitter argument below
color_jitter = transforms.ColorJitter(
        0.9 * s, 0.9 * s, 0.9 * s, 0.1 * s)
# NOTE(review): `flip` is created but never used in this cell.
flip = transforms.RandomHorizontalFlip()
# Random augmentation pipeline applied to each image before it is fed to the model.
Aug = transforms.Compose(
    [
    transforms.RandomResizedCrop(size=32),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomApply([color_jitter], p=0.9),
    transforms.RandomGrayscale(p=0.1)
    ])
# Deterministic preprocessing done by the dataset: tensor conversion, then Normalize(0.5, 0.5).
data_transforms = transforms.Compose(
            [
                ToTensor(),
                Normalize(0.5, 0.5)
            ])
CIFAR_10_Dataset = torchvision.datasets.CIFAR10(datapath, train=True, download=True,
                                                 transform=data_transforms)
# Only the first two training images (indices 0 and 1) are used, one per batch, in order.
sublist = list(range(0, 2, 1))
subset = torch.utils.data.Subset(CIFAR_10_Dataset, sublist)
dataloader = torch.utils.data.DataLoader(subset, 1, shuffle=False, num_workers=2)

#model.load_state_dict(torch.load('./var_d16.pth'))
# Wrap `var` so that calling it returns a dict holding the output of layer 'head' as 'feat1'.
# NOTE(review): IntermediateLayerGetter lives in a private torchvision module (`_utils`).
new_m = torchvision.models._utils.IntermediateLayerGetter(var, {'head': 'feat1'})

final1 = []  # one averaged response vector per image
if __name__ == '__main__':
    for img, label in tqdm(iter(dataloader)):
        final = []  # response vectors of this image, one per augmentation draw
        for j in range(10):  # 10 random augmentations per image
            out = new_m(Aug(img))
            for k, v in out.items():
                # assumes the activation has exactly 256*4 elements — TODO confirm;
                # each group of 4 values is averaged, giving a vector of length 256
                my = np.mean(v.reshape(256, 4).cpu().detach().numpy(), axis=1)
                final.append(my)
        out1 = np.mean(np.array(final), axis=0)  # mean over the augmentations
        final1.append(out1)

    finalout = np.array(final1)
    maxout = np.max(finalout, axis=0)  # per-unit maximum over the images
    # Per-unit median over the images after dropping each unit's largest value
    # (ascending sort along the image axis, last row removed).
    medianout = np.median(np.sort(finalout, axis=0)[0:-1], axis=0)
    selectivity = (maxout - medianout)/(maxout + medianout)
    # NOTE(review): output paths are hard-coded to './data' rather than built from `datapath`.
    scipy.io.savemat('./data/selectivity_unit.mat', {'selectivity': selectivity})

    # Find the top 10% of neurons with the highest selectivity
    num_neurons = selectivity.shape[0]
    top_k = int(np.ceil(num_neurons * 0.1))
    top_indices = np.argsort(selectivity)[-top_k:][::-1]  # sorted in descending order
    print(f"Top 10% Neuronen-Indizes: {top_indices}")
    # Optional: save as .mat
    scipy.io.savemat('./data/top10percent_unit_indices.mat', {'top10percent_indices': top_indices})
    

  0%|          | 0/2 [00:00<?, ?it/s]


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [20]:
# Alias the VAR model as `model` and run custom_init over every submodule; as the last
# expression of the cell, the model returned by apply() is echoed as the cell output.
# NOTE(review): depends on `custom_init` having been defined by an earlier-executed cell,
# and repeats a statement pair that already appears in another cell of this notebook.
model = var
model.apply(custom_init)

VAR(
  drop_path_rate=0.0666667
  (word_embed): Linear(in_features=32, out_features=1024, bias=True)
  (class_emb): Embedding(1001, 1024)
  (lvl_embed): Embedding(10, 1024)
  (shared_ada_lin): Identity()
  (blocks): ModuleList(
    (0): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): Identity()
      (attn): SelfAttention(
        using_flash=False, using_xform=False, attn_l2_norm=True
        (mat_qkv): Linear(in_features=1024, out_features=3072, bias=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1024,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_featur

In [13]:
# Print the dotted path of every module in `model`, one per line; the first line is
# empty because the root module's own name is the empty string.
module_paths = [path for path, _ in model.named_modules()]
print("\n".join(module_paths))


word_embed
class_emb
lvl_embed
shared_ada_lin
blocks
blocks.0
blocks.0.drop_path
blocks.0.attn
blocks.0.attn.mat_qkv
blocks.0.attn.proj
blocks.0.attn.proj_drop
blocks.0.ffn
blocks.0.ffn.fc1
blocks.0.ffn.act
blocks.0.ffn.fc2
blocks.0.ffn.drop
blocks.0.ln_wo_grad
blocks.0.ada_lin
blocks.0.ada_lin.0
blocks.0.ada_lin.1
blocks.1
blocks.1.drop_path
blocks.1.attn
blocks.1.attn.mat_qkv
blocks.1.attn.proj
blocks.1.attn.proj_drop
blocks.1.ffn
blocks.1.ffn.fc1
blocks.1.ffn.act
blocks.1.ffn.fc2
blocks.1.ffn.drop
blocks.1.ln_wo_grad
blocks.1.ada_lin
blocks.1.ada_lin.0
blocks.1.ada_lin.1
blocks.2
blocks.2.drop_path
blocks.2.attn
blocks.2.attn.mat_qkv
blocks.2.attn.proj
blocks.2.attn.proj_drop
blocks.2.ffn
blocks.2.ffn.fc1
blocks.2.ffn.act
blocks.2.ffn.fc2
blocks.2.ffn.drop
blocks.2.ln_wo_grad
blocks.2.ada_lin
blocks.2.ada_lin.0
blocks.2.ada_lin.1
blocks.3
blocks.3.drop_path
blocks.3.attn
blocks.3.attn.mat_qkv
blocks.3.attn.proj
blocks.3.attn.proj_drop
blocks.3.ffn
blocks.3.ffn.fc1
blocks.3.ffn.act
