### Imports

In [3]:
import os
from pathlib import Path
import itertools
from enum import Enum
import hashlib
import math
import pickle
import json
import asyncio
import aiohttp
import random
import progressbar

from matplotlib import pyplot as plt
import open3d as o3d
from open3d.visualization import draw_plotly
from mpl_toolkits.mplot3d import Axes3D

import einops
import einx
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.amp as amp
import torch.nn.utils as utils
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Sampler, RandomSampler, SubsetRandomSampler, BatchSampler
import torchvision
from torchvision.io import read_image, ImageReadMode
from torchvision.utils import save_image
from torchinfo import summary
from torchcodec.decoders import VideoDecoder
import lightning as L
import lightning.pytorch as pl
import lightning.pytorch.callbacks as callbacks

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [None]:
# Project-local imports. Everything is imported through the `src` package root
# so that each module is loaded under one consistent name.
# NOTE(review): PanopticDataset used to be imported from the bare `datasets`
# package; this assumes it lives in src/datasets/panoptic/ next to the
# downloader that the last cell imports from `src.datasets.panoptic` — confirm.
from src.datasets.panoptic.dataset import PanopticDataset
from src.datasets.raw_plenoptic_dataset import RawPlenopticDataset
from src.datasets.full_dataset import FullDataset

from src.model.pose_encoder import compute_pad, compute_octaves, compute_view_rays

from src.config import load_config

from src.model import PoseEncoder, DVST, latent_aggregators

from src.draw import get_camera_geometry

from src.utils import preprocess_scene_videos


AttributeError: attribute '__isabstractmethod__' of 'property' objects is not writable

In [None]:
# Show the installed PyTorch version string (it includes the CUDA build tag).
torch.__version__

'2.7.0+cu126'

In [None]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device


'cuda'

### DVST Config

In [None]:
# To make it easier to pass around and validate configs,
# the whole YAML file is loaded into a single config object.
config = load_config('res/config.yaml')

# Spot-check a few setup fields: DDP rank, autocast dtype and target device.
config.setup.ddp.rank, config.setup.amp.dtype, config.setup.device

(0, torch.bfloat16, device(type='cuda', index=0))

### Loading datasets

Panoptic dataset

In [None]:
# Build the Panoptic dataset from res/tmp/panoptic/ and show how many items it holds.
dataset_panoptic = PanopticDataset('res/tmp/panoptic/')
len(dataset_panoptic)

81

In [None]:
# Preprocess the first Panoptic item onto `device`, then inspect its first video entry
# (per the recorded output: decoder, K, Kinv, R, t, time and shape).
v = preprocess_scene_videos(dataset_panoptic[0], device)
v.videos[0]

{'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f24ea0a29f0>,
 'K': tensor([[1.4107e+03, 0.0000e+00, 9.6000e+02],
         [0.0000e+00, 1.3299e+03, 5.4000e+02],
         [0.0000e+00, 0.0000e+00, 1.0000e+00]], device='cuda:0'),
 'Kinv': tensor([[ 7.0888e-04,  0.0000e+00, -6.8053e-01],
         [ 0.0000e+00,  7.5194e-04, -4.0605e-01],
         [ 0.0000e+00,  0.0000e+00,  1.0000e+00]], device='cuda:0'),
 'R': tensor([[[-0.6212, -0.0284,  0.7832],
          [ 0.0751,  0.9926,  0.0955],
          [-0.7801,  0.1182, -0.6144]]], device='cuda:0'),
 't': tensor([[-15.3971, 117.3840, 288.2436]], device='cuda:0'),
 'time': tensor([0.0000e+00, 3.3367e-02, 6.6733e-02,  ..., 2.0254e+02, 2.0257e+02,
         2.0260e+02], device='cuda:0'),
 'shape': torch.Size([6073, 3, 1080, 1920])}

In [None]:
# Unpack the first camera's decoder and calibration into separate names.
# The scene object `v` is deliberately left untouched so this cell can be re-run:
# rebinding `v` to the decoder made a second run fail on `v.videos`.
first_video = v.videos[0]
video, K, R, t2 = (first_video[key] for key in ('video', 'K', 'R', 't'))
video, K, R, t2

(<torchcodec.decoders._video_decoder.VideoDecoder at 0x7f24ea0a29f0>,
 tensor([[1.4107e+03, 0.0000e+00, 9.6000e+02],
         [0.0000e+00, 1.3299e+03, 5.4000e+02],
         [0.0000e+00, 0.0000e+00, 1.0000e+00]], device='cuda:0'),
 tensor([[[-0.6212, -0.0284,  0.7832],
          [ 0.0751,  0.9926,  0.0955],
          [-0.7801,  0.1182, -0.6144]]], device='cuda:0'),
 tensor([[-15.3971, 117.3840, 288.2436]], device='cuda:0'))

Plenoptic dataset

In [None]:
# Build the raw Plenoptic dataset from res/tmp/plenoptic/ and show how many items it holds.
dataset_plenoptic = RawPlenopticDataset('res/tmp/plenoptic/')
len(dataset_plenoptic)

6

In [None]:
# Preprocess the first Plenoptic item onto `device` and inspect its first video entry.
# NOTE(review): in the recorded output R and t come back as float64 here, while
# K/Kinv (and the Panoptic R/t) are float32 — confirm the mixed dtypes are intended.
v = preprocess_scene_videos(dataset_plenoptic[0], device)
v.videos[0]

{'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f235d410aa0>,
 'K': tensor([[1.4585e+03, 0.0000e+00, 1.3520e+03],
         [0.0000e+00, 1.4585e+03, 1.0140e+03],
         [0.0000e+00, 0.0000e+00, 1.0000e+00]], device='cuda:0'),
 'Kinv': tensor([[ 6.8564e-04,  0.0000e+00, -9.2698e-01],
         [ 0.0000e+00,  6.8564e-04, -6.9523e-01],
         [ 0.0000e+00,  0.0000e+00,  1.0000e+00]], device='cuda:0'),
 'R': tensor([[[-0.0272,  0.8776,  0.4786],
          [ 0.9996,  0.0286,  0.0042],
          [-0.0100,  0.4786, -0.8780]]], device='cuda:0', dtype=torch.float64),
 't': tensor([[ 5.4591, -1.0853,  0.6145]], device='cuda:0', dtype=torch.float64),
 'time': tensor([0.0000e+00, 3.3333e-02, 6.6667e-02,  ..., 3.9900e+01, 3.9933e+01,
         3.9967e+01], device='cuda:0'),
 'shape': torch.Size([1200, 3, 2028, 2704])}

Full dataset

In [None]:
# Build the full dataset from the dataset list in the config. The recorded length (87)
# equals the Panoptic (81) plus Plenoptic (6) lengths shown in the earlier cells.
dataset_full = FullDataset(config.train.data.datasets)
len(dataset_full)

87

In [None]:
# Preprocess the first item of the full dataset and show one source, one target,
# one query and the frame count. The '' entries only act as visual separators in the output.
v = preprocess_scene_videos(dataset_full[0], device)
v.sources[0], '', v.targets[0], '', v.queries[0], '', v.n_frames

({'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f231476a600>,
  'K': tensor([[1.4107e+03, 0.0000e+00, 9.6000e+02],
          [0.0000e+00, 1.3299e+03, 5.4000e+02],
          [0.0000e+00, 0.0000e+00, 1.0000e+00]], device='cuda:0'),
  'Kinv': tensor([[ 7.0888e-04,  0.0000e+00, -6.8053e-01],
          [ 0.0000e+00,  7.5194e-04, -4.0605e-01],
          [ 0.0000e+00,  0.0000e+00,  1.0000e+00]], device='cuda:0'),
  'R': tensor([[[-0.6212, -0.0284,  0.7832],
           [ 0.0751,  0.9926,  0.0955],
           [-0.7801,  0.1182, -0.6144]]], device='cuda:0'),
  't': tensor([[-15.3971, 117.3840, 288.2436]], device='cuda:0'),
  'time': tensor([0.0000e+00, 3.3367e-02, 6.6733e-02,  ..., 2.0254e+02, 2.0257e+02,
          2.0260e+02], device='cuda:0'),
  'shape': torch.Size([6073, 3, 1080, 1920])},
 '',
 <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f23149f9f10>,
 '',
 {'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f23149f9f10>,
  'K': tensor([[1.4083e+03, 0

In [None]:
# Same inspection for item 84. That index lies past the 81 Panoptic items, so this is
# presumably a Plenoptic scene (the recorded K has the Plenoptic principal point) — confirm.
# The '' entries only act as visual separators in the output.
v = preprocess_scene_videos(dataset_full[84], device)
v.sources[0], '', v.targets[0], '', v.queries[0], '', v.n_frames

({'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f23149fa180>,
  'K': tensor([[1.4623e+03, 0.0000e+00, 1.3520e+03],
          [0.0000e+00, 1.4623e+03, 1.0140e+03],
          [0.0000e+00, 0.0000e+00, 1.0000e+00]], device='cuda:0'),
  'Kinv': tensor([[ 6.8386e-04,  0.0000e+00, -9.2457e-01],
          [ 0.0000e+00,  6.8386e-04, -6.9343e-01],
          [ 0.0000e+00,  0.0000e+00,  1.0000e+00]], device='cuda:0'),
  'R': tensor([[[ 0.0501,  0.9571, -0.2855],
           [ 0.9953, -0.0242,  0.0935],
           [ 0.0826, -0.2888, -0.9538]]], device='cuda:0', dtype=torch.float64),
  't': tensor([[-4.7647,  1.3549,  0.5705]], device='cuda:0', dtype=torch.float64),
  'time': tensor([0.0000, 0.0333, 0.0667, 0.1000, 0.1333, 0.1667, 0.2000, 0.2333, 0.2667,
          0.3000, 0.3333, 0.3667, 0.4000, 0.4333, 0.4667, 0.5000, 0.5333, 0.5667,
          0.6000, 0.6333, 0.6667, 0.7000, 0.7333, 0.7667, 0.8000, 0.8333, 0.8667,
          0.9000, 0.9333, 0.9667, 1.0000, 1.0333, 1.0667, 1.1000, 1.

### Pose encoder

Auxiliary functions

In [None]:
# Sanity check of compute_pad; the recorded result was ([8, 4], (0, 0, 1, 2)).
# NOTE(review): presumably the size rounded up to multiples of 4 plus the per-side padding — confirm.
compute_pad([5, 4], 4)

([8, 4], (0, 0, 1, 2))

In [None]:
# All-zero (3, 6, 2) input with a single 1 at the origin, so that the effect of
# the octave expansion along dim=-2 is easy to read off the printed tensor.
v = torch.zeros(3, 6, 2)
v[0, 0, 0] = 1
compute_octaves(v, dim=-2, n_oct=4)

tensor([[[-8.7423e-08,  0.0000e+00],
         [-1.0000e+00,  1.0000e+00],
         [ 1.7485e-07,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 3.4969e-07,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 6.9938e-07,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
 

In [None]:
import einx
import torch

# Random stand-ins with the shapes compute_view_rays is exercised with here:
# one (3, 128, 128) grid of vectors, one 3x3 inverse intrinsics matrix and
# four camera poses (rotation + translation).
vecs = torch.rand(3, 128, 128)
Kinv = torch.rand(3, 3)
R = torch.rand(4, 3, 3)
t = torch.rand(4, 3)

o, d = compute_view_rays(vecs, Kinv, R, t)

# Only the output shapes are of interest.
tuple(ray.shape for ray in (o, d))

(torch.Size([4, 3, 128, 128]), torch.Size([4, 3, 128, 128]))

Pose encoder

In [None]:
# Smoke test: run PoseEncoder on dummy inputs and look at the output shape.
B = 4
C = config.model.C

# Use a well-conditioned pinhole-style intrinsics matrix. The previous
# arange(9).reshape(3, 3) + 4.0 matrix has determinant 0, so torch.linalg.inv
# either raises or returns values dominated by round-off error.
K = torch.tensor([
    [100.0,   0.0, 2.0],
    [  0.0, 100.0, 2.5],
    [  0.0,   0.0, 1.0],
])
Kinv = torch.linalg.inv(K)

# R and t are dummy values that only need the right shapes: (B, 3, 3) and (B, 3).
R = torch.arange(B * 9, dtype=torch.float32).reshape((B, 3, 3))
t = torch.arange(B * 3, dtype=torch.float32).reshape((B, 3))
I = torch.ones((B, C, 5, 4))

pose_encoder = PoseEncoder(config.model)
# The recorded run of this cell displayed torch.Size([4, 1, 192]).
pose_encoder(Kinv, R, t, torch.arange(B) / B, I)[0].shape
# Alternative call that passes no image tensor, only its spatial size:
#pose_encoder(Kinv, R, t, torch.arange(B) / 4, None, I.shape[-2:])

torch.Size([4, 1, 192])

Displaying view rays from compute_view_rays

In [None]:
# Preprocess the first Panoptic scene; it is used for the view-ray visualisation below.
scene = preprocess_scene_videos(dataset_panoptic[0], device)

In [None]:
# Inspect the first camera entry of the scene.
scene.videos[0]

{'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f22ee098290>,
 'K': tensor([[1.4107e+03, 0.0000e+00, 9.6000e+02],
         [0.0000e+00, 1.3299e+03, 5.4000e+02],
         [0.0000e+00, 0.0000e+00, 1.0000e+00]], device='cuda:0'),
 'Kinv': tensor([[ 7.0888e-04,  0.0000e+00, -6.8053e-01],
         [ 0.0000e+00,  7.5194e-04, -4.0605e-01],
         [ 0.0000e+00,  0.0000e+00,  1.0000e+00]], device='cuda:0'),
 'R': tensor([[[-0.6212, -0.0284,  0.7832],
          [ 0.0751,  0.9926,  0.0955],
          [-0.7801,  0.1182, -0.6144]]], device='cuda:0'),
 't': tensor([[-15.3971, 117.3840, 288.2436]], device='cuda:0'),
 'time': tensor([0.0000e+00, 3.3367e-02, 6.6733e-02,  ..., 2.0254e+02, 2.0257e+02,
         2.0260e+02], device='cuda:0'),
 'shape': torch.Size([6073, 3, 1080, 1920])}

In [None]:
# One ray pair per camera, computed from the first pose of each video.
# NOTE(review): (0, 0, 0, 0) is presumably a zero padding spec — confirm against PoseEncoder.
rays = [
    pose_encoder._compute_view_rays(cam.Kinv, cam.R[:1], cam.t[:1], (0, 0, 0, 0), cam.shape[-2:])
    for cam in scene.videos
]

# Flatten the per-camera geometry lists into a single list for plotting.
geometries = list(itertools.chain.from_iterable(
    get_camera_geometry(origins.cpu(), directions.cpu(), d_multiplier=30)
    for origins, directions in rays
))

# A point cloud is drawn as well because, for some reason, line colours do not render.
draw_plotly(geometries)

In [None]:
# First element of every entry in scene 0 (the video file paths, per the recorded output).
[entry[0] for entry in dataset_panoptic.scenes[0]]

# This is the 170307_dance6 scene.
# The cameras match the positions shown in the plot.

['res/tmp/panoptic/170307_dance6/hdVideos/hd_00_03_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_00_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_05_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_06_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_02_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_07_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_04_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_01_r.mp4']

### DVST

Transformer

In [None]:
import torch
import torch.nn as nn
# The module alias is spelled out so it does not overwrite the translation
# tensor `t` defined in earlier cells.
import src.model.transformer as transformer

# Smoke test: push a random (1, 32) bf16 input through a small Encoder.
# `device` is used instead of a hard-coded .cuda() so this also runs on MPS/CPU.
encoder = transformer.Encoder(2, 32, 4, 4, True, 1e-5, 0.1, nn.GELU).to(device=device, dtype=torch.bfloat16)
tokens = torch.rand([1, 32]).to(device=device, dtype=torch.bfloat16)
# Call the module itself rather than .forward() so that registered hooks run.
encoder(tokens).shape

torch.Size([1, 32])

DVST

In [None]:
# Instantiate DVST from the model section of the config and move it to `device`.
model = DVST(config=config.model).to(device)


In [None]:
from src.utils import get_num_params

# Report total and trainable parameter counts (7.87M each in the recorded run).
get_num_params(model)


Total params: 7.87M; Trainable params: 7.87M


In [None]:
# Preprocess the first full-dataset item; `s` is the scene fed to the model in the cells below.
s = preprocess_scene_videos(dataset_full[0], device)
s

{'sources': [{'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f22e15b6a80>,
   'K': tensor([[1.6547e+03, 0.0000e+00, 9.6000e+02],
           [0.0000e+00, 1.5388e+03, 5.4000e+02],
           [0.0000e+00, 0.0000e+00, 1.0000e+00]], device='cuda:0'),
   'Kinv': tensor([[ 6.0433e-04,  0.0000e+00, -5.8016e-01],
           [ 0.0000e+00,  6.4986e-04, -3.5093e-01],
           [ 0.0000e+00,  0.0000e+00,  1.0000e+00]], device='cuda:0'),
   'R': tensor([[[-0.5296, -0.0115,  0.8482],
            [ 0.6366,  0.6554,  0.4064],
            [-0.5606,  0.7552, -0.3397]]], device='cuda:0'),
   't': tensor([[ -5.6521,  81.6465, 378.2934]], device='cuda:0'),
   'time': tensor([0.0000e+00, 3.3367e-02, 6.6733e-02,  ..., 2.0254e+02, 2.0257e+02,
           2.0260e+02], device='cuda:0'),
   'shape': torch.Size([6073, 3, 1080, 1920])},
  {'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7f22c960cb60>,
   'K': tensor([[1.4302e+03, 0.0000e+00, 9.6000e+02],
           [0.0000e+00, 1.353

The model runs out of GPU memory very quickly, so gradients should be computed right after each target frame is produced.

In [None]:
# Shrink the scene to its first 20 frames and a 64x64 crop so a forward pass is cheap.
s.n_frames = 20
cropped_shape = torch.Size((s.n_frames, 3, 64, 64))

# Sources and queries are handled identically: trim frames, crop, record the new shape.
for view in itertools.chain(s.sources, s.queries):
    view.video = view.video[:s.n_frames][:, :, :64, :64]
    view.shape = cropped_shape

# Targets are stored as bare videos, so they are replaced in place by index.
for idx, target in enumerate(s.targets):
    s.targets[idx] = target[:s.n_frames][:, :, :64, :64]

s.sources[0].video.shape

torch.Size([20, 3, 64, 64])

In [None]:
# Forward pass of the 20-frame scene under bf16 autocast; only the output shape is shown.
# (The recorded run of this cell ended in a CUDA out-of-memory error.)
with torch.autocast(device_type=device, dtype=torch.bfloat16):
    out = model(s)
out[0].shape

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 7.62 GiB of which 83.00 MiB is free. Including non-PyTorch memory, this process has 5.83 GiB memory in use. Of the allocated memory 4.17 GiB is allocated by PyTorch, and 36.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Shrink the scene further, to its first 10 frames (still a 64x64 crop).
s.n_frames = 10
cropped_shape = torch.Size((s.n_frames, 3, 64, 64))

# Sources and queries are handled identically: trim frames, crop, record the new shape.
for view in itertools.chain(s.sources, s.queries):
    view.video = view.video[:s.n_frames][:, :, :64, :64]
    view.shape = cropped_shape

# Targets are stored as bare videos, so they are replaced in place by index.
for idx, target in enumerate(s.targets):
    s.targets[idx] = target[:s.n_frames][:, :, :64, :64]

s.sources[0].video.shape

In [None]:
# Forward pass of the 10-frame scene under bf16 autocast; only the output shape is shown.
with torch.autocast(device_type=device, dtype=torch.bfloat16):
    out = model(s)
out[0].shape

Perceptual loss with ConvNeXt-Tiny

In [None]:
import torch
from src.model.loss import PerceptualLoss

perceptual_loss = PerceptualLoss()

shape = (4, 3, 64, 64)
I = torch.rand(shape)

# Identical inputs should give zero loss; all-zeros vs. all-ones should give
# the same non-zero loss in both argument orders.
image_pairs = [
    (I, I),
    (torch.zeros(shape), torch.ones(shape)),
    (torch.ones(shape), torch.zeros(shape)),
    (torch.zeros(shape), torch.zeros(shape)),
    (torch.ones(shape), torch.ones(shape)),
]
[perceptual_loss(first, second) for first, second in image_pairs]


[tensor(0., grad_fn=<SumBackward0>),
 tensor(1.4473, grad_fn=<SumBackward0>),
 tensor(1.4473, grad_fn=<SumBackward0>),
 tensor(0., grad_fn=<SumBackward0>),
 tensor(0., grad_fn=<SumBackward0>)]

In [None]:
# TODO:
# - configure the transformer encoder and decoder layers
# - add optimizations (checkpointing, mixed precision, etc.)
# - run a first test of the model with small parameters to see how much this PC can handle
# - create combinations of configs for small experiments


In [None]:
from src.datasets.panoptic.downloader import PanopticDownloader

# Fetch the Panoptic data into res/tmp/panoptic; the recorded run reported that
# it was already downloaded.
# NOTE(review): the meaning of the options below is inferred from their names only —
# confirm against PanopticDownloader before changing them.
# NOTE(review): binding `d` here replaces the ray-direction tensor `d` created in the
# compute_view_rays cell.
d = PanopticDownloader(
    path='res/tmp/panoptic',
    scene_names_file='res/panoptic_scene_names.txt',
    use_cuda=True,
    cq_amount=23,
    resize_to=(-1, 256),
    n_scenes=None,
    n_views=8,
)
d.download()


Panoptic dataset already downloaded
