In [1]:
#TODO lazy hf dataset

# https://huggingface.co/docs/datasets/en/about_mapstyle_vs_iterable
#  search for yield
#  use this to lazily load the videos (on each iteration, prefetch the next video and return the current one, downloading it first if it has not been downloaded yet)
# https://huggingface.co/docs/datasets/en/video_load
# https://huggingface.co/docs/datasets/en/video_dataset
#  create video dataset
# https://huggingface.co/docs/datasets/en/about_map_batch
#  use to map transformations (resizing etc)
# https://github.com/iejMac/video2dataset
#  check to see how to parallelize the yield (and how to create it abstractly for any dataset of scenes with a list of videos, not just panoptic)
#  actually i think i can do that just using dataset.map batched + yield and dataset.take in streaming dataset, but you would need

# make a dataset that creates a uniform distribution of different video sizes/aspect ratios/cropping options
# then evaluate the model in these environments:
#  same size/aspect/cropping on entire dataset
#  same size/aspect/cropping for videos in a scene but varying for all scenes
#  varying size/aspect/cropping for all videos in all scenes


### Importing stuff

In [2]:
import os
from pathlib import Path
import itertools
from enum import Enum
import hashlib
import math
import pickle
import json
import asyncio
import aiohttp
import random
import progressbar

from matplotlib import pyplot as plt
import open3d as o3d
from open3d.visualization import draw_plotly
from mpl_toolkits.mplot3d import Axes3D

import einops
import einx
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.nn.utils as utils
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Sampler, RandomSampler, SubsetRandomSampler, BatchSampler
import torchvision
from torchvision.io import read_image, ImageReadMode
from torchvision.utils import save_image
from torchinfo import summary
from torchcodec.decoders import VideoDecoder
import lightning as L
import lightning.pytorch as pl
import lightning.pytorch.callbacks as callbacks

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [None]:
from src.panoptic_dataset import PanopticDataset
from src.plenoptic_dataset import PlenopticDataset

from src.model.pose_encoder import compute_pad, compute_octaves

from src.config import load_config

from src.model import PoseEncoder, DVST, latent_aggregators

from src.draw import get_camera_geometry


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [4]:
# Show the installed PyTorch version so the environment of this run is recorded.
torch.__version__

'2.7.0+cu126'

In [1]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS,
# and finally the CPU as a fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device


NameError: name 'torch' is not defined

### Loading datasets

Panoptic dataset

In [6]:
# Build the Panoptic dataset from the local directory 'res/tmp/panoptic/'
# (path is relative to the notebook's working directory).
dataset_panoptic = PanopticDataset('res/tmp/panoptic/')

In [7]:
# Fetch the first dataset item with normal subscription syntax instead of
# calling the __getitem__ dunder directly, then display its first element.
# In the recorded output that element is a dict with the keys
# 'video', 'K', 'Kinv', 'R', 't', 'time' and 'shape'.
v = dataset_panoptic[0]
v[0]

{'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7fd76e14bfe0>,
 'K': tensor([[1.4107e+03, 0.0000e+00, 9.6000e+02],
         [0.0000e+00, 1.3299e+03, 5.4000e+02],
         [0.0000e+00, 0.0000e+00, 1.0000e+00]]),
 'Kinv': tensor([[ 7.0888e-04,  0.0000e+00, -6.8053e-01],
         [ 0.0000e+00,  7.5194e-04, -4.0605e-01],
         [ 0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 'R': tensor([[[-0.6212, -0.0284,  0.7832],
          [ 0.0751,  0.9926,  0.0955],
          [-0.7801,  0.1182, -0.6144]]]),
 't': tensor([[-15.3971, 117.3840, 288.2436]]),
 'time': tensor([0.0000e+00, 3.3367e-02, 6.6733e-02,  ..., 2.0254e+02, 2.0257e+02,
         2.0260e+02]),
 'shape': [6073, 3, 1080, 1920]}

In [8]:
# Unpack selected fields of the first element of `v` into their own names.
# The decoder is bound to `video` rather than back onto `v`: rebinding `v`
# from its own contents would make this cell fail when it is run a second
# time, because `v` would no longer be the indexable item it reads from.
first_cam = v[0]
video, K, R, t2 = (first_cam[key] for key in ('video', 'K', 'R', 't'))
video, K, R, t2

(<torchcodec.decoders._video_decoder.VideoDecoder at 0x7fd76e14bfe0>,
 tensor([[1.4107e+03, 0.0000e+00, 9.6000e+02],
         [0.0000e+00, 1.3299e+03, 5.4000e+02],
         [0.0000e+00, 0.0000e+00, 1.0000e+00]]),
 tensor([[[-0.6212, -0.0284,  0.7832],
          [ 0.0751,  0.9926,  0.0955],
          [-0.7801,  0.1182, -0.6144]]]),
 tensor([[-15.3971, 117.3840, 288.2436]]))

Plenoptic dataset

In [9]:
# Build the Plenoptic dataset from the local directory 'res/tmp/plenoptic/'
# (path is relative to the notebook's working directory).
dataset_plenoptic = PlenopticDataset('res/tmp/plenoptic/')

In [10]:
# Fetch the first dataset item with normal subscription syntax instead of
# calling the __getitem__ dunder directly, then display its first element.
# In the recorded output 'R' and 't' of this element are float64 tensors,
# while 'K' and 'Kinv' are printed without an explicit dtype.
v = dataset_plenoptic[0]
v[0]

{'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7fd76e14bf20>,
 'K': tensor([[1.4585e+03, 0.0000e+00, 1.3520e+03],
         [0.0000e+00, 1.4585e+03, 1.0140e+03],
         [0.0000e+00, 0.0000e+00, 1.0000e+00]]),
 'Kinv': tensor([[ 6.8564e-04,  0.0000e+00, -9.2698e-01],
         [ 0.0000e+00,  6.8564e-04, -6.9523e-01],
         [ 0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 'R': tensor([[[-0.0272,  0.8776,  0.4786],
          [ 0.9996,  0.0286,  0.0042],
          [-0.0100,  0.4786, -0.8780]]], dtype=torch.float64),
 't': tensor([[ 5.4591, -1.0853,  0.6145]], dtype=torch.float64),
 'time': tensor([0.0000e+00, 3.3333e-02, 6.6667e-02,  ..., 3.9900e+01, 3.9933e+01,
         3.9967e+01]),
 'shape': [1200, 3, 2028, 2704]}

### DVST Config

In [11]:
# Load the experiment configuration from 'res/config.yaml' into a single
# object, to make it easier to pass around and validate configs.
# Later cells read it by attribute (e.g. config.model.C).
config = load_config('res/config.yaml')

### Pose encoder

Auxiliary functions

In [12]:
# Sanity-check compute_pad on a tiny example; the recorded output of this
# call is ([8, 4], (0, 0, 1, 2)).
# NOTE(review): presumably the first value is the size [5, 4] rounded up to
# multiples of 4 and the tuple is the padding to apply — confirm against
# src.model.pose_encoder.
compute_pad([5, 4], 4)

([8, 4], (0, 0, 1, 2))

In [13]:
# Probe input for compute_octaves: an all-zero (3, 6, 2) tensor with a
# single 1 at position [0, 0, 0].
v = torch.zeros(3, 6, 2)
v[(0, 0, 0)] = 1
compute_octaves(v, n_oct=4, dim=-2)

tensor([[[-8.7423e-08,  0.0000e+00],
         [-1.0000e+00,  1.0000e+00],
         [ 1.7485e-07,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 3.4969e-07,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 6.9938e-07,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 1.0000e+00,  1.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
 

Pose encoder

In [14]:
# Smoke test: push synthetic inputs through PoseEncoder and inspect the
# shape of the first returned value.
B = 4
C = config.model.C

# Synthetic pinhole-style intrinsics for the 5x4 dummy image below.
# A well-conditioned matrix is used on purpose: an arange-based one such as
# [[4, 5, 6], [7, 8, 9], [10, 11, 12]] has determinant 0 and cannot be
# inverted reliably.
K = torch.tensor([
    [4.0, 0.0, 2.0],
    [0.0, 4.0, 2.5],
    [0.0, 0.0, 1.0],
])
Kinv = torch.linalg.inv(K)

# Placeholder extrinsics: these are just float ranges of the right shape,
# R is not a real rotation matrix.
R = torch.arange(B * 9).reshape((B, 3, 3)) + 0.0
t = torch.arange(B * 3).reshape((B, 3)) + 0.0
# Dummy image batch. NOTE(review): presumably laid out as (B, C, H, W) — confirm.
I = torch.ones((B, C, 5, 4))

pose_encoder = PoseEncoder(config)
pose_encoder(Kinv, R, t, torch.arange(B) / 4, I)[0].shape  # last recorded run printed torch.Size([4, 1, 192])
#pose_encoder(Kinv, R, t, torch.arange(B) / 4, None, I.shape[-2:])

torch.Size([4, 1, 192])

Displaying view rays from compute_view_rays

In [15]:
# First Panoptic item, fetched with subscription syntax rather than a direct
# __getitem__ call. It is iterated camera-by-camera further down.
cams = dataset_panoptic[0]

In [16]:
# Display the first camera entry of the item to check its fields.
cams[0]

{'video': <torchcodec.decoders._video_decoder.VideoDecoder at 0x7fd76ccdcc20>,
 'K': tensor([[1.4107e+03, 0.0000e+00, 9.6000e+02],
         [0.0000e+00, 1.3299e+03, 5.4000e+02],
         [0.0000e+00, 0.0000e+00, 1.0000e+00]]),
 'Kinv': tensor([[ 7.0888e-04,  0.0000e+00, -6.8053e-01],
         [ 0.0000e+00,  7.5194e-04, -4.0605e-01],
         [ 0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 'R': tensor([[[-0.6212, -0.0284,  0.7832],
          [ 0.0751,  0.9926,  0.0955],
          [-0.7801,  0.1182, -0.6144]]]),
 't': tensor([[-15.3971, 117.3840, 288.2436]]),
 'time': tensor([0.0000e+00, 3.3367e-02, 6.6733e-02,  ..., 2.0254e+02, 2.0257e+02,
         2.0260e+02]),
 'shape': [6073, 3, 1080, 1920]}

In [17]:
# Compute the view rays of every camera in `cams`, using only the first
# rotation/translation of each camera and the last two entries of its shape.
# NOTE(review): the (0, 0, 0, 0) tuple is presumably "no padding" — confirm.
rays = [
    pose_encoder._compute_view_rays(
        camera['Kinv'],
        camera['R'][:1],
        camera['t'][:1],
        (0, 0, 0, 0),
        camera['shape'][-2:],
    )
    for camera in cams
]

# Flatten the per-camera geometry lists into one list, keeping camera order.
geometries = list(
    itertools.chain.from_iterable(
        get_camera_geometry(o, d, d_multiplier=30) for o, d in rays
    )
)

# Also using point cloud bc for some reason colors in lines are not working
draw_plotly(geometries)

In [18]:
# List the first field of every entry in the first scene (video paths in the
# recorded output).
# it is the 170307_dance6 dataset
# The cameras match with the positions in the plot
[entry[0] for entry in dataset_panoptic.data[0]]

['res/tmp/panoptic/170307_dance6/hdVideos/hd_00_03_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_00_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_05_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_06_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_02_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_07_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_04_r.mp4',
 'res/tmp/panoptic/170307_dance6/hdVideos/hd_00_01_r.mp4']

### DVST

Transformer

In [1]:
# Smoke test of the standalone transformer encoder in bfloat16.
# `torch` and `nn` come from the import cell at the top of the notebook.
# The module is aliased as `transformer`, not `t`, so it does not shadow the
# translation tensor `t` created in the pose-encoder cell.
import src.transformer as transformer

# Use the `device` selected earlier in the notebook instead of hard-coding
# CUDA, so the cell also runs on machines without a CUDA GPU.
a = transformer.Encoder(2, 32, 4, 4, True, 1e-5, nn.GELU, 0.1).to(device=device, dtype=torch.bfloat16)
# Call the module itself rather than .forward() so any registered hooks run.
# NOTE(review): assumes Encoder is an nn.Module — confirm.
a(torch.rand([1, 32]).to(device=device, dtype=torch.bfloat16)).shape

torch.Size([1, 32])

DVST

In [None]:
# Instantiate the full DVST model from the loaded configuration object.
model = DVST(config=config)

In [20]:
#TODO:
# configure transformer enc and dec layers
# add optimizations checkpointing mixed precision etc
# do first testing of model w small parameters and check how much the pc can handle of it
# create combinations of configs for small experiments
