In [1]:
cd ~/workspace/mmf_nr/

/private/home/ronghanghu/workspace/mmf_nr


In [2]:
import numpy as np
import torch.utils.data as data
# from PIL import Image
from skimage.io import imread
from skimage.transform import resize
# from torchvision.transforms import Compose, Normalize, Resize, ToTensor
from mmf.datasets.builders.synsin_realestate10k.geometry import get_deltas


class RealEstate10K(data.Dataset):
    """Dataset that samples camera-view tuples from RealEstate10K videos.

    For every video a reference frame is drawn at random, and the remaining
    views are drawn from nearby frames, preferring frames whose rotation or
    translation relative to the reference exceeds a threshold.

    Only camera metadata is returned (intrinsics, extrinsics, video and frame
    identifiers); no images are loaded by this class.
    """

    def __init__(
        self, dataset, opts=None, num_views=2, seed=0, vectorize=False
    ):
        """Load the video list and keep the requested 80/20 split.

        Args:
            dataset: ``"train"`` keeps the first 80% of the video list; any
                other value keeps the remaining 20%.
            opts: options object providing ``video_list`` (path to a text
                file of video names), ``train_data_path`` (directory holding
                one ``<video>.txt`` file per video) and ``W`` (scale applied
                to the intrinsics). Required despite the ``None`` default.
            num_views: number of views returned per item.
            seed: seed of the private ``RandomState`` used for sampling.
            vectorize: accepted for interface compatibility; unused here.
        """
        # `np.str` was only an alias of the builtin `str` and no longer exists
        # in NumPy >= 1.24. `ndmin=1` keeps a single-entry list as a 1-D array
        # so the slicing and `shape[0]` below remain valid.
        self.imageset = np.loadtxt(opts.video_list, dtype=str, ndmin=1)

        split_point = int(0.8 * self.imageset.shape[0])
        if dataset == "train":
            self.imageset = self.imageset[:split_point]
        else:
            self.imageset = self.imageset[split_point:]

        self.rng = np.random.RandomState(seed)
        self.base_file = opts.train_data_path

        self.num_views = num_views
        self.W = opts.W

        # NOTE(review): this ignores the `dataset` argument and is always
        # "train"; nothing in this class reads it. Kept unchanged for
        # compatibility -- confirm whether `dataset` was intended.
        self.dataset = "train"

        # Thresholds above which a candidate frame counts as a "hard" view
        # change (compared against the outputs of `get_deltas`).
        self.ANGLE_THRESH = 5
        self.TRANS_THRESH = 0.15

    def __len__(self):
        """Return the number of videos in the selected split."""
        return len(self.imageset)

    def __getitem__(self, index):
        """Sample ``num_views`` camera views from the video at ``index``.

        Returns:
            dict with four lists of length ``num_views``:
            ``"all_intrinsics"`` (columns 1:7 of the frame row, scaled by
            ``W``), ``"all_extrinsics"`` (columns 7: of the frame row),
            ``"video_inds"`` (the video name, repeated) and ``"frame_inds"``
            (column 0 of the frame row as an integer string).
        """
        # One row per frame: column 0 is the frame id, columns 1:7 the
        # intrinsics and columns 7: a flattened 3x4 extrinsic matrix.
        # `ndmin=2` keeps a single-frame file two-dimensional so the row
        # indexing below still works.
        frames = np.loadtxt(
            self.base_file + "/%s.txt" % self.imageset[index], ndmin=2
        )
        num_frames = frames.shape[0]

        image_index = self.rng.choice(num_frames, size=(1,))[0]

        # Draw 30 candidate frames with offsets in [-40, 40) around the
        # reference frame, clamped to the valid frame range.
        image_indices = self.rng.randint(80, size=(30,)) - 40 + image_index
        image_indices = np.minimum(
            np.maximum(image_indices, 0), num_frames - 1
        )

        # Measure the pose change of every candidate w.r.t. the reference.
        orig_viewpoint = frames[image_index, 7:].reshape(3, 4)
        angles = []
        translations = []
        for candidate in image_indices:
            new_viewpoint = frames[candidate, 7:].reshape(3, 4)
            dang, dtrans = get_deltas(orig_viewpoint, new_viewpoint)
            angles.append(dang)
            translations.append(dtrans)

        angles = np.array(angles)
        translations = np.array(translations)

        # Candidate frame indices whose pose change exceeds a threshold.
        hard_indices = image_indices[
            (angles > self.ANGLE_THRESH) | (translations > self.TRANS_THRESH)
        ]

        all_intrinsics = []
        all_extrinsics = []
        video_inds = []
        frame_inds = []
        for i in range(0, self.num_views):
            if i == 0:
                # The first view is always the reference frame.
                t_index = image_index
            elif hard_indices.shape[0] > 5:
                # Enough hard candidates: choose a harder pose change.
                t_index = hard_indices[
                    self.rng.randint(hard_indices.shape[0])
                ]
            else:
                # Too few hard candidates: fall back to any candidate.
                t_index = image_indices[
                    self.rng.randint(image_indices.shape[0])
                ]

            video_inds.append(self.imageset[index])
            frame_inds.append(str(int(frames[t_index, 0])))

            all_intrinsics.append(frames[t_index, 1:7] * self.W)
            all_extrinsics.append(frames[t_index, 7:])

        return {
            "all_intrinsics": all_intrinsics,
            "all_extrinsics": all_extrinsics,
            "video_inds": video_inds,
            "frame_inds": frame_inds,
        }

In [3]:
class SynSinDatasetOption:
    """Plain options holder exposing the attributes read by RealEstate10K.

    Attributes set on the instance:
        train_data_path: the ``data_path`` argument.
        video_list: the ``video_list`` argument.
        W: the ``image_size`` argument.
    """

    def __init__(self, data_path, video_list, image_size):
        self.train_data_path, self.video_list = data_path, video_list
        self.W = image_size

In [4]:
# Point the options at the RealEstate10K frame metadata and the video list,
# then build the dataset for the non-"train" (last 20%) split.
opts = SynSinDatasetOption(
    data_path="/checkpoint/ronghanghu/neural_rendering_datasets/realestate10K/RealEstate10K/frames/train/",
    video_list="/checkpoint/ronghanghu/neural_rendering_datasets/realestate10K/RealEstate10K/frames/train/video_loc.txt",
    image_size=256,
)

dataset = RealEstate10K(dataset="val", opts=opts)

In [5]:
# Serialize one sampled view pair per video as a space-separated text line:
#   video_id frame_id_0 frame_id_1 <first 4 intrinsics> <extrinsics 0> <extrinsics 1>
lines = []
for idx in range(len(dataset)):
    sample = dataset[idx]

    # Both views must come from the same video.
    video_names = sample['video_inds']
    assert video_names[0] == video_names[1]
    video_id = video_names[0]

    frame_id_0, frame_id_1 = sample['frame_inds']

    # Both views are expected to share identical intrinsics, so only the
    # first view's values are written.
    shared_intrinsics = sample['all_intrinsics'][0]
    assert np.all(shared_intrinsics == sample['all_intrinsics'][1])

    fields = [video_id, frame_id_0, frame_id_1]
    fields.extend(str(value) for value in shared_intrinsics[:4])
    fields.extend(str(value) for value in sample['all_extrinsics'][0])
    fields.extend(str(value) for value in sample['all_extrinsics'][1])

    lines.append(' '.join(fields) + '\n')

In [6]:
# Shuffle with a fixed seed, keep the first 5000 entries and write them sorted.
# The shuffle runs on a copy so `lines` is left untouched: re-running this cell
# then always reproduces the same file instead of re-shuffling shuffled data.
np.random.seed(3)
shuffled_lines = list(lines)
np.random.shuffle(shuffled_lines)
val_lines = sorted(shuffled_lines[:5000])

with open("mmf/datasets/builders/synsin_realestate10k/realestate_val5000.txt", "w") as f:
    f.writelines(val_lines)