#### Load dependencies

In [1]:
import os

import numpy as np
import rembg
import torch
from diffusers import DiffusionPipeline
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision.transforms import v2

from src.models.lrm_mesh import InstantMesh
from src.utils.camera_util import get_zero123plus_input_cameras
from src.utils.mesh_util import save_obj_with_mtl

# Target device for the models and tensors below; hard-coded to CUDA.
device = torch.device("cuda")

  from .autonotebook import tqdm as notebook_tqdm


#### Load multi-view diffusion model

In [2]:
# Load the multi-view diffusion pipeline in half precision, using the
# "zero123plus" custom pipeline implementation.
zero123plus_options = dict(
    custom_pipeline="zero123plus",
    torch_dtype=torch.float16,
)
pipeline = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2", **zero123plus_options
)

Loading pipeline components...: 100%|██████████| 8/8 [00:01<00:00,  6.69it/s]


#### Load custom white-background weights

In [3]:
# Fetch the replacement UNet weights from the InstantMesh model repository.
custom_ckpt_path = hf_hub_download(
    repo_id="TencentARC/InstantMesh",
    filename="diffusion_pytorch_model.bin",
    repo_type="model",
)

# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered download cannot execute arbitrary code. The file is used purely as a
# state dict, so nothing else needs to be unpickled.
state_dict = torch.load(custom_ckpt_path, map_location="cpu", weights_only=True)

# strict=True makes a key mismatch between checkpoint and UNet fail loudly
# instead of silently leaving some weights untouched.
pipeline.unet.load_state_dict(state_dict, strict=True)
pipeline = pipeline.to(device)

  state_dict = torch.load(custom_ckpt_path, map_location="cpu")


#### Load LRM

In [4]:
model_ckpt_path = hf_hub_download(
    repo_type="model",
    repo_id="TencentARC/InstantMesh",
    filename="instant_mesh_large.ckpt",
)

# Constructor arguments for the reconstruction model, gathered in one place.
lrm_config = dict(
    encoder_feat_dim=768,
    encoder_freeze=False,
    encoder_model_name="facebook/dino-vitb16",
    transformer_dim=1024,
    transformer_layers=16,
    transformer_heads=16,
    triplane_low_res=32,
    triplane_high_res=64,
    triplane_dim=80,
    rendering_samples_per_ray=128,
    grid_res=128,
    grid_scale=2.1,
)
model = InstantMesh(**lrm_config)

# NOTE(review): torch.load unpickles the downloaded file; only use checkpoints
# from a trusted source.
checkpoint_state = torch.load(model_ckpt_path, map_location="cpu")["state_dict"]

# Keep only the entries stored under the "lrm_generator." prefix, strip that
# prefix from their keys, and skip any key mentioning "source_camera".
generator_prefix = "lrm_generator."
state_dict = {
    name[len(generator_prefix):]: weights
    for name, weights in checkpoint_state.items()
    if name.startswith(generator_prefix) and "source_camera" not in name
}
del checkpoint_state  # release the entries that were filtered out
model.load_state_dict(state_dict, strict=True)

# Move to the target device first: the geometry initialisation below is told
# which device to use.
model = model.to(device)
model.init_flexicubes_geometry(device, fovy=30.0)
model = model.eval()

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb16 and are newly initialized: ['encoder.layer.0.adaLN_modulation.1.bias', 'encoder.layer.0.adaLN_modulation.1.weight', 'encoder.layer.1.adaLN_modulation.1.bias', 'encoder.layer.1.adaLN_modulation.1.weight', 'encoder.layer.10.adaLN_modulation.1.bias', 'encoder.layer.10.adaLN_modulation.1.weight', 'encoder.layer.11.adaLN_modulation.1.bias', 'encoder.layer.11.adaLN_modulation.1.weight', 'encoder.layer.2.adaLN_modulation.1.bias', 'encoder.layer.2.adaLN_modulation.1.weight', 'encoder.layer.3.adaLN_modulation.1.bias', 'encoder.layer.3.adaLN_modulation.1.weight', 'encoder.layer.4.adaLN_modulation.1.bias', 'encoder.layer.4.adaLN_modulation.1.weight', 'encoder.layer.5.adaLN_modulation.1.bias', 'encoder.layer.5.adaLN_modulation.1.weight', 'encoder.layer.6.adaLN_modulation.1.bias', 'encoder.layer.6.adaLN_modulation.1.weight', 'encoder.layer.7.adaLN_modulation.1.bias', 'encoder.layer.7.adaLN_modulation.1.w

#### Open input

In [5]:
# Path of the single input image used by the following cells.
input_filename = "examples/A_cartoon_house_with_red_roof.jpg"
input_image = Image.open(input_filename)

#### Remove background

In [6]:
# Intermediate artifacts are written under tmp/. This is the first cell that
# writes there, so create the directory up front; otherwise the save below
# fails on a fresh checkout.
os.makedirs("tmp", exist_ok=True)

# Remove the background from the input image and keep a copy for inspection.
input_image_rembg = rembg.remove(input_image)
input_image_rembg.save("tmp/input_image_rembg.png")

#### Crop and center foreground

In [7]:
# Fraction of the final square canvas covered by the foreground's longer side.
foreground_ratio = 0.85

image_numpy = np.array(input_image_rembg)

# Row/column indices of every pixel with non-zero alpha. Indexing channel 3
# assumes the background-removed image is RGBA.
alpha = np.where(image_numpy[..., 3] > 0)
if alpha[0].size == 0:
    # min()/max() on an empty array would raise an opaque error; be explicit.
    raise ValueError("No foreground pixels found after background removal.")

y1, y2, x1, x2 = (
    alpha[0].min(),
    alpha[0].max(),
    alpha[1].min(),
    alpha[1].max(),
)

# y2 / x2 are inclusive pixel indices while slice ends are exclusive, so add 1.
# Without it the last foreground row and column are cut off.
fg = image_numpy[y1:y2 + 1, x1:x2 + 1]

# Step 1: pad the shorter side with transparent pixels to make the crop square,
# splitting the padding as evenly as possible between both sides.
size = max(fg.shape[0], fg.shape[1])
ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
input_image_fg = np.pad(
    fg,
    ((ph0, ph1), (pw0, pw1), (0, 0)),
    mode="constant",
    constant_values=0,
)

# Step 2: add a uniform transparent margin so the square foreground occupies
# `foreground_ratio` of the final canvas.
new_size = int(size / foreground_ratio)
ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
ph1, pw1 = new_size - size - ph0, new_size - size - pw0
input_image_fg = np.pad(
    input_image_fg,
    ((ph0, ph1), (pw0, pw1), (0, 0)),
    mode="constant",
    constant_values=0,
)

input_image_fg = Image.fromarray(input_image_fg)
input_image_fg.save("tmp/input_image_fg.png")

#### Run multi-view diffusion

In [8]:
# Run the diffusion pipeline on the background-removed, centred foreground
# image. Passing the raw `input_image` here would make the two preprocessing
# cells above have no effect on the result.
output_image = pipeline(input_image_fg, num_inference_steps=75).images[0]
output_image.save("tmp/output_image.png")

100%|██████████| 75/75 [00:07<00:00,  9.72it/s]


#### Split views

In [9]:
# Turn the generated image into a channels-first float tensor scaled by 1/255.
grid = np.asarray(output_image, dtype=np.float32) / 255.0
grid = torch.from_numpy(grid).permute(2, 0, 1).contiguous().float()

# The image is treated as a grid of n rows by m columns of equally sized views.
n, m = 3, 2
c, h, w = grid.shape
tile_h, tile_w = h // n, w // m

# (c, n, tile_h, m, tile_w) -> (n, m, c, tile_h, tile_w) -> (n * m, c, tile_h, tile_w)
images = grid.view(c, n, tile_h, m, tile_w)
images = images.permute(1, 3, 0, 2, 4).contiguous()
images = images.view(n * m, c, tile_h, tile_w)

# Write each view to disk for visual inspection.
for i in range(images.shape[0]):
    image = images[i].permute(1, 2, 0).numpy()
    image = Image.fromarray((image * 255).astype(np.uint8))
    image.save(f"tmp/output_image_{i}.png")

#### Prepare inputs

In [10]:
input_cameras = get_zero123plus_input_cameras().to(device)

# Add a leading batch dimension and move the views to the target device.
images_processed = images.unsqueeze(0).to(device)

# Resize so the shorter spatial side becomes 320. The interpolation mode is
# spelled out by name instead of the integer code 3, which stands for bicubic.
# Bicubic resampling can overshoot, so clamp back into [0, 1].
images_processed = v2.functional.resize(
    images_processed,
    320,
    interpolation=v2.InterpolationMode.BICUBIC,
    antialias=True,
).clamp(0, 1)

torch.save(images_processed, "tmp/images_processed.pt")

#### Run LRM

In [11]:
mesh_path = "tmp/mesh.obj"

# Inference only: no gradients are needed for reconstruction or mesh export.
with torch.no_grad():
    planes = model.forward_planes(images_processed, input_cameras)
    mesh_out = model.extract_mesh(
        planes,
        use_texture_map=True,
        texture_resolution=1024,
    )
    vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out

    # Convert everything to NumPy in the positional order the writer is called
    # with (uvs before faces); the texture map is permuted with (1, 2, 0) first.
    mesh_arrays = [
        tensor.data.cpu().numpy()
        for tensor in (vertices, uvs, faces, mesh_tex_idx, tex_map.permute(1, 2, 0))
    ]
    save_obj_with_mtl(*mesh_arrays, mesh_path)