In [None]:
# Fetch the face-parsing.PyTorch sources into ./face-parsing.PyTorch;
# a later cell adds that directory to sys.path.
!git clone https://github.com/zllrunning/face-parsing.PyTorch

In [None]:
import sys

# Put the cloned repository on the import path (presumably so that the later
# `from model import BiSeNet` resolves against it). The membership check keeps
# the cell idempotent: re-running it no longer stacks duplicate entries.
if './face-parsing.PyTorch' not in sys.path:
    sys.path.append('./face-parsing.PyTorch')

In [None]:
!pip install gdown

In [None]:
import os

import gdown

url = 'https://drive.google.com/uc?id=154JgKpzCPW82qINcVieuPH3fZ2e0P812'
output = 'weights.pth'

# Skip the download when the checkpoint is already on disk so that
# "Restart & Run All" does not fetch it again.
if not os.path.exists(output):
    downloaded = gdown.download(url, output, quiet=False)
    # NOTE(review): some gdown releases report a failed download by returning
    # None instead of raising; fail here rather than later inside torch.load.
    if downloaded is None:
        raise RuntimeError(f"Could not download model weights from {url}")

In [None]:
def vis_parsing_maps_for_video(im, parsing_anno, stride, grid_size):
    """Build one video frame: the input image next to a grid-line visualisation.

    The right half is a grayscale copy of ``im`` blended 70/30 with coloured
    diagonal lines. The (resized) label map is split into cells of
    ``h // grid_size`` by ``w // grid_size`` pixels; both diagonals of every
    cell are drawn and each line pixel takes the colour assigned to the label
    underneath it. Cells in the last row/column of the grid are not drawn.

    Args:
        im: RGB image (anything ``np.array`` accepts, e.g. a PIL image). It is
            treated as RGB because the grayscale conversion uses
            ``COLOR_RGB2GRAY``; it is expected to have the same height and
            width as the resized label map.
        parsing_anno: 2-D integer label map (NumPy array).
        stride: scale factor applied to ``parsing_anno`` with nearest-neighbour
            interpolation.
        grid_size: number of cells along each axis.

    Returns:
        ``uint8`` array of shape ``(h, 2 * w, 3)`` in BGR channel order
        (the order ``cv2.VideoWriter`` expects): original image on the left,
        visualisation on the right.

    Raises:
        ValueError: if ``grid_size`` exceeds the label map size, so that a
            cell would be zero pixels wide or tall.
    """
    # Colour table in BGR order, looked up with ``label - 1``. Label 0
    # therefore resolves to index -1, i.e. the last entry (23).
    part_colors = [[0, 0, 0] for _ in range(24)]
    # red skin
    for idx in (0, 6, 7, 9, 13):
        part_colors[idx] = [0, 0, 175]
    # red lips
    for idx in (11, 12):
        part_colors[idx] = [0, 0, 175]
    # blue background
    part_colors[23] = [255, 0, 0]
    lut = np.array(part_colors, dtype=np.uint8)

    frame_rgb = np.array(im).astype(np.uint8)
    labels = parsing_anno.astype(np.uint8)
    labels = cv2.resize(labels, None, fx=stride, fy=stride, interpolation=cv2.INTER_NEAREST)

    h, w = labels.shape
    h_step = h // grid_size
    w_step = w // grid_size
    if h_step == 0 or w_step == 0:
        raise ValueError(
            f"grid_size={grid_size} is too large for a {h}x{w} label map"
        )

    # Mark every pixel lying on a cell diagonal. Index arrays broadcast to
    # (cell rows, cell columns, offset along the diagonal).
    offsets = np.arange(min(h_step, w_step))
    rows = np.arange(0, h - h_step, h_step)[:, None, None] + offsets
    left = np.arange(0, w - w_step, w_step)[None, :, None]
    on_line = np.zeros((h, w), dtype=bool)
    on_line[rows, left + offsets] = True           # top-left -> bottom-right
    on_line[rows, left + w_step - offsets] = True  # top-right -> bottom-left

    # Widen the labels before subtracting: on uint8, ``0 - 1`` wraps to 255
    # under NumPy 2 promotion rules and would index past the colour table.
    overlay = np.zeros((h, w, 3), dtype=np.uint8)
    overlay[on_line] = lut[labels[on_line].astype(np.intp) - 1]

    gray = cv2.cvtColor(cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2GRAY), cv2.COLOR_GRAY2RGB)
    weight_dis = 0.7
    blended = cv2.addWeighted(gray, weight_dis, overlay, 1.0 - weight_dis, 0)

    # The overlay colours are BGR, so the untouched original must be converted
    # too; otherwise its red and blue channels are swapped in the video.
    original_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)

    return np.hstack((original_bgr, blended))

In [None]:
import cv2
import numpy as np

def vis_parsing_maps_for_video(im, parsing_anno, stride, grid_size):
    """Build one video frame: the input image next to a grid-line visualisation.

    The right half is a grayscale copy of ``im`` blended 70/30 with coloured
    diagonal lines. The (resized) label map is split into cells of
    ``h // grid_size`` by ``w // grid_size`` pixels; both diagonals of every
    cell are drawn and each line pixel takes the colour assigned to the label
    underneath it. Cells in the last row/column of the grid are not drawn.

    Args:
        im: RGB image (anything ``np.array`` accepts, e.g. a PIL image). It is
            treated as RGB because the grayscale conversion uses
            ``COLOR_RGB2GRAY``; it is expected to have the same height and
            width as the resized label map.
        parsing_anno: 2-D integer label map (NumPy array).
        stride: scale factor applied to ``parsing_anno`` with nearest-neighbour
            interpolation.
        grid_size: number of cells along each axis.

    Returns:
        ``uint8`` array of shape ``(h, 2 * w, 3)`` in BGR channel order
        (the order ``cv2.VideoWriter`` expects): original image on the left,
        visualisation on the right.

    Raises:
        ValueError: if ``grid_size`` exceeds the label map size, so that a
            cell would be zero pixels wide or tall.
    """
    # Colours are BGR. The table is looked up with ``label - 1``, so label 0
    # resolves to index -1, i.e. the last entry (23).
    part_colors = {
        'skin': [0, 0, 175],
        'lips': [0, 0, 175],
        'background': [255, 0, 0],
    }
    color_map = [[0, 0, 0] for _ in range(24)]
    for idx in (0, 6, 7, 9, 13):
        color_map[idx] = part_colors['skin']
    for idx in (11, 12):
        color_map[idx] = part_colors['lips']
    color_map[23] = part_colors['background']
    lut = np.array(color_map, dtype=np.uint8)

    frame_rgb = np.array(im, dtype=np.uint8)
    labels = cv2.resize(parsing_anno, None, fx=stride, fy=stride, interpolation=cv2.INTER_NEAREST)

    h, w = labels.shape
    h_step, w_step = h // grid_size, w // grid_size
    if h_step == 0 or w_step == 0:
        raise ValueError(
            f"grid_size={grid_size} is too large for a {h}x{w} label map"
        )

    # Mark every pixel lying on a cell diagonal. Index arrays broadcast to
    # (cell rows, cell columns, offset along the diagonal).
    offsets = np.arange(min(h_step, w_step))
    rows = np.arange(0, h - h_step, h_step)[:, None, None] + offsets
    left = np.arange(0, w - w_step, w_step)[None, :, None]
    on_line = np.zeros((h, w), dtype=bool)
    on_line[rows, left + offsets] = True           # top-left -> bottom-right
    on_line[rows, left + w_step - offsets] = True  # top-right -> bottom-left

    # Only line pixels are coloured; a signed index keeps ``0 - 1 == -1``
    # whatever integer dtype the label map arrives in.
    overlay = np.zeros((h, w, 3), dtype=np.uint8)
    overlay[on_line] = lut[labels[on_line].astype(np.intp) - 1]

    gray = cv2.cvtColor(cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2GRAY), cv2.COLOR_GRAY2RGB)
    weight_dis = 0.7
    blended = cv2.addWeighted(gray, weight_dis, overlay, 1.0 - weight_dis, 0)

    # The overlay colours are BGR, so the untouched original must be converted
    # too; otherwise its red and blue channels are swapped in the video.
    original_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)

    return np.hstack((original_bgr, blended))


In [None]:
import torch
import torchvision.transforms as transforms
import cv2
from model import BiSeNet
from PIL import Image
import os
import os.path as osp
import numpy as np
from tqdm import tqdm

# Check for CUDA availability and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")

# Directory scanned for input images (.jpg / .png, case-insensitive).
dspth = './'

fourcc = cv2.VideoWriter_fourcc(*'mp4v')

n_classes = 19
net = BiSeNet(n_classes=n_classes)
net.to(device)
# NOTE(review): torch.load unpickles the checkpoint, which can execute code
# from an untrusted file. Consider weights_only=True if the installed torch
# version supports it.
net.load_state_dict(torch.load('weights.pth', map_location=device))
net.eval()

to_tensor = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

grid_sizes = [100, 50, 25, 10]  # Define grid sizes

# Sorted so the frame order does not depend on os.listdir's arbitrary order.
image_names = sorted(
    name for name in os.listdir(dspth)
    if name.lower().endswith(('.jpg', '.png'))
)
if not image_names:
    raise FileNotFoundError(f"No .jpg/.png images found in {dspth!r}")

# One writer for the whole run. Re-creating it per image overwrote
# result.mp4 each time, and releasing it outside the image branch raised
# NameError whenever a non-image entry was listed first.
out_vid = cv2.VideoWriter('result.mp4', fourcc, 1.0, (1024, 512))
if not out_vid.isOpened():
    raise RuntimeError("Could not open result.mp4 for writing")

try:
    with torch.no_grad():
        for image_path in image_names:
            # convert('RGB') guarantees the three channels that Normalize and
            # the visualisation expect (PNG files may be RGBA or grayscale).
            with Image.open(osp.join(dspth, image_path)) as img_file:
                image = img_file.convert('RGB').resize((512, 512), Image.BILINEAR)

            img = to_tensor(image)
            img = torch.unsqueeze(img, 0)
            img = img.to(device)
            out = net(img)[0]
            parsing = out.squeeze(0).cpu().numpy().argmax(0)

            for grid in tqdm(grid_sizes):
                vis_im = vis_parsing_maps_for_video(image, parsing, stride=1, grid_size=grid)
                # Add the same image to the video for 2 seconds (given 1 FPS)
                out_vid.write(vis_im)
                out_vid.write(vis_im)
finally:
    # Always finalise the container, even if inference fails midway.
    out_vid.release()

In [None]:
# Import from IPython.display: importing display from IPython.core.display
# has been deprecated since IPython 7.14.
from IPython.display import HTML, display

video_path = "result.mp4"
# <video> is not a void element, so it is closed with an explicit end tag
# rather than a self-closing slash.
video_tag = f"""
<video controls src="{video_path}" width="1024" height="512"></video>
"""

display(HTML(video_tag))
