In [1]:
import math
import os
import gc
import sys
import time

from typing import Dict, List, Tuple

from numba import jit, njit
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
os.environ['MKL_DEBUG_CPU_TYPE'] = '5'

In [3]:
BASE_DIR = '/home/dmitry/projects/dfdc'
SRC_DIR = os.path.join(BASE_DIR, 'src')
DATA_DIR = os.path.join(BASE_DIR, 'data/dfdc-videos')
SAVE_DIR = os.path.join(BASE_DIR, 'data/dfdc-crops')

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import cv2

import torch
import torchvision

from torch import Tensor
from torchvision import ops

import nvidia.dali as dali
from nvidia.dali.plugin.pytorch import DALIGenericIterator

# src
sys.path.insert(0, SRC_DIR)
from sample.reader import VideoReader
from dataset.utils import read_labels

# Pytorch_Retinaface
sys.path.insert(0, os.path.join(BASE_DIR, 'Pytorch_Retinaface'))
from data import cfg_mnet
from layers.functions.prior_box import PriorBox
from models.retinaface import RetinaFace
from detect_utils import decode_batch, detect, load_model, postproc_detections
from utils.nms.py_cpu_nms import py_cpu_nms

In [5]:
@njit
def calc_axis(c0, c1, pad, cmax):
    c0 = max(0, c0 - pad)
    c1 = min(cmax, c1 + pad)
    return c0, c1, c1 - c0


@njit
def expand_bbox(bbox, pct):
    bbox = np.copy(bbox)
    bbox[:2] *= 1 - pct
    bbox[2:] *= 1 + pct
    return bbox


@njit
def crop_face(img, bbox, pad_pct=0.05, square=True):
    img_h, img_w, _ = img.shape
    
    if pad_pct > 0:
        bbox = expand_bbox(bbox, pad_pct)
        
    x0, y0, x1, y1 = bbox.astype(np.int16)
    
    if square:
        w, h = x1 - x0, y1 - y0
        if w > h:
            pad = (w - h) // 2
            y0, y1, h = calc_axis(y0, y1, pad, img_h)
        elif h > w:
            pad = (h - w) // 2
            x0, x1, w = calc_axis(x0, x1, pad, img_w)
    
    size = min(w, h)
    face = img[y0:y1, x0:x1][:size, :size]
    return face

In [6]:
def round_num_faces(num_faces, frac_thresh=0.25):
    avg = num_faces.mean()
    fraction, integral = np.modf(avg)
    rounded = integral if fraction < frac_thresh else integral + 1
    return int(rounded)

In [7]:
class VideoPipe(dali.pipeline.Pipeline):
    def __init__(self, filenames: List[str], seq_len=30, stride=10, 
                 batch_size=1, num_threads=1, device_id=0):
        super(VideoPipe, self).__init__(
            batch_size, num_threads, device_id, seed=3)
        self.input = dali.ops.VideoReader(
            device='gpu', filenames=filenames, 
            sequence_length=seq_len,
            shard_id=0, num_shards=1)

    def define_graph(self):
        output = self.input(name='reader')
        return output
    
    
def get_file_list(df: pd.DataFrame, start: int, end: int, 
                  base_dir:str=DATA_DIR) -> List[str]:
    path_fn = lambda row: os.path.join(base_dir, row.dir, row.name)
    return df.iloc[start:end].apply(path_fn, axis=1).values.tolist()


def build_data_iter(files: List[str]):
    pipe = VideoPipe(files)
    pipe.build()
    return DALIGenericIterator([pipe], ['images'], len(files))


def init_detector(cfg, weights, use_cpu=False):
    cfg['pretrain'] = False
    net = RetinaFace(cfg=cfg, phase='test')
    net = load_model(net, weights, use_cpu)
    net.eval()
    return net


def mkdirs(base_dir: str, chunk_dirs: List[str]) -> None:
    for chunk_dir in chunk_dirs:
        dir_path = os.path.join(base_dir, chunk_dir)
        if not os.path.isdir(dir_path):
            os.mkdir(dir_path)

In [8]:
def prepare_imgs(sample: np.ndarray) -> Tuple[Tensor, Tensor]:
    n, h, w, c = sample.shape
    
    imgs = sample.float()
    imgs -= torch.tensor([104, 117, 123], device=imgs.device)
    imgs = imgs.permute(0, 3, 1, 2)

    scale = torch.tensor([w, h, w, h])
    return imgs, scale


def detect(sample: Tensor, model, cfg: Dict[str,any], device: torch.device) -> Tensor:
    bs = cfg['batch_size']
    num_frames, height, width, ch = sample.shape
    imgs, scale = prepare_imgs(sample)

    priorbox = PriorBox(cfg, image_size=(height, width))
    priors = priorbox.forward().to(device)
    scale = scale.to(device)

    detections = []
    for start in range(0, num_frames, bs):
        end = start + bs
        imgs_batch = imgs[start:end] #.to(device)
        with torch.no_grad():
            loc, conf, landms = model(imgs_batch)
        imgs_batch, landms = None, None
        dets = postproc_detections(loc, conf, priors, scale, cfg)
        detections.append(dets)
        loc, conf = None, None
    
    return torch.cat(detections)


def postproc_detections(
        locations: Tensor, confidence: Tensor, priors: Tensor, scale: Tensor, 
        cfg: Dict[str,any], resize=1) -> Tensor:
    boxes = decode_batch(locations, priors, cfg['variance'])
    boxes = boxes * scale / resize
    scores = confidence[:, :, 1]
    num_frames = scores.shape[0]
    dets = [postproc_frame_gpu(boxes[i], scores[i]) 
            for i in range(num_frames)]
    return torch.cat(dets)


def postproc_frame_gpu(
        boxes: Tensor, scores: Tensor, score_thresh=0.75, 
        nms_thresh=0.4, top_k=500, keep_top_k=5) -> Tensor:
    inds = (scores > score_thresh).nonzero()
    if not inds.size(0):
        return torch.empty(1, 0, 5, device=boxes.device, dtype=torch.float32)
    else:
        inds = inds[0]
    boxes = boxes[inds]
    scores = scores[inds]

    # keep top-K before NMS
    scores, idxs = scores.sort(descending=True)
    scores, idxs = scores[:top_k], idxs[:top_k]
    boxes = boxes[idxs]

    # do NMS
    keep = torchvision.ops.nms(boxes, scores, nms_thresh)
    boxes = boxes[keep][:keep_top_k]
    scores = scores[keep][:keep_top_k]
    
    scores = scores.unsqueeze_(1)
    return torch.cat([boxes, scores], dim=1).unsqueeze_(0)

In [9]:
def prepare_data(
        start=0, end=None, 
        num_frames_fake=30, num_frames_real=120,
        use_cpu=False, bs=32, verbose=False,
        base_dir=BASE_DIR, data_dir=DATA_DIR, save_dir=SAVE_DIR):
    df = read_labels(data_dir)
    mkdirs(save_dir, df['dir'].unique())
    
    reader = VideoReader()
    device = torch.device("cpu" if use_cpu else "cuda")
    weights_mnet = os.path.join(base_dir, 'data/weights/mobilenet0.25_Final.pth')
    cfg = {**cfg_mnet, 'batch_size': bs}
    detector = init_detector(cfg, weights_mnet, use_cpu).to(device)
    
    if end is None:
        end = len(df)
        
    files = get_file_list(df, start, end)
    data_iter = build_data_iter(files)
        
    for idx, batch in tqdm(enumerate(data_iter), total=(end-start)):
        meta = df.iloc[idx]
        # fake = bool(meta['label'])
        
        sample_dir = os.path.join(save_dir, meta.dir, meta.name[:-4])
        if not os.path.isdir(sample_dir):
            os.mkdir(sample_dir)
        if verbose:
            t0 = time.time()
            
        num_frames = 30 # num_frames_fake if fake else num_frames_real
        
        images = batch[0]['images'].squeeze(0)
        detections = detect(images, detector, cfg_mnet, device)
        detections = detections.cpu().numpy()
        num_faces = np.array(list(map(len, detections)), dtype=np.uint8)
        max_faces_per_frame = round_num_faces(num_faces, frac_thresh=0.25)
        images = images.cpu().numpy()
    
        for f in range(num_frames):
            for det in detections[f][:max_faces_per_frame]:
                face = crop_face(images[f], det[:4])
                file_path = os.path.join(sample_dir, '%03d.png' % f)
                face = cv2.cvtColor(face, cv2.COLOR_RGB2BGR)
                # cv2.imwrite(file_path, face)
        detections = None
        # gc.collect()
        
        if verbose:
            t1 = time.time()
            print('[%6d][%.02f s] %s' % (idx, t1 - t0, sample_dir))
    print('DONE')

In [10]:
%%time
# base - 1:06
# gpu_nms - 1:04

gc.collect()
prepare_data(start=0, end=100, bs=30, verbose=True)

Loading pretrained model from /home/dmitry/projects/dfdc/data/weights/mobilenet0.25_Final.pth
remove prefix 'module.'
Missing keys:0
Unused checkpoint keys:0
Used keys:300


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[     0][1.01 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/vkketnrfud
[     1][0.47 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/tnsaqegyqt
[     2][0.48 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/jcwkemycdm
[     3][0.48 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/lnpsnoufkq
[     4][0.47 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/rdfdbmyrqm
[     5][0.47 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/otyrbsrkhn
[     6][0.47 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/zmlpmfbryq
[     7][0.47 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/sjhdwvfdbi
[     8][0.47 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/npbreznxbl
[     9][0.47 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/khoogmqdci
[    10][0.48 s] /home/dmitry/projects/dfdc/data/dfdc-crops/dfdc_train_part_22/zewjjvygcr
[    11][0

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 0 and 1 in dimension 1 at /opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/generic/THCTensorMath.cu:71