In [108]:
import os
import time
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

np.set_printoptions(suppress=True, linewidth=300)

### Organize processed files

In [4]:
# Open dataframe and set filename as index
df = pd.read_pickle('master_dataframe_updated.pkl')
df.set_index('file', inplace=True)

# Get list of all numpy files with faces detected
npy = list(Path('/media/mc/2TBNVMESSD/train_bboxes/').glob('*.npy'))

# Get list of real and fake numpy files
names = [i.with_suffix('.mp4').name for i in npy]
real = []
fake = []
for name in names:
    label = df.loc[name, 'label']
    if label == 'REAL': real.append(name)
    elif label == 'FAKE': fake.append(name)
    else: print('Unknown label: {}')
print(f'Real: {len(real)}, Fake: {len(fake)}')

# Make dataframe of npy info
npy_df = pd.DataFrame(list(zip(names, npy)), columns=['file', 'npy_file'])
npy_df.set_index('file', inplace=True)
for name in names:
    npy_df.loc[name, 'label'] = df.loc[name, 'label']
    data = np.load(npy_df.loc[name, 'npy_file'], allow_pickle=True)
    npy_df.loc[name, 'num_dets'] = np.array([i.shape[0] for i in data]).mean()

# Make dataframe organizing processed files
proc_df = pd.DataFrame(list(zip(real, [[] for i in real])), columns=['real', 'fakes'])
proc_df.set_index('real', inplace=True)
for name in fake:
    original = df.loc[name, 'original']
    if original in proc_df.index:
        proc_df.loc[original, 'fakes'].append(name)

Real: 5119, Fake: 22095


In [29]:
# Get real videos with one detection
files = npy_df[(npy_df['num_dets'] == 1.0) & (npy_df['label'] == 'REAL')].index
print(len(files))

proc_df = proc_df[proc_df.index.isin(files)]
print(len(proc_df))

test_set = proc_df.sample(frac=.2)
print(len(test_set))
train_set = proc_df[~proc_df.index.isin(test_set.index)]
print(len(train_set))
print(len(test_set) + len(train_set))

4463
4463
893
3570
4463


In [48]:
real_train_files = train_set.index
real_test_files = test_set.index
fake_train_files = np.concatenate(train_set.fakes.to_numpy())
fake_test_files = np.concatenate(test_set.fakes.to_numpy())
print(len(real_train_files), len(real_test_files), len(fake_train_files), len(fake_test_files))

3570 893 14991 3528


In [103]:
def get_det_from_video(video, boxes):
    cap = cv2.VideoCapture(video)
    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    assert num_frames == len(boxes)
    frames = []
    for (x1, y1, x2, y2) in boxes:
        ret, frame = cap.read()
        if frame is None: break
        frames.append(frame[y1:y2, x1:x2])
    return np.stack(frames)

def save_frames(frames, out_path):
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    fps = 30
    w = frames.shape[2]
    h = frames.shape[1]
    out = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
    for frame in frames:
        out.write(frame)
    out.release()

def convert_xyxy2xywh(det):
    # numpy array with shape (n, 4)
    x = (det[:, 0] + det[:, 2]) / 2
    y = (det[:, 1] + det[:, 3]) / 2
    w = det[:, 2] - det[:, 0]
    h = det[:, 3] - det[:, 1]
    return np.column_stack((x, y, w, h))

def standardize_wh(det, method='max'):
    # numpy array with shape (n, 4)
    # method is 'max', 'min', 'mean'
    if method == 'max':
        det[:, 2] = det[:, 2].max()
        det[:, 3] = det[:, 3].max()
    elif method == 'min':
        det[:, 2] = det[:, 2].min()
        det[:, 3] = det[:, 3].min()
    elif method == 'mean':
        det[:, 2] = det[:, 2].mean()
        det[:, 3] = det[:, 3].mean()
    return det

def convert_xywh2xyxy(det):
    # numpy array with shape (n, 4)
    x1 = det[:, 0] - (det[:, 2] / 2)
    x2 = det[:, 0] + (det[:, 2] / 2)
    y1 = det[:, 1] - (det[:, 3] / 2)
    y2 = det[:, 1] + (det[:, 3] / 2)
    return np.column_stack((x1, y1, x2, y2))

def standardize_boxes(det, method='max'):
    det = convert_xyxy2xywh(det.astype(int))
    det = standardize_wh(det.astype(int), method)
    det = convert_xywh2xyxy(det.astype(int))
    return det

In [104]:
def save_extracted_faces(file, out_dir):
    out_path = str(Path(out_dir) / Path(file))
    video = df.loc[file, 'filepath']
    boxes = np.load(npy_df.loc[file, 'npy_file'], allow_pickle=True)
    boxes = convert_xywh2xyxy(standardize_wh(convert_xyxy2xywh(boxes.squeeze().astype(int)).astype(int)).astype(int)).astype(int)
    frames = get_det_from_video(video, boxes)
    save_frames(frames, out_path)

In [107]:
out_dir = '/media/mc/2TBNVMESSD/extracted_faces/test_set/real/'
for file in real_test_files:
    if not os.path.exists(str(Path(out_dir) / Path(file))):
        try:
            save_extracted_faces(file, out_dir)
        except:
            print(f'Error with {file}')

Error with vgeowvmdic.mp4
Error with paorozwtbf.mp4
Error with guyunvsdsx.mp4
Error with uqrwgslayc.mp4
Error with vnzfkjxkhr.mp4
Error with elxhrfxiqr.mp4
Error with ajbxbxcrdo.mp4
Error with gsufmjyjfd.mp4
Error with zvvjlkjvyi.mp4
Error with rlhmppiklf.mp4
Error with rtphoescwz.mp4
Error with fuegzrizzn.mp4


In [109]:
out_dir = '/media/mc/2TBNVMESSD/extracted_faces/test_set/fake/'
start = time.time()
for i, file in enumerate(fake_test_files):
    if i % 100 == 0:
        elapsed = time.time() - start
        print(f'{i}/{len(fake_test_files)} Elapsed: {elapsed:.2f}s')
        start = time.time()
    if not os.path.exists(str(Path(out_dir) / Path(file))):
        try:
            save_extracted_faces(file, out_dir)
        except:
            print(f'Error with {file}')

0/3528 Elapsed: 0.00s
Error with wzsghovwjh.mp4
Error with ytyfspbsul.mp4
100/3528 Elapsed: 85.74s
Error with xutjriqmyn.mp4
200/3528 Elapsed: 88.40s
Error with bbdwfeznwk.mp4
Error with rjuddimxce.mp4
300/3528 Elapsed: 93.28s
400/3528 Elapsed: 91.71s
Error with qumwfrunqq.mp4
Error with yarodrgcoe.mp4
Error with dlkifxjpem.mp4
500/3528 Elapsed: 91.28s
Error with nccbctegue.mp4
Error with euffkttktu.mp4
Error with pptcuafaqt.mp4
Error with dndvdfwmme.mp4
600/3528 Elapsed: 90.63s
Error with nlaevfisim.mp4
Error with lmtswyedyz.mp4
Error with sqfwamivwy.mp4
Error with ybcqxbweww.mp4
Error with vpsxodalkn.mp4
Error with dlhbkziruq.mp4
Error with dtusmifhor.mp4
Error with avhyjwzrdp.mp4
700/3528 Elapsed: 81.88s
Error with shopbbapgj.mp4
800/3528 Elapsed: 96.61s
Error with lpzuerojbb.mp4
Error with vhwshcljrl.mp4
Error with xdydyjggsb.mp4
Error with okluiruxjy.mp4
900/3528 Elapsed: 90.13s
1000/3528 Elapsed: 92.89s
1100/3528 Elapsed: 94.21s
Error with tcusewmlgy.mp4
1200/3528 Elapsed: 95.74s

In [110]:
out_dir = '/media/mc/2TBNVMESSD/extracted_faces/train_set/real/'
start = time.time()
for i, file in enumerate(real_train_files):
    if i % 100 == 0:
        elapsed = time.time() - start
        print(f'{i}/{len(fake_test_files)} Elapsed: {elapsed:.2f}s')
        start = time.time()
    if not os.path.exists(str(Path(out_dir) / Path(file))):
        try:
            save_extracted_faces(file, out_dir)
        except:
            print(f'Error with {file}')

out_dir = '/media/mc/2TBNVMESSD/extracted_faces/train_set/fake/'
start = time.time()
for i, file in enumerate(fake_train_files):
    if i % 100 == 0:
        elapsed = time.time() - start
        print(f'{i}/{len(fake_test_files)} Elapsed: {elapsed:.2f}s')
        start = time.time()
    if not os.path.exists(str(Path(out_dir) / Path(file))):
        try:
            save_extracted_faces(file, out_dir)
        except:
            print(f'Error with {file}')

0/3528 Elapsed: 0.00s
Error with ziwrbqsqty.mp4
Error with mkjrjuxgnw.mp4
100/3528 Elapsed: 91.64s
Error with kmscrlfgku.mp4
Error with hjpofokerg.mp4
Error with qbruydwzcc.mp4
200/3528 Elapsed: 91.95s
Error with psbaqedyql.mp4
300/3528 Elapsed: 95.69s
Error with mduqxrbmjd.mp4
Error with axvvrrujab.mp4
Error with hpxykzhgtf.mp4
400/3528 Elapsed: 95.04s
Error with lzpkfgnqhk.mp4
500/3528 Elapsed: 93.77s
600/3528 Elapsed: 90.42s
700/3528 Elapsed: 93.03s
Error with vwrrieyzck.mp4
800/3528 Elapsed: 91.36s
900/3528 Elapsed: 92.86s
Error with egxbehzmza.mp4
Error with fduhwlydlg.mp4
1000/3528 Elapsed: 90.44s
Error with umhwtpsifb.mp4
1100/3528 Elapsed: 91.74s
Error with dbgvpwesbv.mp4
Error with rnwrvccbns.mp4
1200/3528 Elapsed: 92.80s
Error with pitpjawwwp.mp4
1300/3528 Elapsed: 92.46s
1400/3528 Elapsed: 93.18s
1500/3528 Elapsed: 92.30s
1600/3528 Elapsed: 91.89s
Error with yhbhhibwzo.mp4
1700/3528 Elapsed: 91.49s
Error with cjwyfughav.mp4
1800/3528 Elapsed: 93.07s
1900/3528 Elapsed: 94.69s