Skip to content
Permalink
Browse files

fix(preprocessor): fix bug in ffmpeg.py and add more func to helper

  • Loading branch information...
Larryjianfeng committed Aug 2, 2019
1 parent 50a75d9 commit 7c16fb8b221accfbbf0afd77d63d908938ca808a
Showing with 40 additions and 8 deletions.
  1. +39 −1 gnes/preprocessor/helper.py
  2. +1 −7 gnes/preprocessor/video/ffmpeg.py
@@ -18,7 +18,7 @@
import io
import subprocess as sp
from typing import List, Callable

import os
import cv2
import numpy as np
from PIL import Image
@@ -28,6 +28,44 @@
logger = set_logger(__name__, True)


def get_video_length(video_path):
    """Return the duration of a video in seconds, as reported by ``ffmpeg``.

    Runs ``ffmpeg -i <video_path>`` and parses the
    ``Duration: HH:MM:SS.xx,`` field from its output.

    :param video_path: path of the video file handed to ``ffmpeg -i``
    :return: the duration in seconds, as a float
    :raises ValueError: if the ffmpeg output contains no parsable
        ``Duration`` field
    """
    import subprocess
    import re

    # stderr is redirected into stdout, so everything ffmpeg prints is
    # captured in a single stream.
    process = subprocess.Popen(['ffmpeg', '-i', video_path],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    output, _ = process.communicate()
    # Decode instead of str(bytes): the latter yields the "b'...'" repr.
    text = output.decode('utf-8', errors='replace')

    match = re.search(
        r"Duration:\s{1}(?P<hours>\d+?):(?P<minutes>\d+?):(?P<seconds>\d+\.\d+?),",
        text)
    if match is None:
        raise ValueError(
            'no duration found in ffmpeg output for {!r}'.format(video_path))

    h = float(match.group('hours'))
    m = float(match.group('minutes'))
    s = float(match.group('seconds'))

    return 3600 * h + 60 * m + s


def split_mp4_random(video_path, avg_length, max_clip_second=10):
    """Cut a video into random-length clips and re-join them into new files.

    The video is divided into consecutive clips of a random whole number of
    seconds (between 3 and ``max_clip_second``). The clips are dealt
    round-robin into ``max(int(duration / avg_length), 2)`` groups, and each
    non-empty group is concatenated by ``ffmpeg`` into ``<prefix>_<i>.mp4``.
    ``<prefix>`` is the base name of ``video_path`` with ``.mp4`` removed, so
    the outputs are written relative to the current working directory.

    :param video_path: path of the source video
    :param avg_length: target length in seconds used to derive the number of
        output files
    :param max_clip_second: upper bound, in seconds, of a single random clip
    :raises ValueError: if the reported video duration is not positive

    The exit status of ``ffmpeg`` is not checked, so a failed concatenation
    does not raise.
    """
    import random
    import subprocess

    total = get_video_length(video_path)
    if total <= 0:
        raise ValueError(
            'cannot split {!r}: non-positive duration'.format(video_path))
    num_part = max(int(total / avg_length), 2)

    # Draw clip lengths until they cover the whole video; a running total
    # avoids re-summing the list on every iteration.
    durations = []
    covered = 0
    while covered < total:
        clip = random.randint(3, max_clip_second)
        durations.append(clip)
        covered += clip

    # Shrink the last clip so the clips end at the (truncated) video length,
    # and drop it when nothing is left: a 0-second input carries no frames.
    durations[-1] = int(total - (covered - durations[-1]))
    if durations[-1] <= 0:
        durations.pop()

    # Deal the clips round-robin into the output groups as ffmpeg inputs.
    groups = [[] for _ in range(num_part)]
    start = 0
    for idx, length in enumerate(durations):
        groups[idx % num_part].append((start, length))
        start += length

    prefix = os.path.basename(video_path).replace('.mp4', '')
    for part, clips in enumerate(groups):
        if not clips:
            # Fewer clips than groups: "concat=n=0" is not a valid filter.
            continue

        # Argument list instead of a shell string, so paths with spaces or
        # shell metacharacters are passed to ffmpeg verbatim.
        cmd = ['ffmpeg']
        for clip_start, clip_len in clips:
            cmd += ['-ss', str(clip_start), '-t', str(clip_len),
                    '-i', video_path]
        labels = ''.join('[{}]'.format(k) for k in range(len(clips)))
        cmd += ['-filter_complex',
                '{}concat=n={}:v=1:a=1'.format(labels, len(clips)),
                '-strict', '-2',
                '{}_{}.mp4'.format(prefix, part),
                '-y']
        subprocess.run(cmd)


def get_video_frames(buffer_data: bytes, image_format: str = 'cv2',
**kwargs) -> List['np.ndarray']:
ffmpeg_cmd = ['ffmpeg', '-i', '-', '-f', 'image2pipe']
@@ -25,14 +25,12 @@
class FFmpegPreprocessor(BaseVideoPreprocessor):

def __init__(self,
frame_size: str = '192*168',
duplicate_rm: bool = True,
use_phash_weight: bool = False,
phash_thresh: int = 5,
*args,
**kwargs):
super().__init__(*args, **kwargs)
self.frame_size = frame_size
self.phash_thresh = phash_thresh
self.duplicate_rm = duplicate_rm
self.use_phash_weight = use_phash_weight
@@ -45,11 +43,7 @@ def apply(self, doc: 'gnes_pb2.Document') -> None:
# video couldn't be processed from ndarray!
# only bytes can be passed into ffmpeg pipeline
if doc.raw_bytes:
frames = get_video_frames(
doc.raw_bytes,
s=self.frame_size,
vsync=self._ffmpeg_kwargs.get('vsync', 'vfr'),
vf=self._ffmpeg_kwargs.get('vf', 'select=eq(pict_type\\,I)'))
frames = get_video_frames(doc.raw_bytes, **self._ffmpeg_kwargs)

# remove duplicated key frames by phash value
if self.duplicate_rm:

0 comments on commit 7c16fb8

Please sign in to comment.
You can’t perform that action at this time.