In [2]:
import argparse
import os
import sys
from PIL import Image, PngImagePlugin
import glob
import numpy as np
np.set_printoptions(threshold=25)
import scipy.fftpack
import pandas as pd
import time
import h5py
# Wall-clock timestamp taken when this cell runs. It appears unused in the
# visible cells (test_writing_speed uses its own local `start`).
start = time.time()

# Perceptual-hash settings; the same values are the default arguments of the
# hashing/writing functions defined further down.
hash_size = 15
high_freq_factor = 4

# Glob pattern matching every entry in the frames directory of film 1.
# NOTE(review): absolute, machine-specific path -- consider deriving it from a
# configurable data directory so the notebook runs on other machines.
wd_frames = '/home/emsala/Documenten/Studie/These/phashing/dataset/films/film_1/frames/*'
# glob returns paths in arbitrary order; sorting makes the frame order
# deterministic (lexicographic by path).
frames = sorted(glob.glob(wd_frames))


In [4]:
def get_trailer_frames(frames, shot_ranges=None):
    """Return the frame paths whose file name falls inside the selected shots.

    A path is kept when its base name equals ``frame<NNNN>.jpg``, where
    ``<NNNN>`` is a frame number from one of the shot intervals, left-padded
    with zeros to at least four digits.

    Parameters
    ----------
    frames : iterable of str
        Paths of the extracted film frames.
    shot_ranges : iterable of (int, int), optional
        ``(start, stop)`` frame-number intervals, start inclusive and stop
        exclusive (same convention as ``range``). Defaults to the five
        hard-coded shot intervals this notebook was written for.

    Returns
    -------
    list of str
        The matching entries of ``frames``, in their original order.
    """
    if shot_ranges is None:
        shot_ranges = (
            (840, 1508),
            (1528, 1911),
            (2611, 2674),
            (3280, 3461),
            (4453, 4612),
        )

    # A set makes each membership test O(1); with a list every film frame was
    # compared against every wanted file name.
    wanted_names = {
        'frame{}.jpg'.format(str(frame_nr).zfill(4))
        for first, stop in shot_ranges
        for frame_nr in range(first, stop)
    }

    # os.path.basename copes with the platform's path separator instead of
    # assuming '/'.
    return [frame for frame in frames if os.path.basename(frame) in wanted_names]

In [5]:
def get_hash(frame_path, hash_size, high_freq_factor):
    """Compute a DCT-based perceptual hash of one image file.

    The image is converted to grayscale, resized to a square of side
    ``hash_size * high_freq_factor`` pixels and transformed with a 2-D DCT.
    The top-left (low-frequency) ``hash_size x hash_size`` block of
    coefficients is then thresholded at its median.

    Parameters
    ----------
    frame_path : str
        Path of an image file that PIL can open.
    hash_size : int
        Side length of the DCT block that is kept.
    high_freq_factor : int
        Factor by which the resized image is larger than ``hash_size``.

    Returns
    -------
    list of int
        The thresholded block flattened row by row: 1 where a coefficient is
        strictly greater than the median, otherwise 0 (normally
        ``hash_size ** 2`` entries).
    """
    img_size = hash_size * high_freq_factor

    # The context manager releases the file handle as soon as the pixels have
    # been read instead of leaving that to the garbage collector.
    with Image.open(frame_path) as im:
        # Image.LANCZOS is the same filter as the former Image.ANTIALIAS
        # alias, which no longer exists in Pillow >= 10.
        image = im.convert("RGB").convert("L").resize((img_size, img_size), Image.LANCZOS)
        pixels = np.asarray(image)

    # Separable 2-D DCT: first along the rows axis, then along the columns.
    dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
    dctlowfreq = dct[:hash_size, :hash_size]
    med = np.median(dctlowfreq)
    return (dctlowfreq > med).astype(int).ravel().tolist()

In [2]:
def write_elementwise(frame_paths, size_hash, hash_size = 15, high_freq_factor = 4, name = ''):
    """Hash every frame and write each hash straight into an HDF5 dataset.

    The hashes go into a gzip-compressed dataset called ``"phashes"`` of shape
    ``(len(frame_paths), size_hash)`` inside ``./phashes_<name>.hdf5``; row
    ``i`` holds the hash of ``frame_paths[i]``.

    Parameters
    ----------
    frame_paths : sequence of str
        Paths of the images to hash.
    size_hash : int
        Number of entries in one hash (the row width of the dataset).
    hash_size, high_freq_factor : int
        Forwarded unchanged to ``get_hash``.
    name : str
        Suffix used in the output file name.
    """
    n_frames = len(frame_paths)

    # `with` guarantees the file is flushed and closed, also when hashing
    # raises part-way through; previously the handle was never closed.
    with h5py.File("./phashes{}.hdf5".format('_'+name), "a") as hdf5_store:
        # The file is opened in append mode, so an earlier run may have left a
        # "phashes" dataset behind. create_dataset refuses to reuse a name,
        # so drop the old one to keep the function re-runnable.
        if "phashes" in hdf5_store:
            del hdf5_store["phashes"]
        phashes = hdf5_store.create_dataset("phashes", (n_frames, size_hash), compression="gzip")

        for i, frame_path in enumerate(frame_paths):
            phashes[i] = get_hash(frame_path, hash_size, high_freq_factor)
        
def write_batchwise(frame_paths, size_hash, hash_size = 15, high_freq_factor = 4, name = ''):
    """Hash every frame in memory, then write all hashes to HDF5 in one go.

    The hashes are collected in a NumPy array of shape
    ``(len(frame_paths), size_hash)`` and stored as the gzip-compressed
    dataset ``"phashes"`` inside ``./phashes_<name>.hdf5``; row ``i`` holds
    the hash of ``frame_paths[i]``.

    Parameters
    ----------
    frame_paths : sequence of str
        Paths of the images to hash.
    size_hash : int
        Number of entries in one hash (the row width of the array).
    hash_size, high_freq_factor : int
        Forwarded unchanged to ``get_hash``.
    name : str
        Suffix used in the output file name.
    """
    n_frames = len(frame_paths)
    phashes = np.zeros((n_frames, size_hash))

    for i, frame_path in enumerate(frame_paths):
        phashes[i] = get_hash(frame_path, hash_size, high_freq_factor)

    # `with` guarantees the file is flushed and closed; previously the handle
    # was never closed.
    with h5py.File("./phashes{}.hdf5".format('_'+name), "a") as hdf5_store:
        # Append mode keeps whatever an earlier run stored, and create_dataset
        # refuses to reuse a name, so drop a stale "phashes" dataset first.
        if "phashes" in hdf5_store:
            del hdf5_store["phashes"]
        hdf5_store.create_dataset("phashes", data = phashes, compression="gzip")

def test_writing_speed(frame_paths=None, hash_size=15, high_freq_factor=4):
    """Time write_elementwise against write_batchwise on the same frames.

    Parameters
    ----------
    frame_paths : sequence of str, optional
        Frames to hash. Defaults to the notebook-level ``trailer_frames``.
    hash_size, high_freq_factor : int
        Hash settings forwarded to both writers.

    The two writers get different file-name suffixes so that neither run
    trips over the dataset created by the other. Elapsed wall-clock times are
    printed; nothing is returned.
    """
    if frame_paths is None:
        # Backward-compatible default: the global list the original used.
        frame_paths = trailer_frames

    # Take the row width from a real hash rather than a hard-coded number, so
    # it always matches what get_hash produces for these settings.
    size_hash = len(get_hash(frame_paths[0], hash_size, high_freq_factor))

    start = time.time()
    write_elementwise(frame_paths, size_hash, hash_size, high_freq_factor, name='speedtest_elementwise')
    end = time.time()
    print('time for elementwise writing', end-start)
    # recorded in an earlier run:
    #time for elementwise writing 42.066429138183594

    start = time.time()
    write_batchwise(frame_paths, size_hash, hash_size, high_freq_factor, name='speedtest_batchwise')
    end = time.time()
    print('time for batch writing', end-start)
    # recorded in an earlier run:
    # time for batch writing 38.23404669761658



In [32]:
def write_hashes_to_hdf5(frame_paths, hash_size = 15, high_freq_factor = 4, name =  ''):
    """Hash all frames and store the hashes via ``write_elementwise``.

    The row width of the stored dataset is not passed in by the caller; it is
    measured by hashing the first frame once up front.

    Parameters
    ----------
    frame_paths : sequence of str
        Paths of the images to hash; must contain at least one entry.
    hash_size, high_freq_factor : int
        Hash settings forwarded to ``get_hash`` and ``write_elementwise``.
    name : str
        Suffix forwarded to ``write_elementwise`` for the output file name.
    """
    first_hash = get_hash(frame_paths[0], hash_size, high_freq_factor)
    write_elementwise(
        frame_paths,
        len(first_hash),
        hash_size=hash_size,
        high_freq_factor=high_freq_factor,
        name=name,
    )

In [15]:
# Build the trailer subset explicitly so this cell also works on a fresh
# kernel; no other visible cell assigns `trailer_frames`.
trailer_frames = get_trailer_frames(frames)

# One HDF5 file per frame set: ./phashes_trailer.hdf5 and ./phashes_movie.hdf5
write_hashes_to_hdf5(trailer_frames, name = 'trailer')
write_hashes_to_hdf5(frames, name = 'movie')

time for elementwise writing 43.02076053619385
