# Results Summary
Implemented the algorithm by Allison Deal (https://github.com/allisonnicoledeal/VideoSync)

The algorithm's output, confirmed by visual inspection, gives the following delays relative to the last clip listed (the +0 reference):

+43.1 s => yt5s.com-Highway Tune-Greta Van Fleet-Red Rocks.wav

+5.18 s => yt5s.com-Greta van Fleet at Red Rocks - Highway Tune.wav

+0 s => yt5s.com-Greta Van Fleet - Highway Tune - 09-23-2019 Red Rocks.wav

In [20]:

# --- Configuration ---------------------------------------------------------
TEST_FILE_DIR = 'test-clips/greta-van-fleet'  # directory scanned for clips
VIDEO_TYPES = ['*.mp4']                       # fnmatch patterns that select video files
AUDIO_TYPE = '.wav'                           # extension of the extracted audio files
TMP_DIR = 'tmp'                               # cache directory for extracted audio
os.makedirs(TMP_DIR, exist_ok=True)

# --- Load every matching clip and its audio track --------------------------
videos = []  # clip objects returned by mp.VideoFileClip, one per matching file
audios = []  # one dict per clip: 'data', 'data_float', 'rate', 'path'

# NOTE(review): os.listdir() returns entries in arbitrary order, while later
# cells index `audios` by position -- confirm the order before comparing clips.
for fn in os.listdir(TEST_FILE_DIR):
    if not any(fnmatch.fnmatch(fn, pattern) for pattern in VIDEO_TYPES):
        continue
    print(f'loading {fn}')

    full_path = os.path.join(TEST_FILE_DIR, fn)
    clip = mp.VideoFileClip(full_path)
    videos.append(clip)

    # Extract the audio track to a WAV file once; an existing file is reused as-is.
    audio_fn = os.path.splitext(fn)[0] + AUDIO_TYPE
    audio_path = os.path.join(TMP_DIR, audio_fn)
    if os.path.exists(audio_path):
        print('Audio File already saved')
    else:
        print('Saving to temp path')
        clip.audio.write_audiofile(audio_path)

    # Read the audio back. A mono file yields a 1-D array and a multi-channel
    # file a 2-D (samples, channels) array; only the first channel is kept.
    rate, data = scipy.io.wavfile.read(audio_path)
    if data.ndim == 1:
        ch1 = data
    else:
        # Copy so the full multi-channel buffer is not kept alive by a view.
        ch1 = data[:, 0].copy()

    # Float copy scaled to [-1, 1) for signed-integer PCM (int16 -> / 2**15).
    # Other dtypes (float WAVs, unsigned 8-bit) are only cast, not rescaled.
    ch1_float = ch1.astype(np.float32)
    if np.issubdtype(ch1.dtype, np.signedinteger):
        full_scale = float(2 ** (8 * ch1.dtype.itemsize - 1))
        ch1_float = ch1_float / full_scale

    audios.append({
        'data': ch1,
        'data_float': ch1_float,
        'rate': rate,
        'path': audio_path
    })

loading yt5s.com-Highway Tune-Greta Van Fleet-Red Rocks.mp4
Audio File already saved
loading yt5s.com-Greta Van Fleet - Highway Tune - 09-23-2019 Red Rocks.mp4
Audio File already saved
loading yt5s.com-Greta van Fleet at Red Rocks - Highway Tune.mp4
Audio File already saved


In [5]:
def make_horiz_bins(data, fft_bin_size, overlap, box_height):
    """Build spectrogram points grouped into horizontal frequency bands.

    A window of ``fft_bin_size`` samples is taken at offset 0 and then at every
    multiple of ``int(fft_bin_size - overlap)``. Each full-length window is
    passed to ``fourier`` and every resulting value is stored as a tuple
    ``(intensity, x, y)``, where ``x`` is the window index and ``y`` is the
    index within the spectrum.

    Args:
        data: Sliceable sequence of audio samples.
        fft_bin_size: Number of samples per window.
        overlap: Number of samples shared by consecutive windows.
        box_height: Number of consecutive spectrum indices per band.

    Returns:
        dict mapping band index ``int(y / box_height)`` to a list of
        ``(intensity, x, y)`` tuples in the order they were produced.
    """
    horiz_bins = {}

    def add_window(start, x_index):
        # Windows that run past the end of `data` are too short and are skipped.
        window = data[start:start + fft_bin_size]
        if len(window) != fft_bin_size:
            return
        for y_index, intensity in enumerate(fourier(window)):
            band = int(y_index / box_height)
            horiz_bins.setdefault(band, []).append((intensity, x_index, y_index))

    # The first window always starts at sample 0 with x index 0.
    add_window(0, 0)

    # Remaining windows: the x index advances once per hop position, even for
    # a trailing window that is skipped for being too short.
    hop = int(fft_bin_size - overlap)
    for x_index, start in enumerate(range(hop, len(data), hop), start=1):
        add_window(start, x_index)

    return horiz_bins


def fourier(sample):
    """Return the magnitudes of the real-input FFT of ``sample``."""
    spectrum = np.fft.rfft(sample)
    return np.abs(spectrum)

In [7]:
def make_vert_bins(horiz_bins, box_width):
    """Split each horizontal band into boxes of ``box_width`` window columns.

    Args:
        horiz_bins: dict mapping a band index to a list of
            ``(intensity, x, y)`` tuples.
        box_width: Number of consecutive x indices per box.

    Returns:
        dict mapping ``(box_x, band)`` to the list of tuples from that band
        whose ``int(x / box_width)`` equals ``box_x``, in their original order.
    """
    boxes = {}
    for band, points in horiz_bins.items():
        for point in points:
            column = int(point[1] / box_width)
            boxes.setdefault((column, band), []).append(point)
    return boxes

In [11]:
def find_bin_max(boxes, maxes_per_box):
    """Keep the strongest points of every box and index their times by frequency.

    For each box, up to ``maxes_per_box`` points with the highest intensity are
    retained. A later point replaces the current weakest retained point only
    when its intensity is strictly greater.

    The selection starts from an empty list rather than a placeholder point, so
    the result contains only points that are actually present in ``boxes``,
    including when a box holds fewer than ``maxes_per_box`` points or when all
    intensities are small.

    Args:
        boxes: dict mapping a box key to a list of ``(intensity, x, y)`` tuples.
        maxes_per_box: Maximum number of points kept per box. Values below 1
            yield an empty result.

    Returns:
        dict mapping each frequency index ``y`` to the list of window indices
        ``x`` of the retained points having that ``y``.
    """
    freqs_dict = {}
    if maxes_per_box < 1:
        return freqs_dict

    for points in boxes.values():
        strongest = []
        for point in points:
            if len(strongest) < maxes_per_box:
                # Still room in this box: keep the point unconditionally.
                strongest.append(point)
                continue
            weakest = min(strongest)  # tuples compare by intensity first
            if point[0] > weakest[0]:
                strongest.remove(weakest)
                strongest.append(point)

        for point in strongest:
            freqs_dict.setdefault(point[2], []).append(point[1])

    return freqs_dict

In [13]:
def find_freq_pairs(freqs_dict_orig, freqs_dict_sample):
    """Pair up the times at which the two recordings share a frequency index.

    Args:
        freqs_dict_orig: dict mapping a frequency index to a list of times
            for the base recording.
        freqs_dict_sample: dict with the same layout for the other recording.

    Returns:
        list of ``(sample_time, orig_time)`` tuples: for every frequency index
        present in both dicts, every combination of a time from the sample
        with a time from the base recording.
    """
    return [
        (sample_time, orig_time)
        for freq, sample_times in freqs_dict_sample.items()
        if freq in freqs_dict_orig
        for sample_time in sample_times
        for orig_time in freqs_dict_orig[freq]
    ]

In [18]:
def find_delay(time_pairs):
    """Return the most frequent time difference among ``time_pairs``.

    Args:
        time_pairs: sequence of pairs; the difference is taken as
            ``pair[0] - pair[1]``.

    Returns:
        The difference that occurs most often. When several differences tie
        for the highest count, the one first seen latest in ``time_pairs``
        is returned.

    Raises:
        IndexError: if ``time_pairs`` is empty.
    """
    votes = {}
    for pair in time_pairs:
        offset = pair[0] - pair[1]
        votes[offset] = votes.get(offset, 0) + 1

    # Stable ascending sort by vote count; the winner ends up last.
    ranked_offsets = sorted(votes, key=votes.get)
    return ranked_offsets[-1]

In [49]:
# Build the banded spectrogram points for two of the loaded recordings, using
# 1024-sample windows, no overlap, and bands of 512 spectrum indices.
# The two commented-out calls are an alternative that runs on truncated
# prefixes (44100*120 and 44100*60 samples) of audios[0] and audios[1].
# h1 = make_horiz_bins(audios[0]['data'][:44100*120], 1024, 0, 512)
# h2 = make_horiz_bins(audios[1]['data'][:44100*60], 1024, 0, 512)
h1 = make_horiz_bins(audios[1]['data'], 1024, 0, 512)
h2 = make_horiz_bins(audios[2]['data'], 1024, 0, 512)


2
dict_keys([0, 1])


In [54]:

# Number of (intensity, x, y) points that landed in band 1 of h1.
len(h1[1])

11372

In [42]:
# Split each band into boxes that are 43 window-columns wide.
# presumably 43 was chosen so a box spans about one second
# (44100 / 1024 ≈ 43 windows) — TODO confirm
v1 = make_vert_bins(h1, 43)
v2 = make_vert_bins(h2, 43)

In [52]:
# Inspect the (box_x, band) keys produced by make_vert_bins for the first recording.
v1.keys()

dict_keys([(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0), (18, 0), (19, 0), (20, 0), (21, 0), (22, 0), (23, 0), (24, 0), (25, 0), (26, 0), (27, 0), (28, 0), (29, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0), (35, 0), (36, 0), (37, 0), (38, 0), (39, 0), (40, 0), (41, 0), (42, 0), (43, 0), (44, 0), (45, 0), (46, 0), (47, 0), (48, 0), (49, 0), (50, 0), (51, 0), (52, 0), (53, 0), (54, 0), (55, 0), (56, 0), (57, 0), (58, 0), (59, 0), (60, 0), (61, 0), (62, 0), (63, 0), (64, 0), (65, 0), (66, 0), (67, 0), (68, 0), (69, 0), (70, 0), (71, 0), (72, 0), (73, 0), (74, 0), (75, 0), (76, 0), (77, 0), (78, 0), (79, 0), (80, 0), (81, 0), (82, 0), (83, 0), (84, 0), (85, 0), (86, 0), (87, 0), (88, 0), (89, 0), (90, 0), (91, 0), (92, 0), (93, 0), (94, 0), (95, 0), (96, 0), (97, 0), (98, 0), (99, 0), (100, 0), (101, 0), (102, 0), (103, 0), (104, 0), (105, 0), (106, 0), (107, 0), (108, 0), (109, 0),

In [43]:
# Keep up to 7 of the strongest points in every box; the result maps each
# frequency index to the window indices at which a retained point occurs.
max1 = find_bin_max(v1, 7)
max2 = find_bin_max(v2, 7)

In [61]:
# Number of distinct frequency indices that have at least one retained point.
len(max1)

35

In [44]:
# max1 is passed as the base ("orig") recording and max2 as the sample, so each
# pair is (time in the second recording, time in the first recording).
pairs = find_freq_pairs(max1, max2)

In [57]:
# Total number of candidate time pairs: every combination of times that share
# a frequency index is included, so this count can be very large.
print(len(pairs))

4471776


In [45]:
# Convert the winning offset from window indices to seconds.
# NOTE(review): the sample rate is hard-coded here rather than taken from the
# 'rate' stored with each entry of `audios` — confirm the files are 44100 Hz.
rate = 44100
fft_bin_size = 1024  # must match the window size passed to make_horiz_bins above

# Most common (second recording - first recording) offset, in window indices.
delay = find_delay(pairs)
# Despite the name, this is the number of FFT windows per second (no overlap).
samples_per_sec = float(rate) / float(fft_bin_size)
seconds= round(float(delay) / float(samples_per_sec), 4)

print(delay, samples_per_sec, seconds)

223 43.06640625 5.178


In [47]:
def get_delay(src1, src2, rate, fft_bin_size=1024, overlap=0, box_height=512,
              box_width=43, maxes_per_box=7):
    """Estimate the time offset between two recordings of the same audio.

    Each source is turned into a set of spectral peaks (``make_horiz_bins`` ->
    ``make_vert_bins`` -> ``find_bin_max``). Peak times that share a frequency
    index are paired, and the most common time difference is converted from
    window indices to seconds.

    ``fft_bin_size`` is used both to build the spectrogram and to convert the
    result to seconds, so the two always agree.

    Args:
        src1: Samples of the first (base) recording.
        src2: Samples of the second recording.
        rate: Sample rate, in samples per second, used for the conversion.
        fft_bin_size: Number of samples per FFT window.
        overlap: Number of samples shared by consecutive windows.
        box_height: Spectrum indices per box.
        box_width: Window columns per box.
        maxes_per_box: Maximum number of peaks kept per box.

    Returns:
        The offset in seconds, rounded to 4 decimals. It is computed as
        (time in ``src2``) - (time in ``src1``), so a positive value means the
        shared audio occurs later in ``src2`` than in ``src1``.

    Raises:
        IndexError: propagated from ``find_delay`` when the two sources share
            no peak frequency.
    """
    def fingerprint(samples):
        # Same pipeline and parameters for both sources.
        horiz = make_horiz_bins(samples, fft_bin_size, overlap, box_height)
        boxes = make_vert_bins(horiz, box_width)
        return find_bin_max(boxes, maxes_per_box)

    peaks1 = fingerprint(src1)
    peaks2 = fingerprint(src2)

    pairs = find_freq_pairs(peaks1, peaks2)
    delay_windows = find_delay(pairs)

    # Consecutive windows start `hop` samples apart (same hop as make_horiz_bins).
    hop = int(fft_bin_size - overlap)
    windows_per_sec = float(rate) / float(hop)
    seconds = round(float(delay_windows) / windows_per_sec, 4)

    return seconds

In [48]:
# Time one end-to-end alignment of the first two loaded recordings.
# NOTE(review): this import sits mid-notebook; notebook convention is to keep
# all imports in a single cell near the top.
import datetime as dt

start_time = dt.datetime.now()
# Prints the offset in seconds (audios[1] relative to audios[0]).
print(get_delay(audios[0]['data'], audios[1]['data'], audios[0]['rate']))
print(f'Total Time: {(dt.datetime.now() - start_time).total_seconds()}')

-43.0962
Total Time: 20.782562
