# Preparation

In [1]:
import os
import h5py
from tqdm.notebook import tqdm
from typing import Union

import io
from PIL import Image
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import cv2

## Helper functions

In [11]:
def forever_run():
    '''
    Prevent Colab from disconnecting
    '''
    while True:
        pass

def mkdir(path):
    '''
    Check if the destination folder exists
    If not, create one

        Args:
            path: path of the folder
    '''
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)
        print('-----Folder created-----')
    else:
        print('-----Folder exists-----')


def extract_frames(filename: str, file_dst: str, frame_per_second: int, start_time: Union[list, int]=None, end_time: Union[list, int]=None):
    """
    Extract the frames from video

    Args:
        cat_code: the category code of the video
        filename: the name of video
        file_dst: the path of destination folder
        frame_per_second: extract frames per second
        start_time: the list of start time
        end_time: the list of end time
    
    Returns:
        None, create an image for each selected frame

    Raises:
        ValueError: Error! length of start time list must be equal to length of end time list
        ValueError: Error! time in end time list exceeds total seconds
        ValueError: Error! start time is bigger than end time
    """
    cap = cv2.VideoCapture(filename)
    if not cap.isOpened():
        return

    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    total_seconds = total_frames / fps
    
    if isinstance(start_time, list) and isinstance(end_time, list) and len(start_time) != len(end_time):
        raise ValueError("Error! length of start time list must be equal to length of end time list")
    
    if isinstance(start_time, int):
        start_time = [start_time]
    if isinstance(end_time, int):
        end_time = [end_time]
    if not start_time:
        start_time = [0]
    if not end_time:
        end_time = [total_seconds]
    
    check_list = [start <= end for start, end in zip(start_time, end_time)]
    if max(end_time) > total_seconds:
        raise ValueError("Error! time in end time list exceeds total seconds")
    elif not min(check_list):
        raise ValueError("Error! start time is bigger than end time")

    # Initialization
    frameCount = 0
    
    start_frame_list = [round(time * fps) for time in start_time]
    end_frame_list = [round(time * fps) for time in end_time]
    duration_of_frame = round(fps / frame_per_second)

    print("Original FPS: ", fps)
    print("Extracted FPS: ", frame_per_second)
    print("Number of frames: ", total_frames)
    print("Total seconds: ", total_seconds)
    print("Extracting...")

    mkdir(file_dst)
    # Extract frames from file
    for start, end in zip(start_frame_list, end_frame_list):
        cap.set(cv2.CAP_PROP_POS_FRAMES, start - 1)
        pbar = tqdm(total=end-start)
        while start < end:
            ret, image = cap.read()
            if not ret:
                break
            cv2.imwrite(file_dst + ("/{:0>6d}".format(frameCount)) + '.jpg', image)
            start += duration_of_frame
            pbar.update(duration_of_frame)
            cap.set(cv2.CAP_PROP_POS_FRAMES, start - 1)
            frameCount += 1
        pbar.close()
    print("Finish")
    print()


def build_h5file(h5file_path, image_folder):
    print("Image folder: ", image_folder)

    fileNames = os.listdir(image_folder)
    pbar = tqdm(total=len(fileNames), desc="Progress", ncols=700) 
    for index, image_id in enumerate(fileNames):
    
        img_path = os.path.join(image_folder, image_id)
        binary_data = open(img_path, 'rb').read()     # read the image as python binary ('rb': read binary)
        binary_data_np = np.asarray(binary_data)

        try:
            with h5py.File(h5file_path, 'a') as hf:
                hf.create_dataset(name = image_id, data = binary_data_np)
            pbar.update(1)
        except:
            print('Failed to load ',img_path)
            pbar.update(1)
    hf.close()
def get_frame_index(filename: str, frame_per_second: int, start_time: Union[list, int]=None, end_time: Union[list, int]=None):
    cap = cv2.VideoCapture(filename)
    if not cap.isOpened():
        return

    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    total_seconds = total_frames / fps
    
    if isinstance(start_time, list) and isinstance(end_time, list) and len(start_time) != len(end_time):
        raise ValueError("Error! length of start time list must be equal to length of end time list")
    
    if isinstance(start_time, int):
        start_time = [start_time]
    if isinstance(end_time, int):
        end_time = [end_time]
    if not start_time:
        start_time = [0]
    if not end_time:
        end_time = [total_seconds]
    
    check_list = [start <= end for start, end in zip(start_time, end_time)]
    if max(end_time) > total_seconds:
        raise ValueError("Error! time in end time list exceeds total seconds")
    elif not min(check_list):
        raise ValueError("Error! start time is bigger than end time")
    
    start_frame_list = [round(time * fps) for time in start_time]
    end_frame_list = [round(time * fps) for time in end_time]
    duration_of_frame = round(fps / frame_per_second)

    print("Original FPS: ", fps)
    print("Extracted FPS: ", frame_per_second)
    print("Number of frames: ", total_frames)
    print("Total seconds: ", total_seconds)
    
    frame_indexs = np.array([], dtype=np.int)
    for start, end in zip(start_frame_list, end_frame_list):
        frame_indexs = np.append(frame_indexs, np.arange(start, end, duration_of_frame, np.int))
    return frame_indexs

In [63]:
def normalize_frame_summary_indices(base_folder, file_paths, mode='min', dataset='BBC'):
    for file_path in file_paths:
        print(file_path)
        with h5py.File(base_folder + "/" + file_path, 'r') as f:
            frame_indices = f['frame_indices'][()]
            if dataset is 'BBC':
                annotators = f['labels'].items()
            else:
                labels = f['labels'][()]
                print(labels)
        # print(frame_indices)
        if dataset is 'BBC':
            for name, labels in annotators:
                result = []
                for label in labels:
                    start, end = label
                    tmp1 = np.abs(frame_indices - start)
                    tmp2 = np.abs(frame_indices - end)
                    if mode is 'min':
                        start = np.min(tmp1)
                        end = np.min(tmp2)
                    elif mode is 'in_range':
                        index = np.argmin(tmp1)
                        start = frame_indices[index] >= start if frame_indices[index] else frame_indices[index+1]
                        index = np.argmin(tmp2)
                        end = frame_indices[index] <= end if frame_indices[index] else frame_indices[index-1]
                    elif mode is 'out_range':
                        index = np.argmin(tmp1)
                        start = frame_indices[index] <= start if frame_indices[index] else frame_indices[index-1]
                        index = np.argmin(tmp2)
                        end = frame_indices[index] >= end if frame_indices[index] else frame_indices[index+1]
                    result.append((start, end))
                annotators[name] = np.array(result)
            result = annotators
        else:
            result = []
            for label in labels:
                start, end = label
                tmp1 = np.abs(frame_indices - start)
                tmp2 = np.abs(frame_indices - end)
                if mode is 'min':
                    start = frame_indices[np.argmin(tmp1)]
                    end = frame_indices[np.argmin(tmp2)]
                elif mode is 'in_range':
                    index = np.argmin(tmp1)
                    print(frame_indices, frame_indices[index], index)
                    start = frame_indices[index] if frame_indices[index] >= start else frame_indices[index+1]
                    index = np.argmin(tmp2)
                    end = frame_indices[index] if frame_indices[index] <= end else frame_indices[index-1]
                elif mode is 'out_range':
                    index = np.argmin(tmp1)
                    start = frame_indices[index] if frame_indices[index] <= start else frame_indices[index-1]
                    index = np.argmin(tmp2)
                    end = frame_indices[index] if frame_indices[index] >= end else frame_indices[index+1]
                result.append((start, end))
            result = np.array(result)

        print(result)
        # with h5py.File(base_folder + "/" + file_path, 'r+') as f:
        #     del f['labels']
        #     f['labels'] = result

## Configs

In [None]:
class Configs():
    # Paths
    videos_dir = 'videos'
    info_file_path = 'OVSD.tsv'
    video_labels = 'labels'
    h5_folderpath = 'h5py_files'

opt = Configs()

## Copy files to env

In [None]:
!rm -rf videos
!rm -rf h5py_files
!rm -rf labels
!mkdir videos
!mkdir h5py_files
# !mkdir labels
# !cp -r "drive/Shareddrives/Surgery Video Research Team/Videos/OVSD/video" "videos/OVSD"
!cp -r "drive/Shareddrives/Surgery Video Research Team/Videos/bbc_planet_earth/video" "videos/BBC"
# !mv "videos/OVSD/labels" "labels/OVSD"
# !mv "videos/BBC/labels" "labels/BBC"

In [None]:
!cp -r "drive/Shareddrives/Surgery Video Research Team/Frames/Scenes_datasets/labels" "."

In [None]:
!rm -rf "labels/OVSD/new"

# Preprocessing

*   Dataset: OVSD
    *   Videos: 50 videos 20 videos (15min ~ 1hrs 33min (animation + movie))
    
    *   Annotation: For every video, there are multiple range of frames, each annotaion is one scene.

*    Dataset: BBC Planet Earth
    *   Videos: 11 videos (50min/video)
    *   Annotation: Each episode has ground truth shots and ground truth scenes for eacg annotator (5 annotators).
        *   shot: the range of frames

        *   scene: starting point of shots

## Create labels file

In [None]:
except_files = ["License.txt", "README.txt"]

mkdir(opt.video_labels + '/OVSD/new')

for label in label_files:
    with open(opt.video_labels + '/OVSD/' + label, 'r') as f:
        annotations = f.readlines()
    new_annot = []
    for annot in annotations:
        annot = annot.strip('\n').split('\t')
        new_annot.append(annot)
    with open(opt.video_labels + '/OVSD/new/' + label[:-4] + '.csv', 'w') as f:
        contents = []
        for annot in new_annot:
            contents.append(f"{annot[0]},{annot[1]}\n")
        f.writelines(contents)

-----Folder created-----


In [None]:
!rm -rf "labels/BBC/new"

In [None]:
label_shots = os.listdir(opt.video_labels + '/BBC/annotations/shots')
label_scenes_dir = os.listdir(opt.video_labels + '/BBC/annotations/scenes')

mkdir(opt.video_labels + '/BBC/new')

for dir in label_scenes_dir:
    mkdir(opt.video_labels + '/BBC/new/' + dir)
    for label in os.listdir(opt.video_labels + '/BBC/annotations/scenes/' + dir):
        with open(opt.video_labels + '/BBC/annotations/scenes/' + dir + '/' + label, 'r') as f:
            scene_annotations = f.readline().strip('\n').split(',')

            with open(opt.video_labels + '/BBC/annotations/shots/' + label, 'r') as h:
                shot_annotations = h.readlines()
                shot_annotations = [annot.strip('\n').split('\t') for annot in shot_annotations]
            
            new_annotations = []
            for i in range(len(scene_annotations)-1):
                ranges = [shot_annotations[int(scene_annotations[i])][0], shot_annotations[int(scene_annotations[i+1]) - 1][1]]
                new_annotations.append(ranges)
            
            if len(shot_annotations) != int(scene_annotations[-1]):
                new_annotations.append([shot_annotations[int(scene_annotations[-1])][0], shot_annotations[-1][1]])

            with open(opt.video_labels + '/BBC/new/'+ dir + '/' + label[:-4] + '.csv', 'w') as f:
                contents = []
                for annot in new_annotations:
                    contents.append(f"{annot[0]},{annot[1]}\n")
                f.writelines(contents)

-----Folder exists-----
-----Folder created-----
-----Folder created-----
-----Folder created-----
-----Folder created-----
-----Folder created-----


## Create h5 dataset file

### Extract all the frames from every video

The name of frames are as the form:



```
videoName_frameNum
```





In [None]:
mkdir('frames') # Create frame folder

-----Folder created-----


In [85]:
OVSD = os.listdir('OVSD/video')

for i in ['19_Valkaama.mkv']:
    print(f"processing: {i}")
    input_file_path = 'OVSD/video/' + i
    frame_path = 'datasets/frames/' + i[:-4]
    extract_frames(filename = input_file_path, file_dst = frame_path, frame_per_second = 3)

processing: 19_Valkaama.mkv
Original FPS:  25.0
Extracted FPS:  3
Number of frames:  139649.0
Total seconds:  5585.96
Extracting...
-----Folder created-----


  0%|          | 0/139649 [00:00<?, ?it/s]

Finish



In [57]:
OVSD = [i for i in os.listdir('OVSD/video') if i not in ['.ipynb_checkpoints']]
result = []
for i in OVSD:
    print(f"processing: {i}")
    input_file_path = 'OVSD/video/' + i
    result.append(get_frame_index(filename = input_file_path, frame_per_second = 5))

OVSD_hdf5 = [i for i in os.listdir('features/OVSD') if i not in ['.ipynb_checkpoints']]
for i, filename in enumerate(OVSD_hdf5):
    with h5py.File('features/OVSD/' + filename, 'r+') as f:
        if 'frame_indices'
        del f['frame_indices']
        f['frame_indices'] = result[i]

processing: 01_ElephantsDream.avi
Original FPS:  24.00003840006144
Extracted FPS:  5
Number of frames:  15691.0
Total seconds:  653.7906206
processing: 02_Fires_beneath_water.avi
Original FPS:  29.97002997002997
Extracted FPS:  5
Number of frames:  136853.0
Total seconds:  4566.328433333333
processing: 03_OCEANIA_Creative_Commons.avi
Original FPS:  23.976023976023978
Extracted FPS:  5
Number of frames:  78148.0
Total seconds:  3259.422833333333
processing: 04_1000_DAYS.mov
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  64615.0
Total seconds:  2584.6
processing: 05_BigBuckBunny.mp4
Original FPS:  24.0
Extracted FPS:  5
Number of frames:  14315.0
Total seconds:  596.4583333333334
processing: 06_Boy_Who_Never_Slept.mp4
Original FPS:  29.97002997002997
Extracted FPS:  5
Number of frames:  125469.0
Total seconds:  4186.4823
processing: 07_CH7.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  129713.0
Total seconds:  5188.52
processing: 08_Cosmos_Laundromat_First_Cycle.m

[array([    0,     5,    10, ..., 15680, 15685, 15690]),
 array([     0,      6,     12, ..., 136836, 136842, 136848]),
 array([    0,     5,    10, ..., 78135, 78140, 78145]),
 array([    0,     5,    10, ..., 64600, 64605, 64610]),
 array([    0,     5,    10, ..., 14300, 14305, 14310]),
 array([     0,      6,     12, ..., 125454, 125460, 125466]),
 array([     0,      5,     10, ..., 129700, 129705, 129710]),
 array([    0,     5,    10, ..., 17520, 17525, 17530]),
 array([     0,      6,     12, ..., 156090, 156096, 156102]),
 array([    0,     6,    12, ..., 37782, 37788, 37794]),
 array([    0,     5,    10, ..., 15575, 15580, 15585]),
 array([    0,     6,    12, ..., 21534, 21540, 21546]),
 array([     0,      5,     10, ..., 148790, 148795, 148800]),
 array([    0,     5,    10, ..., 82070, 82075, 82080]),
 array([    0,     5,    10, ..., 21300, 21305, 21310]),
 array([     0,      6,     12, ..., 146724, 146730, 146736]),
 array([     0,      5,     10, ..., 154855, 154860,

In [60]:
BBC = [i for i in os.listdir('BBC/video') if i not in ['.ipynb_checkpoints']]
result = []
for i in BBC:
    print(f"processing: {i}")
    input_file_path = 'BBC/video/' + i
    result.append(get_frame_index(filename = input_file_path, frame_per_second = 5))

BBC_hdf5 = [i for i in os.listdir('features/BBC') if i not in ['.ipynb_checkpoints']]
for i, filename in enumerate(BBC_hdf5):
    with h5py.File('features/BBC/' + filename, 'r+') as f:
        del f['frame_indices']
        f['frame_indices'] = result[i]

processing: 01_From_Pole_to_Pole.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73875.0
Total seconds:  2955.0
processing: 02_Mountains.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  72115.0
Total seconds:  2884.6
processing: 03_Ice_Worlds.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73925.0
Total seconds:  2957.0
processing: 04_Great_Plains.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73575.0
Total seconds:  2943.0
processing: 05_Jungles.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73850.0
Total seconds:  2954.0
processing: 06_Seasonal_Forests.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73985.0
Total seconds:  2959.4
processing: 07_Fresh_Water.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73935.0
Total seconds:  2957.4
processing: 08_Ocean_Deep.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73850.0
Total seconds:  2954.0
processing: 09_Shallow_Seas.mp4
Original FPS

[array([    0,     5,    10, ..., 73860, 73865, 73870]),
 array([    0,     5,    10, ..., 72100, 72105, 72110]),
 array([    0,     5,    10, ..., 73910, 73915, 73920]),
 array([    0,     5,    10, ..., 73560, 73565, 73570]),
 array([    0,     5,    10, ..., 73835, 73840, 73845]),
 array([    0,     5,    10, ..., 73970, 73975, 73980]),
 array([    0,     5,    10, ..., 73920, 73925, 73930]),
 array([    0,     5,    10, ..., 73835, 73840, 73845]),
 array([    0,     5,    10, ..., 73835, 73840, 73845]),
 array([    0,     5,    10, ..., 73360, 73365, 73370]),
 array([    0,     5,    10, ..., 73480, 73485, 73490])]

In [61]:
BBC_hdf5 = [i for i in os.listdir('features/BBC') if i not in ['.ipynb_checkpoints']]
for i, filename in enumerate(BBC_hdf5):
    with h5py.File('features/BBC/' + filename, 'r+') as f:
        del f['frame_indices']
        f['frame_indices'] = result[i]

In [59]:
OVSD_hdf5 = [i for i in os.listdir('features/OVSD') if i not in ['.ipynb_checkpoints']]
for i, filename in enumerate(OVSD_hdf5):
    with h5py.File('features/OVSD/' + filename, 'r+') as f:
        del f['frame_indices']
        f['frame_indices'] = result[i]

In [55]:
OVSD_hdf5

['01_ElephantsDream.hdf5',
 '02_Fires_beneath_water.hdf5',
 '04_1000_DAYS.hdf5',
 '05_BigBuckBunny.hdf5',
 '06_Boy_Who_Never_Slept.hdf5',
 '07_CH7.hdf5',
 '08_Cosmos_Laundromat_First_Cycle.hdf5',
 '09_Honey_final.hdf5',
 "10_Jathia's Wager Public Domain Universe.hdf5",
 '11_La_chute_d_une_plume.hdf5',
 '12_Meridian_Netflix.hdf5',
 '13_Route_66.hdf5',
 '14_Seven_Dead_Men.hdf5',
 '15_Sintel.hdf5',
 '16_SITA_SINGS_MOVIE_ONLY.hdf5',
 '17_Star_Wreck_In_the_Pirkinning.hdf5',
 '18_Tears_of_steel.hdf5',
 '19_Valkaama.hdf5',
 '20_Pentagon.hdf5']

In [64]:
OVSD_hdf5 = [i for i in os.listdir('features/OVSD') if i not in ['.ipynb_checkpoints']]
normalize_frame_summary_indices('features/OVSD', OVSD_hdf5, mode='in_range', dataset='OVSD')

01_ElephantsDream.hdf5
[[   50   650]
 [  655  1670]
 [ 1675  5375]
 [ 5380  7015]
 [ 7020  9240]
 [ 9265 10235]
 [10240 11075]
 [11080 13485]
 [13490 15690]]
[    0     5    10 ... 15680 15685 15690] 50 10
[    0     5    10 ... 15680 15685 15690] 655 131
[    0     5    10 ... 15680 15685 15690] 1675 335
[    0     5    10 ... 15680 15685 15690] 5380 1076
[    0     5    10 ... 15680 15685 15690] 7020 1404
[    0     5    10 ... 15680 15685 15690] 9265 1853
[    0     5    10 ... 15680 15685 15690] 10240 2048
[    0     5    10 ... 15680 15685 15690] 11080 2216
[    0     5    10 ... 15680 15685 15690] 13490 2698
[[   50   650]
 [  655  1670]
 [ 1675  5375]
 [ 5380  7015]
 [ 7020  9240]
 [ 9265 10235]
 [10240 11075]
 [11080 13485]
 [13490 15690]]
02_Fires_beneath_water.hdf5
[[     0   2304]
 [  2310   3324]
 [  3330   4884]
 [  4890   6084]
 [  6090   8118]
 [  8124   9918]
 [  9924  10488]
 [ 10494  17712]
 [ 17718  18282]
 [ 18288  23586]
 [ 23592  29730]
 [ 29736  34014]
 [ 34020 

IndexError: index 2863 is out of bounds for axis 0 with size 2863

In [77]:
BBC = [i for i in os.listdir('BBC/video') if i not in ['.ipynb_checkpoints']]
for i in BBC:
    print(f"processing: {i}")
    input_file_path = 'BBC/video/' + i
    frame_path = 'datasets/frames/BBC/' + i[:-4]
    extract_frames(filename = input_file_path, file_dst = frame_path, frame_per_second = 5)

processing: 01_From_Pole_to_Pole.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73875.0
Total seconds:  2955.0
Extracting...
-----Folder exists-----


  0%|          | 0/73875 [00:00<?, ?it/s]

Finish

processing: 02_Mountains.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  72115.0
Total seconds:  2884.6
Extracting...
-----Folder exists-----


  0%|          | 0/72115 [00:00<?, ?it/s]

Finish

processing: 03_Ice_Worlds.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73925.0
Total seconds:  2957.0
Extracting...
-----Folder exists-----


  0%|          | 0/73925 [00:00<?, ?it/s]

Finish

processing: 04_Great_Plains.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73575.0
Total seconds:  2943.0
Extracting...
-----Folder exists-----


  0%|          | 0/73575 [00:00<?, ?it/s]

Finish

processing: 05_Jungles.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73850.0
Total seconds:  2954.0
Extracting...
-----Folder exists-----


  0%|          | 0/73850 [00:00<?, ?it/s]

Finish

processing: 06_Seasonal_Forests.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73985.0
Total seconds:  2959.4
Extracting...
-----Folder exists-----


  0%|          | 0/73985 [00:00<?, ?it/s]

Finish

processing: 07_Fresh_Water.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73935.0
Total seconds:  2957.4
Extracting...
-----Folder exists-----


  0%|          | 0/73935 [00:00<?, ?it/s]

Finish

processing: 08_Ocean_Deep.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73850.0
Total seconds:  2954.0
Extracting...
-----Folder exists-----


  0%|          | 0/73850 [00:00<?, ?it/s]

Finish

processing: 09_Shallow_Seas.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73850.0
Total seconds:  2954.0
Extracting...
-----Folder exists-----


  0%|          | 0/73850 [00:00<?, ?it/s]

Finish

processing: 10_Caves.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73375.0
Total seconds:  2935.0
Extracting...
-----Folder exists-----


  0%|          | 0/73375 [00:00<?, ?it/s]

Finish

processing: 11_Deserts.mp4
Original FPS:  25.0
Extracted FPS:  5
Number of frames:  73492.0
Total seconds:  2939.68
Extracting...
-----Folder exists-----


  0%|          | 0/73492 [00:00<?, ?it/s]

Finish



### Create h5
Do not run this on Colab, images are way too many.

In [78]:
!mkdir "datasets/h5py_files"
!mkdir "datasets/h5py_files/BBC"
!mkdir "datasets/h5py_files/OVSD"

In [84]:
with h5py.File("datasets/h5py_files/OVSD/01_ElephantsDream.hdf5") as f:
    print(list(f.keys()))

0.jpg', '001711.jpg', '001712.jpg', '001713.jpg', '001714.jpg', '001715.jpg', '001716.jpg', '001717.jpg', '001718.jpg', '001719.jpg', '001720.jpg', '001721.jpg', '001722.jpg', '001723.jpg', '001724.jpg', '001725.jpg', '001726.jpg', '001727.jpg', '001728.jpg', '001729.jpg', '001730.jpg', '001731.jpg', '001732.jpg', '001733.jpg', '001734.jpg', '001735.jpg', '001736.jpg', '001737.jpg', '001738.jpg', '001739.jpg', '001740.jpg', '001741.jpg', '001742.jpg', '001743.jpg', '001744.jpg', '001745.jpg', '001746.jpg', '001747.jpg', '001748.jpg', '001749.jpg', '001750.jpg', '001751.jpg', '001752.jpg', '001753.jpg', '001754.jpg', '001755.jpg', '001756.jpg', '001757.jpg', '001758.jpg', '001759.jpg', '001760.jpg', '001761.jpg', '001762.jpg', '001763.jpg', '001764.jpg', '001765.jpg', '001766.jpg', '001767.jpg', '001768.jpg', '001769.jpg', '001770.jpg', '001771.jpg', '001772.jpg', '001773.jpg', '001774.jpg', '001775.jpg', '001776.jpg', '001777.jpg', '001778.jpg', '001779.jpg', '001780.jpg', '001781.jpg'

In [86]:
OVSD = [i for i in os.listdir('datasets/frames/OVSD') if i not in ['.ipynb_checkpoints']]
for i in OVSD:
    image_folder = 'datasets/frames/' + i
    h5file_path = 'datasets/h5py_files/' + i + '.hdf5'
    
    build_h5file(h5file_path, image_folder)
    
    # Check keys
    check = h5py.File(h5file_path, 'r')
    if check:
        print("OK")
    else:
        print("Error")

Image folder:  datasets/frames/19_Valkaama


Progress:   0%|                                                                                               …

OK


In [80]:
BBC = [i for i in os.listdir('datasets/frames/BBC') if i not in ['.ipynb_checkpoints']]
for i in BBC:
    image_folder = 'datasets/frames/BBC/' + i
    h5file_path = 'datasets/h5py_files/BBC/' + i + '.hdf5'
    
    build_h5file(h5file_path, image_folder)
    
    # Check keys
    check = h5py.File(h5file_path, 'r')
    if check:
        print("OK")
    else:
        print("Error")

Image folder:  datasets/frames/BBC/01_From_Pole_to_Pole


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/02_Mountains


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/03_Ice_Worlds


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/04_Great_Plains


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/05_Jungles


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/06_Seasonal_Forests


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/07_Fresh_Water


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/08_Ocean_Deep


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/09_Shallow_Seas


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/10_Caves


Progress:   0%|                                                                                               …

OK
Image folder:  datasets/frames/BBC/11_Deserts


Progress:   0%|                                                                                               …

OK


In [None]:
forever_run()

In [None]:
label_shots = os.listdir('BBC/labels/annotations/shots')

label_scenes_dir = os.listdir('BBC/labels/annotations/scenes')

mkdir('BBC/labels/new/scenes')
mkdir('BBC/labels/new/shots')
for dir in label_scenes_dir:

    mkdir('BBC/labels/new/scenes/' + dir)
    
    for label in os.listdir('BBC/labels/annotations/scenes/' + dir):
        with open('BBC/labels/annotations/scenes/' + dir + '/' + label, 'r') as f:
            scene_annotations = f.readline().strip('\n').split(',')

            with open('BBC/labels/annotations/shots/' + label, 'r') as h:
                shot_annotations = h.readlines()
                shot_annotations = [annot.strip('\n').split('\t') for annot in shot_annotations]

            new_annotations = []
            for i in range(len(scene_annotations)-1):
                ranges = [shot_annotations[int(scene_annotations[i])][0], shot_annotations[int(scene_annotations[i+1]) - 1][1]]
                new_annotations.append(ranges)

            if len(shot_annotations) != int(scene_annotations[-1]):
                new_annotations.append([shot_annotations[int(scene_annotations[-1])][0], shot_annotations[-1][1]])

            with open('BBC/labels/new/scenes/'+ dir + '/' + label[:-4] + '.csv', 'w') as f:
                contents = []
                for annot in new_annotations:
                    contents.append(f"{annot[0]},{annot[1]}\n")
                f.writelines(contents)

In [56]:
BBC = [i for i in os.listdir('features/BBC') if i not in ['.ipynb_checkpoints']]
lengths = []
for i, name in enumerate(BBC):
    with h5py.File(f"features/BBC/{name}", 'r') as f:
        lengths.append(f['labels/annotator_4'].shape)
lengths

[(46, 2),
 (40, 2),
 (45, 2),
 (50, 2),
 (47, 2),
 (54, 2),
 (55, 2),
 (42, 2),
 (39, 2),
 (38, 2),
 (48, 2)]

In [39]:
OVSD = [i for i in os.listdir('features/OVSD') if i not in ['.ipynb_checkpoints']]
lengths = []
for i, name in enumerate(OVSD):
    with h5py.File(f"features/OVSD/{name}", 'r') as f:
        lengths.append(f['features'].shape[0])
max(lengths)

30974