# Example 3: Extract and dump image data

In this example, we will learn how to extract and dump image data.  

In [1]:
from dwtk.db import V3DBHandler as DBHandler

db_handler = DBHandler(
    db_class='meta',
    db_host='/data_pool_1/small_DrivingBehaviorDatabase/dwtk.db',
    base_dir_path='/data_pool_1/small_DrivingBehaviorDatabase',
    read_on_init=False
)
db_handler.read(where=
                'tags like "%camera%" '
                'and tags like "%image%" '
                'and tags like "%front%" '
                'and tags like "%center%" '
                'and record_id like "%016_00000000030000000240%"')
print('# of metadata: {}'.format(len(db_handler.df)))

# of metadata: 1


For now, we only collect images of one trip as we just want to see the workflow.

In [2]:
sample = next(db_handler)
sample

{'description': 'Driving Database',
 'database_id': 'Driving Behavior Database',
 'record_id': '016_00000000030000000240',
 'data_type': 'raw_data',
 'path': '/data_pool_1/small_DrivingBehaviorDatabase/records/016_00000000030000000240/data/camera_01.mp4',
 'start_timestamp': 1489728491.0,
 'end_timestamp': 1489728570.957,
 'content_type': 'video/mp4',
 'contents': 'camera/front-center',
 'msg_type': None,
 'msg_md5sum': None,
 'count': None,
 'frequency': None,
 'tags': ['camera', 'front', 'center', 'image']}

The straightforward way is to load a video file and save image file frame-by-frame.
But in this method, as all the frames will be loaded from the file at first,
it takes a long time until the process of the first frame begins as follows.  

In [3]:
import time
from PIL import Image
from tqdm.notebook import tqdm
from dwtk.io import BaseFileReader

reader = BaseFileReader()

t1 = time.time()
timestamps, data, columns = reader.read(sample, target_frame_rate=2, resize_rate=0.25)
t2 = time.time()
print("Time (Method 1): {0:.03f}".format(t2 - t1))

for timestamp, frame in tqdm(zip(timestamps, data)):
    image_array = frame[:, :, ::-1]     # Convert BGR to RGB
    image = Image.fromarray(image_array)
    image.save('dump/{0}-{1:.03f}.jpg'.format(sample['record_id'], timestamp))

Failed to load Python extension for LZ4 support. LZ4 compression will not be available.


Time (Method 1): 68.652


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In addition, the process may die due to lack in free space in the memory 
as all the data loaded from the file will be stored in the memory.

The following method avoids the problems above by defining a function that
splits the data to load into short sequences and load them one-by-one.

In [4]:
from copy import deepcopy
import time

from PIL import Image
from dwtk.io import BaseFileReader


def load_images(metadata, segment_duration, **kwargs):
    """Load images.

    Args:
        metadata (dict): metadata
        segment_duration (float): duration of each segment

    Returns:
        pass

    """
    assert 'start_timestamp' in metadata.keys()
    assert 'end_timestamp' in metadata.keys()

    duration = metadata['end_timestamp'] - metadata['start_timestamp']

    for idx in range(int(duration // segment_duration)):
        _metadata = deepcopy(metadata)
        _metadata.update({
            'start_timestamp': metadata['start_timestamp'] + segment_duration * idx,
            'end_timestamp': metadata['start_timestamp'] + segment_duration * (idx + 1)
        })
        reader = BaseFileReader()
        yield reader.read(_metadata, **kwargs)
        

# Load video and save images
target_frame_rate = 2
seq_duration = 10
num_seq = int((sample['end_timestamp'] - sample['start_timestamp']) // seq_duration)
t3 = time.time()
for timestamps, data, columns in tqdm(
        load_images(sample, segment_duration=seq_duration, target_frame_rate=target_frame_rate, resize_rate=0.25), 
        total=num_seq
    ):
    for timestamp, frame in zip(timestamps, data):
        image_array = frame[:, :, ::-1]     # Convert BGR to RGB
        image = Image.fromarray(image_array)
        image.save('dump/{0}-{1:.03f}.jpg'.format(sample['record_id'], timestamp))
t4 = time.time()
print("Time (Method 2): {0:.03f}".format(t4 - t3))

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Time (Method 2): 59.327
