In [1]:
import os
import boto3
import json

In [2]:
# AWS S3 parameters
# The session uses the 'mturk' profile, so a profile with that name (and valid credentials)
# must exist in your AWS CLI configuration file.
session = boto3.Session(profile_name='mturk')
# S3 client shared by the download helpers defined in the cells below
s3_client = session.client('s3')
# Name of the bucket that holds 'versions.json' and the dataset files
s3_bucket = 'video-face-reco-dataset'

In [10]:
def download_videos(path_directory='', your_current_version='0'):
    """
    Download the videos of the dataset from S3 to a local directory.

    The list of videos is read from the ``versions.json`` object of the bucket, whose
    ``videos`` entry maps each video key to a version number. A video is downloaded
    when its version number is strictly greater than ``your_current_version``.

    :param path_directory: the directory where the videos will be downloaded.
        It is created if it does not exist. An empty string (the default) leaves the
        file names relative, i.e. the videos land in the current working directory.
    :param your_current_version:
        the current version of the dataset from which you have already downloaded videos.
        If 0: download all videos / Otherwise: download only new videos from this version
    :raises ValueError: if ``your_current_version`` is neither 0 nor one of the versions
        listed in ``versions.json``
    """
    # We load the video file names of the dataset
    versions_object = s3_client.get_object(Bucket=s3_bucket, Key='versions.json')
    versions = json.loads(versions_object['Body'].read().decode('utf-8'))
    accepted_versions = ['0'] + list(versions['versions'].keys())

    if str(your_current_version) not in accepted_versions:
        raise ValueError(f'Version {your_current_version} does not exist - accepted versions are : {",".join(accepted_versions)}')

    # We create the directory if it does not exist.
    # os.makedirs('') raises FileNotFoundError, so an empty path (the default) is skipped;
    # exist_ok=True avoids the check-then-create race of a separate os.path.exists() test.
    if path_directory:
        os.makedirs(path_directory, exist_ok=True)

    # The threshold does not change during the loop: convert it once
    current_version_number = int(your_current_version)

    # We download the videos of the dataset
    print('*** Downloading videos of the dataset... ***')
    for video_key, version in versions['videos'].items():
        video_name = os.path.basename(video_key)
        if int(version) > current_version_number:
            path_video = os.path.join(path_directory, video_name)
            print(f' - Downloading {path_video}...')
            s3_client.download_file(Bucket=s3_bucket, Key=video_key, Filename=path_video)
        else:
            print(f' - You already have "{video_name}" with your current version {your_current_version}.')

In [9]:
# Example: with your_current_version='0' every video listed in versions.json is downloaded
download_videos(path_directory='test_video_download/videos', your_current_version='0')

*** Downloading videos of the dataset... ***
 - Downloading test_video_download/videos/C dans l'air 1.mp4...


In [11]:
def download_version(path_directory='', version='last'):
    """
    Download the required version of the dataset from S3 to a local directory.

    The files of each version are read from the ``versions`` entry of the bucket's
    ``versions.json`` object, which maps a version identifier to a
    ``{file key: S3 VersionId}`` mapping. Files are written to
    ``<path_directory>/version_<version>``, which is created if it does not exist.

    :param path_directory: the directory where the dataset will be downloaded
    :param version: the version of the dataset to load - if 'last', the last version will be loaded
    :raises ValueError: if ``versions.json`` lists no version, or if ``version`` is not
        one of the listed versions
    """
    # We load the file names of the dataset for the required version
    versions_object = s3_client.get_object(Bucket=s3_bucket, Key='versions.json')
    versions = json.loads(versions_object['Body'].read().decode('utf-8'))
    available_versions = versions['versions']

    if not available_versions:
        raise ValueError('No version of the dataset is listed in versions.json')

    # Version identifiers are strings: a plain max() compares them lexicographically
    # ('9' > '10'), so they are compared as integers to find the real last version.
    try:
        last_version = max(available_versions, key=int)
    except ValueError:
        # Non-numeric identifiers: keep the plain string ordering
        last_version = max(available_versions)

    if version == "last":
        print(f'Last version: {last_version}')
        version = last_version
    elif str(version) not in available_versions:
        raise ValueError(f'Version {version} does not exist - last version is {last_version}')

    path_download = os.path.join(path_directory, f'version_{version}')
    # we create the directory if it does not exist (exist_ok avoids a check-then-create race)
    os.makedirs(path_download, exist_ok=True)

    # we download the dataset for the required version
    print(f'*** Downloading version {version} of the dataset... ***')
    for file_key, file_versionId in available_versions[str(version)].items():
        path_file = os.path.join(path_download, os.path.basename(file_key))
        print(f' - Downloading {path_file}...')
        # VersionId pins the exact S3 object revision recorded for this dataset version
        s3_client.download_file(
            Bucket=s3_bucket,
            Key=file_key,
            Filename=path_file,
            ExtraArgs={'VersionId': file_versionId}
        )

In [None]:
# Example: download the last version of the dataset into 'test_video_download/version_<n>'
download_version(path_directory='test_video_download', version="last")