## Prepare data

In this notebook we will be generically preparing data... from minio!!

In [1]:
MINIO_SERVICE='minio.default.svc.cluster.local:9000'
MINIO_ACCESS_KEY='self2face'
MINIO_SECRET_KEY='self2face'

In [2]:
get_ipython().run_line_magic("env", "AWS_ACCESS_KEY_ID=self2face")
get_ipython().run_line_magic("env", "AWS_SECRET_ACCESS_KEY=self2face")
get_ipython().run_line_magic("env", "S3_ENDPOINT=minio.default.svc.cluster.local:9000")
get_ipython().run_line_magic("env", "AWS_ENDPOINT_URL=http://minio.default.svc.cluster.local:9000")
get_ipython().run_line_magic("env", "S3_USE_HTTPS=0")
get_ipython().run_line_magic("env", "S3_VERIFY_SSL=0")
get_ipython().run_line_magic("env", "BUCKET_NAME=test-data")

env: AWS_ACCESS_KEY_ID=self2face
env: AWS_SECRET_ACCESS_KEY=self2face
env: S3_ENDPOINT=minio.default.svc.cluster.local:9000
env: AWS_ENDPOINT_URL=http://minio.default.svc.cluster.local:9000
env: S3_USE_HTTPS=0
env: S3_VERIFY_SSL=0
env: BUCKET_NAME=test-data


In [3]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [4]:
import os
import re
import io
import glob
import enum
import tarfile
from pathlib import Path

import cv2
import numpy as np
from PIL import Image
from minio import Minio
from minio.error import ResponseError
import tensorflow_datasets.public_api as tfds

In [5]:
# servicename.namespace.svc.cluster.local
# minio_client = Minio(os.environ["S3_ENDPOINT"],
#                      access_key=os.environ["AWS_ACCESS_KEY_ID"],
#                      secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
#                      secure=False)

# buckets = minio_client.list_buckets()

# for bucket in buckets:
#     print(bucket.name, bucket.creation_date)
#     print(list(map(get_name, minio_client.list_objects_v2("test-data", '/', recursive=True))))

# minio_client.bucket_exists("test-data")

# get_name = lambda object: object.object_name
# names = map(get_name, client.list_objects_v2("test-data", '/', recursive=True))
# for err in client.remove_objects("test-data", names):
#     print("Deletion Error: {}".format(err))
# client.remove_bucket("test-data")

# client.make_bucket("test-data")

# client.fput_object('test-data','asdf.jpg','./2002/08/11/big/img_591.jpg')

## Utility functions

In [6]:
normalize_string_line = lambda line:str(line.decode('utf-8').strip())

In [7]:
def convert(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

In [8]:
def get_file_from_tar(filepath, tarpath):
    '''Given a tarpath extract a file from tar'''
    return tarfile.open(tarpath).extractfile(filepath)

In [9]:
def file_to_image(fileobj):
    '''Given a file obj, attempt to create an Image'''
    nparr = np.frombuffer(fileobj.read(), np.uint8)
    img_np = cv2.cvtColor(cv2.imdecode(nparr, 1), cv2.COLOR_BGR2RGB)
    return Image.fromarray(img_np)

In [10]:
def latest_dir(input_dir):
    return max(glob.glob(os.path.join(input_dir, '*/')), key=os.path.getmtime)

In [11]:
def search_download_path(url, base_dir="./data"):
    base_download_filename = os.path.basename(url)
    for (dirpath, dirnames, filenames) in os.walk(base_dir):
        if base_download_filename in filenames:
            downloaded_filepath = os.path.join(dirpath, filenames[0])
            break
    return downloaded_filepath

## FDDB specific functions

In [12]:
def image_ellipse_to_box(image, major_axis_radius, minor_axis_radius, angle, center_x, center_y, detection_score):
    '''Given a PIL image and ellipse information, return a dict with bounding box information'''
    
    imagew=image.size[1]
    imageh=image.size[0]
    box = dotdict(
        x=1.0*center_x/imagew,
        y=1.0*center_y/imageh,
        w=1.0*minor_axis_radius*2/imagew,
        h=1.0*major_axis_radius*2/imageh,
        category=0
    )
    
    if box.w>0 and box.h>0 and box.x-box.w/2>=0 and\
       box.y-box.h/2>=0 and box.x+box.w/2<=1 and box.y+box.h/2<=1:
        return box
    else:
        return False

## Dataset!

In [13]:
class FDDB_Dataset(tfds.core.GeneratorBasedBuilder):
    """Short description of my dataset."""
    
    class FDDB_Parse_State(enum.Enum):
        '''States of fddb dataset annotations'''
        FILEPATH = 1
        NUMFACES = 2
        FACELOCATION = 3
        
    FDDB_BUCKET_NAME = "fddb"
        
    FDDB_DOWNLOAD_URLS={
        "images":"http://tamaraberg.com/faceDataset/originalPics.tar.gz",
        "annotations":"http://vis-www.cs.umass.edu/fddb/FDDB-folds.tgz"
    }

    VERSION = tfds.core.Version('0.0.0')
    
    def __init__(self, download_local, *args, **kwargs):
        super(FDDB_Dataset, self).__init__(*args, *kwargs)
        self.download_local = download_local
        self.minio_client = self.make_minio_client()
        self.data_count = 0

    def _info(self):
        return tfds.core.DatasetInfo(
            builder=self,
            features=tfds.features.FeaturesDict({
                "image": tfds.features.Image(),
                "bbox": tfds.features.BBoxFeature()
            }),
            supervised_keys=("image", "bbox")
        )

    def _split_generators(self, dl_manager):
        try:
            dl_paths = dl_manager.download(self.FDDB_DOWNLOAD_URLS)
        except tfds.download.download_manager.NonMatchingChecksumError:
            pass
        
        return self.extract_and_upload()
        
    def _generate_examples(self):
        # Yields examples from the dataset
        pass  # TODO        

    def make_minio_client(self, **kwargs):
        return Minio(os.environ["S3_ENDPOINT"],
                     access_key=os.environ["AWS_ACCESS_KEY_ID"],
                     secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
                     secure=False,
                     **kwargs)
    
    def upload_file_to_minio(self, bucket_name, objname, fileobj, size):
        '''Wrapper for uploading a file opject to minio'''
        try:
            if not self.minio_client.bucket_exists(bucket_name):
                self.minio_client.make_bucket(bucket_name)
        except ResponseError:
            pass        
        
        return self.minio_client.put_object(bucket_name, objname, fileobj, size)
    
    def generate_fddb_json_annotations(self, annotations_file_content, images_tarfile, image_file_extension=".jpg"):
        '''Generator of json annotations for fddb dataset'''

        ## Define base dict, state variable, and first line
        base_json_annotation = dotdict()
        current_state = self.FDDB_Parse_State.FILEPATH
        line = normalize_string_line(annotations_file_content.readline())

        ## Iter line until empty file
        while line:

            if current_state == self.FDDB_Parse_State.FILEPATH:
                image = file_to_image(images_tarfile.extractfile(line + image_file_extension))
                base_json_annotation['file'] = line + image_file_extension
                base_json_annotation['imagew'] = image.size[1]
                base_json_annotation['imageh'] = image.size[0]            
                current_state = self.FDDB_Parse_State.NUMFACES

            elif current_state == self.FDDB_Parse_State.NUMFACES:
                face_locations = []
                num_faces = int(line)
                current_state = self.FDDB_Parse_State.FACELOCATION

            elif current_state == self.FDDB_Parse_State.FACELOCATION:
                if num_faces > 0:
                    face_location_args = map(float,line.split())
                    bbox = image_ellipse_to_box(image, *face_location_args)
                    if bbox: face_locations.append(bbox)
                    num_faces -= 1
                else:
                    if len(face_locations):
                        yield dotdict({
                            **base_json_annotation, 
                            **{"face_locations":face_locations}})                
                    current_state = self.FDDB_Parse_State.FILEPATH
                    continue

            line = normalize_string_line(annotations_file_content.readline())
            
    def extract_and_upload(self):

        images_tarfile_path = search_download_path(self.FDDB_DOWNLOAD_URLS["images"])
        images_tarfile = tarfile.open(images_tarfile_path)
        annotations_tarfile_path = search_download_path(self.FDDB_DOWNLOAD_URLS["annotations"])
        annotations_tarfile = tarfile.open(annotations_tarfile_path)
        
        for annotation_file in filter(lambda tfn:"ellipse" in tfn.name, annotations_tarfile):
            for fddb_json_annotations in self.generate_fddb_json_annotations(annotations_tarfile.extractfile(annotation_file), images_tarfile):
                if not self.download_local:
                    image_response = self.upload_file_to_minio(bucket_name = self.FDDB_BUCKET_NAME, 
                                                               objname = fddb_json_annotations["file"],
                                                               fileobj = get_file_from_tar(fddb_json_annotations["file"], images_tarfile_path),
                                                               size = images_tarfile.getmember(fddb_json_annotations["file"]).size)
                    annotation_response = self.upload_file_to_minio(bucket_name = self.FDDB_BUCKET_NAME, 
                                                                    objname = str(Path(fddb_json_annotations["file"]).with_suffix(".json")),
                                                                    fileobj = io.BytesIO(str(fddb_json_annotations).encode()),
                                                                    size = len(str(fddb_json_annotations).encode()))
                    print(image_response, annotation_response)
                    self.data_count += 1
                    
                else:
                    pass

In [None]:
my_builder = tfds.builder(convert(FDDB_Dataset.__name__), download_local=False)
my_builder.download_and_prepare(
    download_dir=os.path.join(os.getcwd(),"data/"))

[1mDownloading and preparing dataset fddb__dataset (?? GiB) to /home/jovyan/tensorflow_datasets/fddb__dataset/0.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…


196c82b1ee5668c7efdba5d669084bd5 2b9838bb8d6e666060820061439c5033
d5a60ea3268460489582852dced0033c 0b08cc4fe8dacf26d246bb95ea06f1a0
4399258a4b58526e58c1f36bc7e08034 7f41ab7a76b21869ba89dc2dc1c2b99a
79bfff0cad523e69b952de4328045178 bcc722aca15d7f50e589afc64c50bb6a
d60f086245b23dcbde85cfdea29eeb00 8659c42484a3bd1d849a7c53a2864df1


In [19]:
minio_client = Minio(os.environ["S3_ENDPOINT"],
                     access_key=os.environ["AWS_ACCESS_KEY_ID"],
                     secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
                     secure=False)

In [27]:
[i.object_name for i in list(minio_client.list_objects_v2('fddb', recursive=True))][:10]

['2002/07/19/big/img_423.jpg',
 '2002/07/19/big/img_581.jpg',
 '2002/07/23/big/img_474.jpg',
 '2002/07/24/big/img_402.jpg',
 '2002/07/24/big/img_518.jpg',
 '2002/07/27/big/img_970.jpg',
 '2002/07/31/big/img_228.jpg',
 '2002/08/04/big/img_769.jpg',
 '2002/08/07/big/img_1316.jpg',
 '2002/08/11/big/img_591.jpg']

In [33]:
minio_client.fget_object('fddb', '2002/07/19/big/img_423.jpg', "./a.jpg")

<urllib3.response.HTTPResponse at 0x7fc5c503eeb8>