## Prepare data

In this notebook we will be generically preparing data... from minio!!

In [1]:
MINIO_SERVICE='minio.default.svc.cluster.local:9000'
MINIO_ACCESS_KEY='self2face'
MINIO_SECRET_KEY='self2face'

In [2]:
get_ipython().run_line_magic("env", "AWS_ACCESS_KEY_ID=self2face")
get_ipython().run_line_magic("env", "AWS_SECRET_ACCESS_KEY=self2face")
get_ipython().run_line_magic("env", "S3_ENDPOINT=minio.default.svc.cluster.local:9000")
get_ipython().run_line_magic("env", "AWS_ENDPOINT_URL=http://minio.default.svc.cluster.local:9000")
get_ipython().run_line_magic("env", "S3_USE_HTTPS=0")
get_ipython().run_line_magic("env", "S3_VERIFY_SSL=0")
get_ipython().run_line_magic("env", "BUCKET_NAME=test-data")

env: AWS_ACCESS_KEY_ID=self2face
env: AWS_SECRET_ACCESS_KEY=self2face
env: S3_ENDPOINT=minio.default.svc.cluster.local:9000
env: AWS_ENDPOINT_URL=http://minio.default.svc.cluster.local:9000
env: S3_USE_HTTPS=0
env: S3_VERIFY_SSL=0
env: BUCKET_NAME=test-data


In [3]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [4]:
import os
import glob
import enum
import tarfile

import cv2
import numpy as np
from PIL import Image
from minio import Minio
import tensorflow_datasets.public_api as tfds

In [5]:
# servicename.namespace.svc.cluster.local
client = Minio('minio.default.svc.cluster.local:9000',
               access_key='self2face',
               secret_key='self2face',
               secure=False)

buckets = client.list_buckets()

for bucket in buckets:
    print(bucket.name, bucket.creation_date)
    print(list(map(get_name, client.list_objects_v2("test-data", '/', recursive=True))))

test-data 2019-05-31 03:58:41.702000+00:00


NameError: name 'get_name' is not defined

In [None]:
# get_name = lambda object: object.object_name
# names = map(get_name, client.list_objects_v2("test-data", '/', recursive=True))
# for err in client.remove_objects("test-data", names):
#     print("Deletion Error: {}".format(err))
# client.remove_bucket("test-data")

# client.make_bucket("test-data","us-east-1")
# client.fput_object('test-data','asdf.jpg','./data/extracted/TAR_GZ.originalPics.tar.gz/2002/07/19/big/img_130.jpg')

## Utility functions

In [6]:
normalize_string_line = lambda line:str(line.decode('utf-8').strip())

In [7]:
def get_file_from_tar(filepath, tarpath):
    '''Given a tarpath extract a file from tar'''
    tf = tarfile.open(tarpath)
    return tf.extractfile(filepath)

In [8]:
def file_to_image(fileobj):
    '''Given a file obj, attempt to create an Image'''
    nparr = np.fromstring(fileobj.read(), np.uint8)
    img_np = cv2.cvtColor(cv2.imdecode(nparr, 1), cv2.COLOR_BGR2RGB)
    return Image.fromarray(img_np)

In [9]:
def latest_dir(input_dir):
    return max(glob.glob(os.path.join(input_dir, '*/')), key=os.path.getmtime)

In [10]:
def search_download_path(url, base_dir="./data"):
    base_download_filename = os.path.basename(url)
    for (dirpath, dirnames, filenames) in os.walk(base_dir):
        if base_download_filename in filenames:
            downloaded_filepath = os.path.join(dirpath, filenames[0])
            break
    return downloaded_filepath

## FDDB specific functions

In [11]:
def image_ellipse_to_box(image, major_axis_radius, minor_axis_radius, angle, center_x, center_y, detection_score):
    '''Given a PIL image and ellipse information, return a dict with bounding box information'''
    
    imagew=image.size[1]
    imageh=image.size[0]
    box = dotdict(
        x=1.0*center_x/imagew,
        y=1.0*center_y/imageh,
        w=1.0*minor_axis_radius*2/imagew,
        h=1.0*major_axis_radius*2/imageh,
        category=0
    )
    
    if box.w>0 and box.h>0 and box.x-box.w/2>=0 and\
       box.y-box.h/2>=0 and box.x+box.w/2<=1 and box.y+box.h/2<=1:
        return box
    else:
        print("Invalid position removed "+str(box.x)+" "+str(box.y)+" "+str(box.w)+" "+str(box.h))
        return False

## Dataset!

In [12]:
class MyDataset(tfds.core.GeneratorBasedBuilder):
    """Short description of my dataset."""
    
    class FDDB_Parse_State(enum.Enum):
        '''States of fddb dataset annotations'''
        FILEPATH = 1
        NUMFACES = 2
        FACELOCATION = 3
        
    FDDB_DOWNLOAD_URLS={
        "images":"http://tamaraberg.com/faceDataset/originalPics.tar.gz",
        "annotations":"http://vis-www.cs.umass.edu/fddb/FDDB-folds.tgz"
    }

    VERSION = tfds.core.Version('0.0.0')
    
    def __init__(self, download_local, *args, **kwargs):
        super(MyDataset, self).__init__(*args, *kwargs)
        self.download_local = download_local
        
    def _info(self):
        return tfds.core.DatasetInfo(
            builder=self,
            features=tfds.features.FeaturesDict({
                "image": tfds.features.Image(),
                "bbox": tfds.features.BBoxFeature()
            }),
            supervised_keys=("image", "bbox")
        )
    
    def generate_fddb_json_annotations(self, annotations_file_content, images_tarfile):
        '''Generator of json annotations for fddb dataset'''

        ## Define base dict, state variable, and first line
        base_json_annotation = dotdict()
        current_state = self.FDDB_Parse_State.FILEPATH
        line = normalize_string_line(annotations_file_content.readline())

        ## Iter line until empty file
        while line:

            if current_state == self.FDDB_Parse_State.FILEPATH:
                image = file_to_image(images_tarfile.extract(line + ".jpg"))
                base_json_annotation['file'] = line + ".jpg"
                base_json_annotation['imagew'] = image.size[1]
                base_json_annotation['imageh'] = image.size[0]            
                current_state = self.FDDB_Parse_State.NUMFACES

            elif current_state == self.FDDB_Parse_State.NUMFACES:
                face_locations = []
                num_faces = int(line)
                current_state = self.FDDB_Parse_State.FACELOCATION

            elif current_state == self.FDDB_Parse_State.FACELOCATION:
                if num_faces > 0:
                    face_location_args = map(float,line.split())
                    bbox = image_ellipse_to_box(image, *face_location_args)
                    if bbox: face_locations.append(bbox)
                    num_faces -= 1
                else:
                    if len(face_locations):
                        yield dotdict({
                            **base_json_annotation, 
                            **{"face_locations":face_locations}})                
                    current_state = self.FDDB_Parse_State.FILEPATH
                    continue

            line = normalize_string_line(annotations_file_content.readline())
            
    def extract_and_upload(self):

        images_tf = tarfile.open(search_download_path(self.FDDB_DOWNLOAD_URLS["images"]))
        annotations_tf = tarfile.open(search_download_path(self.FDDB_DOWNLOAD_URLS["annotations"]))
        
        for annotation_f in filter(lambda tfn:"ellipse" in tfn.name, annotations_tf):
            for box in self.generate_fddb_json_annotations(annotations_tf.extractfile(annotation_f), images_tf):
                print(box)

    def _split_generators(self, dl_manager):
        try:
            dl_paths = dl_manager.download(self.FDDB_DOWNLOAD_URLS)
        except tfds.download.download_manager.NonMatchingChecksumError:
            pass
        
        return self.extract_and_upload()
        
    def _generate_examples(self):
        # Yields examples from the dataset
        pass  # TODO

In [13]:
my_builder = tfds.builder("my_dataset", download_local=False)
my_builder.download_and_prepare(
    download_dir=os.path.join(os.getcwd(),"data/"))

[1mDownloading and preparing dataset my_dataset (?? GiB) to /home/jovyan/tensorflow_datasets/my_dataset/0.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…




AttributeError: 'NoneType' object has no attribute 'read'

In [48]:
images = "./data/tamaraberg.com_faceDataset_originalPics7kj66XsjMzQfGOnPP0c-LCTXdvVfD-_408NdmTA40GA.tar.gz.tmp.6188fcaf52904d60a9f8bc8c6d5530af/originalPics.tar.gz"
annotations = "./data/FDDB-folds.tgz"

In [49]:
tf = tarfile.open(annotations)
for f in tf:
    if "ellipse" not in f.name:
        continue
                
    for box in generate_fddb_json_annotations(tf.extractfile(f), images):
        print(box)

  This is separate from the ipykernel package so we can avoid doing imports until


{'file': '2002/08/11/big/img_591.jpg', 'imagew': 431, 'imageh': 450, 'face_locations': [{'x': 0.625738747099768, 'y': 0.3595137777777778, 'w': 0.39698143851508116, 'h': 0.549259111111111, 'category': 0}]}
Invalid position removed 1.1176862295081966 0.2616903140311804 0.284296393442623 0.3162274031180401
{'file': '2002/08/26/big/img_265.jpg', 'imagew': 305, 'imageh': 449, 'face_locations': [{'x': 0.345081868852459, 'y': 0.1942294788418708, 'w': 0.2918785901639344, 'h': 0.3000615545657016, 'category': 0}, {'x': 0.6035111967213115, 'y': 0.28807483518930954, 'w': 0.17747198032786884, 'h': 0.18680120267260578, 'category': 0}]}
{'file': '2002/07/19/big/img_423.jpg', 'imagew': 450, 'imageh': 449, 'face_locations': [{'x': 0.5675179977777778, 'y': 0.29792395768374164, 'w': 0.26390808444444447, 'h': 0.38788844097995545, 'category': 0}]}
{'file': '2002/08/24/big/img_490.jpg', 'imagew': 450, 'imageh': 370, 'face_locations': [{'x': 0.32370154222222225, 'y': 0.2110837972972973, 'w': 0.15580811111111

KeyboardInterrupt: 