This script transforms raw imagery (in the not_a_dam_images and dam_images folders) into Darknet-formatted inputs for YOLOv3 (TODO: link to the implementation, Maanas?)

Outputs in 2 directories:
* images: has filename.png
* labels: filename.txt : class, center x, center y, width, height (x y w h are all normalized (between 0 and 1) relative to image dimensions)

In [10]:
# import required libraries

import os
import shutil
import zipfile
import json
import numpy as np
import shapely
import random
from matplotlib import image as mpimg

import gdal

import requests
## Useless?
from osgeo import gdal
import ogr
import shapely.wkb
import shapely.prepared

import logging
# Root logger used throughout the notebook; DEBUG is the lowest standard
# level, so no message is filtered out by the logger itself.
LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)
LOGGER.info("Logger in INFO mode")
LOGGER.debug("Logger in DEBUG mode")

# Timeout (seconds) passed to requests.get() when downloading files.
REQUEST_TIMEOUT = 1.0

#### Dataset paths

In [3]:
## Paths

# All data paths below are built relative to this root.
path_to_charlie_root = "../../.."
NOT_A_DAM_IMAGE_DIR = os.path.join(path_to_charlie_root, "data/imagery-6-7-2019/not_a_dam_images")
DAM_IMAGE_DIR = os.path.join(path_to_charlie_root, "data/imagery-6-7-2019/dam_images")

TM_WORLD_BORDERS_URL = 'https://storage.googleapis.com/ecoshard-root/ipbes/TM_WORLD_BORDERS_SIMPL-0.3_md5_15057f7b17752048f9bd2e2e607fe99c.zip'

# Fail early if an input folder is missing: both are listed with
# os.listdir() further down, so each must be an existing directory.
for required_dir in (NOT_A_DAM_IMAGE_DIR, DAM_IMAGE_DIR):
    if not os.path.isdir(required_dir):
        raise ValueError("can't find directory %s" % required_dir)

#OUTPUTS_DIR = os.path.join(path_to_charlie_root,"data/YOLOready_imagery_6-7_made_6-21")
OUTPUTS_DIR = os.path.join(path_to_charlie_root, "data/YOLOready_imagery_6-7_test")
# The downloaded world borders and the generated image sets share one folder.
WORKSPACE_DIR = OUTPUTS_DIR

Run just one of the two cells below (development subset or full dataset)

In [4]:
## Subsets of data inputs - for faster development purposes

# Pick the file names first, then turn them into full paths.
dam_subset_names = [name for name in os.listdir(DAM_IMAGE_DIR)
                    if name.endswith('5140_clipped.png')]
dam_file_list = [os.path.join(DAM_IMAGE_DIR, name) for name in dam_subset_names]

not_a_dam_subset_names = [name for name in os.listdir(NOT_A_DAM_IMAGE_DIR)
                          if name.endswith('362_not_a_dam.png')]
not_a_dam_file_list = [os.path.join(NOT_A_DAM_IMAGE_DIR, name)
                       for name in not_a_dam_subset_names]

In [None]:
## Full dataset

# Pick the file names first, then turn them into full paths.
dam_image_names = [name for name in os.listdir(DAM_IMAGE_DIR)
                   if name.endswith('clipped.png')]
dam_file_list = [os.path.join(DAM_IMAGE_DIR, name) for name in dam_image_names]

not_a_dam_image_names = [name for name in os.listdir(NOT_A_DAM_IMAGE_DIR)
                         if name.endswith('not_a_dam.png')]
not_a_dam_file_list = [os.path.join(NOT_A_DAM_IMAGE_DIR, name)
                       for name in not_a_dam_image_names]

In [5]:
# Sanity check: number of dam images selected
len(dam_file_list)

2

In [6]:
# Sanity check: number of not-a-dam images selected
len(not_a_dam_file_list)

7

Always run this cell: it merges dam_file_list and not_a_dam_file_list into a single shuffled list

In [46]:
# Sort before shuffling: os.listdir() returns names in arbitrary order, so
# without a fixed starting order the seeded shuffle below would not produce
# the same sequence (and therefore the same data split) on every run/machine.
all_images_file_list = sorted(dam_file_list + not_a_dam_file_list)

random.seed(0)
random.shuffle(all_images_file_list)

# Sanity check: total number of images to process
len(all_images_file_list)


9

#### Parameters

In [8]:
# Images whose random draw falls below this value go to the test set.
holdout_set_portion = 0.2
# Images whose random draw falls above (1 - this value) go to the validation set.
validation_set_portion = 0.15
# Number of images handed to Make_Darknet_inputs per round.
Dams_per_round = 5  # 1000  # = max_dams_per_record

#### Get South Africa geometry

In [23]:
def download_url_to_file(url, target_file_path):
    """Use requests to download a file.

    The payload is streamed into a temporary ``.part`` file that is renamed
    to ``target_file_path`` only once it is fully written, so a failed or
    timed-out download never leaves a truncated file at the target path.

    Parameters:
        url (string): url to file.
        target_file_path (string): local path to download the file.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts
            or an HTTP error status.
        OSError: if the target directory or file cannot be written.

    Any exception is logged with its traceback and then re-raised.

    """
    partial_file_path = target_file_path + '.part'
    try:
        target_dir = os.path.dirname(target_file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        response = requests.get(url, stream=True, timeout=REQUEST_TIMEOUT)
        try:
            # Without this check an HTTP error page would be saved as the file.
            response.raise_for_status()
            with open(partial_file_path, 'wb') as partial_file:
                shutil.copyfileobj(response.raw, partial_file)
        finally:
            response.close()
        os.replace(partial_file_path, target_file_path)
    except Exception:
        LOGGER.exception(
            'download of %s to %s failed', url, target_file_path)
        raise

        
        
        
world_borders_dir = os.path.join(WORKSPACE_DIR, 'world_borders')
tm_world_borders_zip_path = os.path.join(
    world_borders_dir, os.path.basename(TM_WORLD_BORDERS_URL))
tm_world_borders_vector_path = os.path.join(
    world_borders_dir, 'TM_WORLD_BORDERS-0.3.shp')

if not os.path.exists(tm_world_borders_zip_path):
    download_url_to_file(TM_WORLD_BORDERS_URL, tm_world_borders_zip_path)
# Extract whenever the shapefile is missing, not only right after a download,
# so an interrupted earlier extraction is repaired on the next run.
if not os.path.exists(tm_world_borders_vector_path):
    with zipfile.ZipFile(tm_world_borders_zip_path, 'r') as zip_ref:
        zip_ref.extractall(world_borders_dir)

tm_world_borders_vector = ogr.Open(tm_world_borders_vector_path)
# ogr.Open returns None instead of raising unless GDAL exceptions are enabled.
if tm_world_borders_vector is None:
    raise ValueError("can't open %s" % tm_world_borders_vector_path)
tm_world_borders_layer = tm_world_borders_vector.GetLayer()

# Prepared geometry of South Africa, used for fast point-in-polygon tests.
sa_geom_prep = None
for border_feature in tm_world_borders_layer:
    if border_feature.GetField('NAME') == 'South Africa':
        sa_geom = border_feature.GetGeometryRef()
        sa_geom_prep = shapely.prepared.prep(
            shapely.wkb.loads(sa_geom.ExportToWkb()))
        break
if sa_geom_prep is None:
    raise ValueError(
        "no 'South Africa' feature in %s" % tm_world_borders_vector_path)
LOGGER.debug(sa_geom_prep)

# Now Make YOLO-ready data !

Utils: function to make YOLO-ready data

In [54]:
def _darknet_label(image_path, pixel_bounding_box):
    """Return the Darknet label line ('0 cx cy w h', normalized) for a dam image."""
    img = mpimg.imread(image_path)
    # Image arrays are indexed (row, column[, channel]): rows are the height.
    img_h, img_w = img.shape[0], img.shape[1]

    # x values sit at indices 0 and 2, y values at indices 1 and 3.
    x_a, y_a, x_b, y_b = (pixel_bounding_box[i] for i in range(4))
    center_x = (x_a + x_b) / (2 * img_w)
    center_y = (y_a + y_b) / (2 * img_h)
    # abs() keeps the sizes positive whichever corner is listed first.
    box_w = abs(x_b - x_a) / img_w
    box_h = abs(y_b - y_a) / img_h
    return '0 %.6f %.6f %.6f %.6f' % (center_x, center_y, box_w, box_h)


def Make_Darknet_inputs(images_file_list, iteration):
    """Copy a batch of images into the Darknet folder layout and write labels.

    For every image, the ``.json`` metadata file next to it is read and the
    image is assigned to one of the ``southaf_set`` / ``test_set`` /
    ``validation_set`` / ``training_set`` folders under ``OUTPUTS_DIR``.
    The png is copied to ``<set>/images`` and a label file is written to
    ``<set>/labels``. Dam labels are ``0 <center x> <center y> <width>
    <height>``, normalized by the image dimensions; images with
    ``not_a_dam`` in their path get an empty label file.

    Relies on module-level state: ``sa_geom_prep``, ``holdout_set_portion``,
    ``validation_set_portion``, ``OUTPUTS_DIR`` and the four ``*_log``
    dictionaries, which are updated in place.

    Parameters:
        images_file_list (list): paths to the png images of this batch.
        iteration (int): round number; it seeds the random draws used for
            the test/validation/training assignment.

    Returns:
        tuple: (training_log, validation_log, test_log, southaf_log).

    Raises:
        NameError: if the json file matching an image is missing (exception
            type kept for backward compatibility).
        Exception: if the json has no ``lng_lat_centroid`` entry.

    """
    # shapely.geometry is not imported explicitly at the top of the file.
    import shapely.geometry

    print('Starting to make YOLO-ready data, round %d' % iteration)

    # Draw all random numbers up front so the sequence depends only on the
    # seed and the batch size.
    random.seed(iteration)
    random_numbers_list = [random.random() for _ in images_file_list]

    for image_path, random_number in zip(images_file_list, random_numbers_list):

        # get matching bounding box json file
        json_path = os.path.splitext(image_path)[0] + '.json'
        if not os.path.exists(json_path):
            raise NameError("can't find bbox for %s" % json_path)

        with open(json_path, 'r') as json_file:
            image_metadata = json.load(json_file)

        # Define new label in YOLO format; only dams carry a bounding box,
        # so the image is read and the box normalized for dams only.
        if 'not_a_dam' in image_path:
            dam_type = 'not_a_dam'
            label_str = ''
        else:
            dam_type = 'dam'
            label_str = _darknet_label(
                image_path, image_metadata['pixel_bounding_box'])

        # - - -   - - -   - - -
        # Choose which set this record goes to
        try:
            centroid = image_metadata['lng_lat_centroid']
        except KeyError:
            raise Exception("Missing lat/lon in file %s" % json_path)

        location = shapely.geometry.Point(centroid[0], centroid[1])
        if sa_geom_prep.contains(location):  # both for dams & not_a_dams
            writer, log = 'southaf_set', southaf_log
        elif random_number < holdout_set_portion:
            writer, log = 'test_set', test_log
        elif random_number > (1 - validation_set_portion):
            writer, log = 'validation_set', validation_log
        else:
            writer, log = 'training_set', training_log

        # Write the image and its label in the corresponding set
        filename = os.path.splitext(os.path.basename(image_path))[0]

        newimg_filepath = os.path.join(
            OUTPUTS_DIR, writer, 'images', filename + '.png')
        shutil.copyfile(image_path, newimg_filepath)

        newtext_filepath = os.path.join(
            OUTPUTS_DIR, writer, 'labels', filename + '.txt')
        with open(newtext_filepath, 'w') as label_file:
            label_file.write(label_str)

        # Add stats
        log[dam_type] += 1

    return training_log, validation_log, test_log, southaf_log

Full loop

In [48]:
# Create the workspace output directories if they don't exist yet

directories_to_make = [WORKSPACE_DIR]
for set_name in ('training_set', 'validation_set', 'test_set', 'southaf_set'):
    set_dir = os.path.join(WORKSPACE_DIR, set_name)
    directories_to_make.extend([
        set_dir,
        os.path.join(set_dir, 'labels'),
        os.path.join(set_dir, 'images'),
    ])

for directory in directories_to_make:
    # exist_ok tolerates reruns and creates missing parents; genuine failures
    # (e.g. permissions) surface here rather than later when files are copied.
    os.makedirs(directory, exist_ok=True)

In [55]:
# Do the thing
# Per-set counters of processed images, updated by Make_Darknet_inputs.
training_log = {'dam': 0, 'not_a_dam': 0}
validation_log = {'dam': 0, 'not_a_dam': 0}
test_log = {'dam': 0, 'not_a_dam': 0}
southaf_log = {'dam': 0, 'not_a_dam': 0}

# Index of the last non-empty round (-1 when there are no images), so no
# empty extra round runs when the image count is a multiple of Dams_per_round.
max_iteration = (len(all_images_file_list) - 1) // Dams_per_round

for iteration in range(max_iteration + 1):

    LOGGER.info('iteration %d', iteration)

    # Get a slice of the dataset (slicing clamps at the end of the list)
    slice_start = iteration * Dams_per_round
    slice_dam_images_list = all_images_file_list[slice_start:slice_start + Dams_per_round]

    # This is where the Darknet-ready inputs are made
    training_log, validation_log, test_log, southaf_log = Make_Darknet_inputs(slice_dam_images_list, iteration)

    print('training_log',training_log)
    print('validation_log',validation_log)
    print('test_log',test_log)
    print('southaf_log',southaf_log)
    print('\n')


Starting to make YOLO-ready data, round 0
training_log {'dam': 1, 'not_a_dam': 4}
validation_log {'dam': 0, 'not_a_dam': 0}
test_log {'dam': 0, 'not_a_dam': 0}
southaf_log {'dam': 0, 'not_a_dam': 0}


Starting to make YOLO-ready data, round 1
training_log {'dam': 2, 'not_a_dam': 6}
validation_log {'dam': 0, 'not_a_dam': 0}
test_log {'dam': 0, 'not_a_dam': 1}
southaf_log {'dam': 0, 'not_a_dam': 0}


