Preparing dam data for darknet format:

This script has two inputs:
    - not_a_dam_images.zip
    - dam_images.zip
and two outputs (directories)
    - images
    - labels

The input zip files should be in the same directory

Each output image file will correspond to a label file, which is in darknet format (class, center x, center y, width, height)

x y w h are all normalized (between 0 and 1) relative to image dimensions

In [6]:
# import required libraries

import os
import shutil
import zipfile as zf
import json
import numpy as np
import shapely
import random
from matplotlib import image as mpimg

import gdal

import requests

import logging
LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)
LOGGER.info("Logger in INFO mode")
LOGGER.debug("Logger in DEBUG mode")
LOGGER.debug("Logger in DEBUG mode")

REQUEST_TIMEOUT = 1.0

ImportError: No module named 'gdal'

#### Dataset paths

In [8]:
##Paths

path_to_charlie_root = "../../.."
NOT_A_DAM_IMAGE_DIR = os.path.join(path_to_charlie_root,"data/imagery-6-7-2019/not_a_dam_images")
DAM_IMAGE_DIR = os.path.join(path_to_charlie_root,"data/imagery-6-7-2019/dam_images")

TM_WORLD_BORDERS_URL = 'https://storage.googleapis.com/ecoshard-root/ipbes/TM_WORLD_BORDERS_SIMPL-0.3_md5_15057f7b17752048f9bd2e2e607fe99c.zip'

if not os.path.exists(NOT_A_DAM_IMAGE_DIR):
    raise ValueError("can't find %s'" % NOT_A_DAM_IMAGE_DIR)
if not os.path.exists(DAM_IMAGE_DIR):
    raise ValueError("can't find %s'" % DAM_IMAGE_DIR)
    
OUTPUTS_DIR = os.path.join(path_to_charlie_root,"data/YOLOready_imagery_6-7_made_6-20")
WORKSPACE_DIR = OUTPUTS_DIR

Run just one of the cells below

In [9]:
## Subsets of data inputs - for faster development purposes

dam_file_list = [os.path.join(DAM_IMAGE_DIR, f)
                 for f in os.listdir(DAM_IMAGE_DIR) if f.endswith('5140_clipped.png')]
not_a_dam_file_list = [os.path.join(NOT_A_DAM_IMAGE_DIR, f)
                       for f in os.listdir(NOT_A_DAM_IMAGE_DIR) if f.endswith('362_not_a_dam.png')]

In [None]:
## Full dataset 

dam_file_list = [os.path.join(DAM_IMAGE_DIR, f)
                 for f in os.listdir(DAM_IMAGE_DIR) if f.endswith('clipped.png')]
not_a_dam_file_list = [os.path.join(NOT_A_DAM_IMAGE_DIR, f)
                       for f in os.listdir(NOT_A_DAM_IMAGE_DIR) if f.endswith('not_a_dam.png')]

In [6]:
len(dam_file_list)

2

In [7]:
len(not_a_dam_file_list)

7

Do run this one to merge dam_list and not_a_dam_list

In [10]:
all_images_file_list = dam_file_list+not_a_dam_file_list

random.seed(0)
random.shuffle(all_images_file_list)

len(all_images_file_list)


9

#### Parameters

In [None]:
holdout_set_portion = .2
validation_set_portion = .15
Dams_per_round = 5#1000 # = max_dams_per_record 

#### Get South Africa geometry

In [13]:
def download_url_to_file(url, target_file_path):
    """Use requests to download a file.

    Parameters:
        url (string): url to file.
        target_file_path (string): local path to download the file.

    Returns:
        None.

    """
    try:
        response = requests.get(url, stream=True, timeout=REQUEST_TIMEOUT)
        try:
            os.makedirs(os.path.dirname(target_file_path))
        except OSError:
            pass
        with open(target_file_path, 'wb') as target_file:
            shutil.copyfileobj(response.raw, target_file)
        del response
    except:
        LOGGER.exception('download of {url} to {target_file_path} failed')
        # mods from LOGGER.exception(f'download of {url} to {target_file_path} failed')
        raise

        
        
        
tm_world_borders_zip_path = os.path.join(
        WORKSPACE_DIR, 'world_borders', os.path.basename(TM_WORLD_BORDERS_URL))
if not os.path.exists(tm_world_borders_zip_path):
    download_url_to_file(TM_WORLD_BORDERS_URL, tm_world_borders_zip_path)
    with zipfile.ZipFile(tm_world_borders_zip_path, 'r') as zip_ref:
        zip_ref.extractall(WORKSPACE_DIR)

tm_world_borders_vector_path = os.path.join(
    WORKSPACE_DIR, 'TM_WORLD_BORDERS-0.3.shp')
#tm_world_borders_vector = gdal.Open(ogr.Open(tm_world_borders_vector_path)) # Changed OpenEx to Open.
    #tm_world_borders_vector_path,ogr.Open(path))#, gdal.OF_VECTOR)
tm_world_borders_vector = ogr.Open(tm_world_borders_vector_path)
tm_world_borders_layer = tm_world_borders_vector.GetLayer()
for border_feature in tm_world_borders_layer:
    if border_feature.GetField('NAME') == 'South Africa':
        sa_geom = border_feature.GetGeometryRef()
        sa_geom_prep = shapely.prepared.prep(
            shapely.wkb.loads(sa_geom.ExportToWkb()))
        break
LOGGER.debug(sa_geom_prep)

NameError: name 'LOGGER' is not defined

# Now Make YOLO-ready data !

In [None]:
Utils: Function to make YOLO_ready data

In [None]:
def create_images_labels(images_file_list, iteration): 
    
    """ 
    
    Takes in folder of dam pngs, folder of bounding box json files
    Normalizes json data to darknet format (center x, center y, bbox width, bbox height)
    Creates new directories in darknet format
    
    """
    
    print('Starting to make YOLO-ready data, round %d' % iteration)
    
    random.seed(iteration)
    random_numbers_list = [random.random() for x in range(0, len(images_file_list))]
    random_number_iterator = 0
    
    for image_path in images_file_list:
        
        # read in image
        img = mpimg.imread(image_path)

        # get width and height
        img_w = img.shape[0]
        img_h = img.shape[1]

        # get matching bounding box json file
        json_path = image_path.replace('.png', '.json')
        if not os.path.exists(json_filepath):
            raise NameError("can't find bbox for %s" % json_path)
            
        # read json bounding box coordinates
        with open(json_path, 'r') as json_file: 
            image_metadata = json.load(json_file)
        
        # normalize to x-center, y-center, width, and height of bbox
        coords = image_metadata['pixel_bounding_box']
        avg_x = (coords[2] + coords[0]) / (2 * img_w)
        avg_y = (coords[1] + coords[3]) / (2 * img_h)
        nrm_w = (coords[2] - coords[0]) / img_w
        nrm_h = (coords[1] - coords[3]) / img_h
        nrm_xywh = np.array([avg_x, avg_y, nrm_w, nrm_h])

        # Define new label in YOLO format
        if 'not_a_dam' in image_path:
            dam_type = 'not_a_dam'
            label_str = ''
        else:
            dam_type = 'dam'
            label_str = '0 ' + str('%.6f'%nrm_xywh[0]) + ' ' + str('%.6f'%nrm_xywh[1]) + ' ' + str('%.6f'%nrm_xywh[2]) + ' ' + str('%.6f'%nrm_xywh[3])

            
            
        # - - -   - - -   - - -   
        # Choose whether this record will go to training or validation (=dev) set 
        try:
            centroid = image_metadata['lng_lat_centroid']
        except NameError:
            raise Exception("Missing lat/lon for in file", json_path)
            
            
        random_number = random_numbers_list[random_number_iterator]
        random_number_iterator+=1
        
        if sa_geom_prep.contains(shapely.geometry.Point(centroid[0], centroid[1])): # both for dams & not_a_dams
            writer = 'southaf_set'
            log = southaf_log
        elif random_number < holdout_set_portion:
            writer = 'test_set'
            log = test_log
        elif random_number > (1-validation_set_portion):
            writer = 'validation_set'
            log = validation_log
        else:
            writer = 'training_set'
            log = training_log
            
        # Write the file in the corresponding set
        
        ## Write image here:
        filename = 
            
        newimg_filepath = os.path.join(OUTPUTS_DIR,writer,'images',filename,'.png')
        shutil.copyfile(image_path, newimg_filepath)
        
        
        newtext_filepath = os.path.join(OUTPUTS_DIR,writer,'labels',filename,'.txt')
        file = open(newtext_filepath, 'w')
        file.write(label_str)
        file.close()

        # Add stats 
        log[dam_type] += 1

Full loop

In [22]:
os.mkdir(os.path.join(OUTPUTS_DIR,'images'))
os.mkdir(os.path.join(OUTPUTS_DIR,'labels'))

# Trash

In [None]:
def create_images_labels(png_folder, bbox_folder, classtype): 
    
    """ 
    
    Takes in folder of dam pngs, folder of bounding box json files
    Normalizes json data to darknet format (center x, center y, bbox width, bbox height)
    Creates new directories in darknet format
    
    """
    
    for image_filename in os.listdir(png_folder):
       
        # read in image
        img = mpimg.imread(os.path.join(png_folder,image_filename))

        # get width and height
        img_w = img.shape[0]
        img_h = img.shape[1]

        # get matching bounding box json file
        json_filename = image_filename[:-4] + '.json'
        json_filepath = os.path.join(bbox_folder, json_filename)
        if not os.path.exists(json_filepath):
            continue
            #raise NameError("can't find bbox for %s" % json_filename)
            
        # read json bounding box coordinates
        with open(json_filepath) as json_file: 
            data = json.load(json_file)
            coords = data['pixel_bounding_box']

        # normalize to x-center, y-center, width, and height of bbox
        avg_x = (coords[2] + coords[0]) / (2 * img_w)
        avg_y = (coords[1] + coords[3]) / (2 * img_h)
        nrm_w = (coords[2] - coords[0]) / img_w
        nrm_h = (coords[1] - coords[3]) / img_h
        nrm_xywh = np.array([avg_x, avg_y, nrm_w, nrm_h])

        # create new folder for labels
#         label_filename =
#         shutil.copyfile(os.path.join(OUTPUTS_DIR,'labels',image_filename[:-4],'.txt'))

        # write new label file and move to new folder
        if classtype == 'dam':
            label_str = '0 ' + str('%.6f'%nrm_xywh[0]) + ' ' + str('%.6f'%nrm_xywh[1]) + ' ' + str('%.6f'%nrm_xywh[2]) + ' ' + str('%.6f'%nrm_xywh[3])
        else:
            label_str = ''
        
        shutil.copyfile(json_filepath, '/labels' + json_filename[:-5] + '.txt')   
        file = open(os.path.join(OUTPUTS_DIR, 'labels') + '/' + json_filename[:-5] + '.txt', 'w')
        file.write(label_str)
        file.close()

        # move images to new folder
        new_image_filepath = os.path.join('images', image_filename)
        shutil.copyfile(png_folder + '/' + image_filename, os.path.join(OUTPUTS_DIR, new_image_filepath))

In [120]:
# in progress

# make directories for dam_png_no_bbox and not_a_dam_no_bbox
#os.mkdir(os.path.join(OUTPUTS_DIR, 'dam_png_no_bbox'))
for f in os.listdir(DAM_IMAGE_DIR):
    if f.endswith('.png') and not f.endswith('_bb.png'):
        shutil.copyfile(os.path.join(DAM_IMAGE_DIR, f), 'dam_png_no_bbox/'+f[:-4])
#os.mkdir(os.path.join(OUTPUTS_DIR, 'not_a_dam_png_no_bbox'))
for f in os.listdir(NOT_A_DAM_IMAGE_DIR):
    if f.endswith('.png') and not f.endswith('_bb.png'):
        shutil.copyfile(os.path.join(DAM_IMAGE_DIR, f), 'not_a_dam_png_no_bbox'+f[:-4])

FileNotFoundError: [Errno 2] No such file or directory: '../../storage/yolov3-m/dam_images/1656-1206_not_a_dam.png'

In [25]:
### TODO: south africa set

# make the test/train/validation/south africa splits

# parameters
# holdout_set_portion = 0.20
validation_set_portion = 0.15
test_set_portion = 0.30

train_set = all_images_file_list.copy()

south_africa_set = []

# holdout_set = []
# for i in range(int(holdout_set_portion * all_images_original_length):
#     holdout_set.append(all_images_file_list_copy[i])
#     all_images_file_list_copy.pop(i)

validation_set = []
for i in range(int(validation_set_portion * len(all_images_file_list))):
    validation_set.append(train_set[i])
    train_set.pop(i)

test_set = []
for i in range(int(test_set_portion * len(all_images_file_list))):
    test_set.append(train_set[i])
    train_set.pop(i)

In [27]:
# test

print('len of validation set:')
print(len(validation_set))
print('len of test set:')
print(len(test_set))
print('len of training set:')
print(len(train_set))

len of validation set:
16
len of test set:
32
len of training set:
60


In [52]:
# make json file directory
#os.mkdir(os.path.join(OUTPUTS_DIR, 'json_file_directory'))

JSON_FILE_DIR = os.path.join(OUTPUTS_DIR, 'json_file_directory')
for f in os.listdir(DAM_IMAGE_DIR):
    if f.endswith('.json'):
        shutil.copyfile(os.path.join(DAM_IMAGE_DIR,f), os.path.join(JSON_FILE_DIR,f))
for f in os.listdir(NOT_A_DAM_IMAGE_DIR):
    if f.endswith('.json'):
        shutil.copyfile(os.path.join(NOT_A_DAM_IMAGE_DIR,f), os.path.join(JSON_FILE_DIR,f))

In [54]:
# test
print(os.listdir(JSON_FILE_DIR)[:10])

['119497_clipped.json', '119453_clipped.json', '119455_clipped.json', '119484_clipped.json', '1656-1195_not_a_dam.json', '119472_clipped.json', '119489_clipped.json', '119402_clipped.json', '119490_clipped.json', '119429_clipped.json']


In [114]:
def create_images_labels(png_folder, bbox_folder, classtype): 
    
    """ 
    
    Takes in folder of dam pngs, folder of bounding box json files
    Normalizes json data to darknet format (center x, center y, bbox width, bbox height)
    Creates new directories in darknet format
    
    """
    
    for image_filename in os.listdir(png_folder):
       
        # read in image
        img = mpimg.imread(os.path.join(png_folder,image_filename))

        # find width and height
        img_w = img.shape[0]
        img_h = img.shape[1]

        # find matching bounding box json file
        json_filename = image_filename[:-4] + '.json'
        json_filepath = os.path.join(bbox_folder, json_filename)
        if not os.path.exists(json_filepath):
            continue
            #raise NameError("can't find bbox for %s" % json_filename)
            
        # read json bounding box coordinates
        with open(json_filepath) as json_file: 
            data = json.load(json_file)
            coords = data['pixel_bounding_box']

        # normalize to x-center, y-center, width, and height of bbox
        avg_x = (coords[2] + coords[0]) / (2 * img_w)
        avg_y = (coords[1] + coords[3]) / (2 * img_h)
        nrm_w = (coords[2] - coords[0]) / img_w
        nrm_h = (coords[1] - coords[3]) / img_h
        nrm_xywh = np.array([avg_x, avg_y, nrm_w, nrm_h])

        # create new folder for labels
#         label_filename =
#         shutil.copyfile(os.path.join(OUTPUTS_DIR,'labels',image_filename[:-4],'.txt'))

        # write new label file and move to new folder
        if classtype == 'dam':
            label_str = '0 ' + str('%.6f'%nrm_xywh[0]) + ' ' + str('%.6f'%nrm_xywh[1]) + ' ' + str('%.6f'%nrm_xywh[2]) + ' ' + str('%.6f'%nrm_xywh[3])
        else:
            label_str = ''
        
        shutil.copyfile(json_filepath, '/labels' + json_filename[:-5] + '.txt')   
        file = open(os.path.join(OUTPUTS_DIR, 'labels') + '/' + json_filename[:-5] + '.txt', 'w')
        file.write(label_str)
        file.close()

        # move images to new folder
        new_image_filepath = os.path.join('images', image_filename)
        shutil.copyfile(png_folder + '/' + image_filename, os.path.join(OUTPUTS_DIR, new_image_filepath))

In [115]:
# create the image and label files
create_images_labels(DAM_IMAGE_DIR, JSON_FILE_DIR, 'dam')
create_images_labels(NOT_A_DAM_IMAGE_DIR, JSON_FILE_DIR, 'not_dam')

OSError: cannot identify image file '../../storage/yolov3-m/dam_images/119453_clipped.png.aux.xml'

In [5]:
# function to move files of certain type to other folders

def create_new_folder (src, dst, filetype):
    
    # create destination folder
    os.mkdir(dst)
    
    # move files of specified format to destination folder
    for filename in os.listdir(src):
        if filename.endswith(filetype):
            os.rename(src + '/' + filename, dst + '/' + filename)

In [None]:
# there are four types of files in these folders: xml, png, png w/ bboxes, and json
# darknet needs only the images (png) and the bbox labels (json)

# move png files (not _bb.png files) to new folder
create_new_folder('dam_images', 'dam_png', '.png') 
create_new_folder('dam_png', 'dam_bb_images', '_bb.png') 
create_new_folder('not_a_dam_images', 'not_a_dam_png', '.png') 
create_new_folder('not_a_dam_png', 'not_a_dam_bb_images', '_bb.png') 

# move json files to new folder
create_new_folder('dam_images', 'dam_bboxes', '.json')
create_new_folder('not_a_dam_images', 'not_a_dam_bboxes', '.json')

'filename'

In [7]:
def create_images_labels(png_folder, bbox_folder, classtype): 
    
    """ 
    Takes in folder of dam pngs, folder of bounding box json files
    Normalizes json data to darknet format (center x, center y, bbox width, bbox height)
    Creates new directories in darknet format
    """
    
    for image_filename in os.listdir(png_folder):
       
        # read in image
        img = mpimg.imread(os.path.join(png_folder,image_filename))

        # find width and height
        img_w = img.shape[0]
        img_h = img.shape[1]
        
        # find matching bounding box json file
        json_filepath = os.path.join(bbox_folder,image_filename,'.json')
        if not os.path.exists(json_filepath):
            raise NameError("can't find bbox for %s'" % image_filename)

        # read json bounding box coordinates
        with open(json_filepath) as json_file: 
            data = json.load(json_file)
            coords = data['pixel_bounding_box']

        # normalize to x-center, y-center, width, and height of bbox
        avg_x = (coords[2] + coords[0]) / (2 * img_w)
        avg_y = (coords[1] + coords[3]) / (2 * img_h)
        nrm_w = (coords[2] - coords[0]) / img_w
        nrm_h = (coords[1] - coords[3]) / img_h
        nrm_xywh = np.array([avg_x, avg_y, nrm_w, nrm_h])

        # create new folder for labels
        shutil.copyfile(os.path.join(OUTPUTS_DIR,'labels',image_filename[:-4],'.txt'))

        # write new label file and move to new folder
        if classtype == 'dam':
            label_str = '0 ' + str('%.6f'%nrm_xywh[0]) + ' ' + str('%.6f'%nrm_xywh[1]) + ' ' + str('%.6f'%nrm_xywh[2]) + ' ' + str('%.6f'%nrm_xywh[3])
        else:
            label_str = ''
        shutil.copyfile(bbox_folder + '/' + bbox, 'labels/' + bbox[:-5] + '.txt')   
        file = open('labels' + '/' + bbox[:-5] + '.txt', 'w')
        file.write(label_str)
        file.close()

        # move images to new folder
        shutil.copyfile(png_folder + '/' + image_filename, 'images/' + png)

In [7]:
# TODOS
# X delete unwanted directories
# - create test and train folders
# - split test and train data into two folders
# X create .txt files for test and train data file paths
# X~ create .names file
# X~ create .data file
# X~ download cfg file
# X download pre-trained weights

In [25]:
# delete unwanted directories
delete_folders = ['dam_bb_images', 'dam_bboxes', 'dam_images', 'dam_png', 'not_a_dam_bb_images', 'not_a_dam_bboxes',
                 'not_a_dam_images', 'not_a_dam_png']
for folder in delete_folders:
    shutil.rmtree(folder)

AttributeError: module 'shutil' has no attribute 'remove'

In [23]:
# create .txt files for test adn train data file paths

def create_txt_file (split):
    files = os.listdir('images/' + split)
    cwd = os.getcwd()
    txt = open(cwd + '/' + split + '_images.txt', 'w')
    for file in files:
        if file != '.ipynb_checkpoints':
            txt.write(cwd + '/images/' + split + '/' + file + '\n')
    txt.close()

In [24]:
create_txt_file('test')
create_txt_file('train')

### Maybe utils

In [None]:
# unzip files

not_dams = zf.ZipFile('not_a_dam_images.zip', 'r')
not_dams.extractall()
not_dams.close()

dams = zf.ZipFile('dam_images.zip', 'r')
dams.extractall()
dams.close()