Installing libraries

In [1]:
!pip install segments-ai
!pip install boltons

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
from segments import SegmentsClient, SegmentsDataset
from segments.utils import export_dataset
import os
import shutil
import pytz
import datetime
import requests
import random
import json
import pycocotools.mask as mask
import cv2
from boltons.iterutils import remap

Methods

In [2]:
"""Loading release file as dictionary, rename filename, filter and resave"""
def cleanup_json(directory, filename):
    release=os.path.join(directory, filename)
    f = open(release)
    data = json.load(f)
    f.close()
    for item in data["dataset"]["samples"]: #Rename filename in json (error in Amazon S3 bucket)
      item["name"] = item["name"].replace('all/', '')
    data = [d for d in data["dataset"]["samples"] if d["labels"]["ground-truth"] != None] # filter unlabeled images
    data = [d for d in data if d["labels"]["ground-truth"]["label_status"] != "SKIPPED"] # filter skipped images
    data = [d for d in data if d["labels"]["ground-truth"]["attributes"]["annotations"] != []] # filter images without annotations (pv nor thermal)
    with open(os.path.join(directory, "solar_panels_filt.json"),'w') as f: #save as *_filt.json
      json.dump(data, f)

In [102]:
"""Loading release file as dictionary, rename filename, light filter and resave for exploratory data analysis"""
def cleanup_json_eda(directory, filename):
    release=os.path.join(directory, filename)
    f = open(release)
    data = json.load(f)
    f.close()
    for item in data["dataset"]["samples"]: #Rename filename in json (error in Amazon S3 bucket)
      item["name"] = item["name"].replace('all/', '')
    data = [d for d in data["dataset"]["samples"] if d["labels"]["ground-truth"] != None] # filter unlabeled images
    data = [d for d in data if d["labels"]["ground-truth"]["label_status"] != "SKIPPED"] # filter skipped images
    with open(os.path.join(directory, "solar_panels_eda.json"),'w') as f: #save as *_filt.json
      json.dump(data, f)

In [3]:
def polygonFromMask(maskedArr):
  # adapted from https://github.com/hazirbas/coco-json-converter/blob/master/generate_coco_json.py
  contours, _ = cv2.findContours(maskedArr, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
  segmentation = []
  valid_poly = 0
  for contour in contours:
  # Valid polygons have >= 6 coordinates (3 points)
     if contour.size >= 6:
        segmentation.append(contour.astype(float).flatten().tolist())
        valid_poly += 1
  if valid_poly == 0:
     raise ValueError
  return segmentation

In [4]:
def cleanup(directory, filename):
    path=os.path.join(directory, filename)
    json_file = open(path)
    coco_json = json.load(json_file)
    json_file.close()
    for i in coco_json["categories"]: # change 0 to 1 (pv), change 1 to 2 (thermal) => 0 belongs to background class
        if i["id"] == 1:
            i["id"] = 2
        elif i["id"] == 0:
            i["id"] = 1
    for j in coco_json["annotations"]: # change 0 to 1 (pv), change 1 to 2 (thermal) => 0 belongs to background class
        if j["category_id"] == 1:
            j["category_id"] = 2
        elif j["category_id"] == 0:
            j["category_id"] = 1
    for l in coco_json["images"]: # change png to tif
        l["file_name"] = l["file_name"].replace(".png", ".tif")
    bad_keys = {'size'}  #delete size
    for m in coco_json["annotations"]: # decode mask
        maskedArr = mask.decode(m["segmentation"])
        m["segmentation"]["counts"] = polygonFromMask(maskedArr)
        area = float((maskedArr > 0.0).sum())
        m["area"] = area
    drop_keys = lambda paths, key, value: key not in bad_keys
    coco_json = remap(coco_json, visit=drop_keys)
    with open(filename, "w") as fp:
        json.dump(coco_json, fp)

In [5]:
def log(msg='...', path=''):
    tz_ZH = pytz.timezone('Europe/Zurich') 
    now = datetime.datetime.now(tz_ZH)
    now_string = now.strftime("%H:%M:%S")
    print('log: {} {:<20s} {:>45}'.format(now_string, msg, path))

In [6]:
def download_file_from_google_drive(id, destination):
    # source https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url/60132855#60132855
    def get_confirm_token(response):
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        CHUNK_SIZE = 32768

        with open(destination, "wb") as f:
             for chunk in response.iter_content(CHUNK_SIZE):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)

    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)

    if token:
        params = { 'id' : id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination) 

In [7]:
parent_dir='data'
if not os.path.exists(parent_dir):    
    log('creating directory:',parent_dir)
    os.makedirs(parent_dir)
    
directories=['01_downloads']
for dir in directories: 
    path = os.path.join(parent_dir, dir) 
    if not os.path.exists(path):    
        os.makedirs(path) 
        log('creating directory:',path)

Downloading released labeling data from Google Drive

In [8]:
downloading = {"solar_panels-v0.5_copy": "15cuEnval79LIX0K9wadmbG-yevGjNgKu"} # file name : file id
download_dir = os.path.join(parent_dir, directories[0]) # path to directory "01_downloads"

for file_name, file_id in downloading.items():
  path=os.path.join(download_dir, file_name+".json")
  log("downloading:", path)
  download_file_from_google_drive(file_id, path) # download and save

log: 11:44:04 downloading:         data\01_downloads\solar_panels-v0.5_copy.json


Cleanup JSON

In [9]:
path = "C:/Users/Thomas/PycharmProjects/pythonProject/data/01_downloads/"  # path to loaded json
file = "solar_panels-v0.5_copy.json" #json filename
cleanup_json(path, file) #make improvements for further preparations, save "solar_panels_filt.json"
#cleanup_json_eda(path, file) #make improvements for exploratory data analysis, save "solar_panels_eda.json"

=> Unfortunately manual work:
+ Adding filtered keys again according "solar_panels-v0.5_copy.json"
+ resave json in same destination path

Reopen *_filt.json

In [10]:
# Initialize a SegmentsDataset from the release file
client = SegmentsClient(api_key='0e37cf5cca4fe748d5277b9893d3a95a08f2bd31')
release = "C:/Users/Thomas/PycharmProjects/pythonProject/data/01_downloads/solar_panels_filt.json" # eventually adjusting path
dataset = SegmentsDataset(release, labelset='ground-truth', filter_by=['labeled'])

Initialized successfully.
Initializing dataset...
Preloading all samples. This may take a while...


100%|██████████| 2179/2179 [00:03<00:00, 569.65it/s]

Initialized dataset with 2179 images.





<h4>Splitting<h4>

Generating list of names of labeled images

In [11]:
imageNames = []
for sample in dataset:
  imageNames.append(sample["name"])

Shuffling name list

In [12]:
imageNames.sort() # make sure that the filenames have a fixed order before shuffling
random.seed(230) # make it reproducible
random.shuffle(imageNames) # shuffles the ordering of filenames (deterministic given the chosen seed)

Splitting into train, validation, test set

In [13]:
split_1 = int(0.5 * len(imageNames)) # 50 %
split_2 = int(0.75 * len(imageNames)) # 25 % each
train_filenames = imageNames[:split_1]
val_filenames = imageNames[split_1:split_2]
test_filenames = imageNames[split_2:]

Change format name ".png" to ".tif"

In [14]:
train_filenames_tif = [item.replace(".png", ".tif") for item in train_filenames]
val_filenames_tif = [item.replace(".png", ".tif") for item in val_filenames]
test_filenames_tif = [item.replace(".png", ".tif") for item in test_filenames]

Copying labeled images (.tif) according to list to a new folder (.tif)

In [16]:
src_dir = "D:/Model/all_labeled_uint8_hm" # path to current files
dst_dir = "D:/Model/2_Histogram-Matching2/train/" # new file path
for imageName in train_filenames_tif:
    shutil.copy(os.path.join(src_dir, imageName), dst_dir)

In [17]:
src_dir = "D:/Model/all_labeled_uint8_hm" # path to current files
dst_dir = "D:/Model/2_Histogram-Matching2/val/" # new file path
for imageName in val_filenames_tif:
    shutil.copy(os.path.join(src_dir, imageName), dst_dir)

In [18]:
src_dir = "D:/Model/all_labeled_uint8_hm" # path to current files
dst_dir = "D:/Model/2_Histogram-Matching2/test/" # new file path
for imageName in test_filenames_tif:
    shutil.copy(os.path.join(src_dir, imageName), dst_dir)

Comparing dictionary with each set

In [19]:
f = open(release, "r")
data = json.loads(f.read())

In [20]:
train = [d for d in data["dataset"]["samples"] if d["name"] in train_filenames]
validation = [d for d in data["dataset"]["samples"] if d["name"] in val_filenames]
test = [d for d in data["dataset"]["samples"] if d["name"] in test_filenames]

resave json file

In [21]:
with open("train.json", "w") as fp:
  json.dump(train, fp)

In [22]:
with open("val.json", "w") as fp:
  json.dump(validation, fp)

In [23]:
with open("test.json", "w") as fp:
  json.dump(test, fp)

=> Unfortunately manual work:
+ Adding filtered keys again according "solar_panels-v0.5_filt.json"
+ resave json

<h4>Exporting labels to COCO instance segmentation format and save as *.json<h4>

In [3]:
dataset_train = SegmentsDataset("C:/Users/Thomas/PycharmProjects/pythonProject/data/01_downloads/solar_panels_filt.json", labelset='ground-truth', filter_by=['labeled'])
export_dataset(dataset_train, export_format='coco-instance')

Initializing dataset...
Preloading all samples. This may take a while...


100%|██████████| 2179/2179 [00:03<00:00, 673.67it/s]


Initialized dataset with 2179 images.
Exporting dataset. This may take a while...


100%|██████████| 2179/2179 [00:30<00:00, 70.83it/s]


Exported to .\export_coco-instance_charlie4611-_solar_panels_v0.5.4.json. Images and labels in segments\charlie4611-_solar_panels\v0.5.4


('.\\export_coco-instance_charlie4611-_solar_panels_v0.5.4.json',
 'segments\\charlie4611-_solar_panels\\v0.5.4')

In [30]:
dataset_validation = SegmentsDataset("C:/Users/Thomas/PycharmProjects/pythonProject/val.json", labelset='ground-truth', filter_by=['labeled'])
export_dataset(dataset_validation, export_format='coco-instance')

Initializing dataset...
Preloading all samples. This may take a while...


100%|██████████| 545/545 [00:00<00:00, 1054.28it/s]


Initialized dataset with 545 images.
Exporting dataset. This may take a while...


100%|██████████| 545/545 [00:07<00:00, 73.03it/s]

Exported to .\export_coco-instance_charlie4611-_solar_panels_v0.5.4.json. Images and labels in segments\charlie4611-_solar_panels\v0.5.4





('.\\export_coco-instance_charlie4611-_solar_panels_v0.5.4.json',
 'segments\\charlie4611-_solar_panels\\v0.5.4')

In [29]:
dataset_test = SegmentsDataset("C:/Users/Thomas/PycharmProjects/pythonProject/test.json", labelset='ground-truth', filter_by=['labeled'])
export_dataset(dataset_test, export_format='coco-instance')

Initializing dataset...
Preloading all samples. This may take a while...


100%|██████████| 545/545 [00:00<00:00, 1023.29it/s]


Initialized dataset with 545 images.
Exporting dataset. This may take a while...


100%|██████████| 545/545 [00:07<00:00, 71.38it/s]


Exported to .\export_coco-instance_charlie4611-_solar_panels_v0.5.4.json. Images and labels in segments\charlie4611-_solar_panels\v0.5.4


('.\\export_coco-instance_charlie4611-_solar_panels_v0.5.4.json',
 'segments\\charlie4611-_solar_panels\\v0.5.4')

<h4>Some changes<h4>


for details see cleanup-function above

In [31]:
directory = "C:/Users/Thomas/PycharmProjects/pythonProject/" # do change
cleanup(directory, "export_coco-instance_train.json")

In [32]:
directory = "C:/Users/Thomas/PycharmProjects/pythonProject/" # do change
cleanup(directory, "export_coco-instance_val.json")

In [33]:
directory = "C:/Users/Thomas/PycharmProjects/pythonProject/" # do change
cleanup(directory, "export_coco-instance_test.json")