In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings. Any name below that matches a key of the config's
# 'preprocess_settings' is copied into the Config object once it is created.
label_date_recent = "09_26_24.json"  # TODO automate this
datasets_to_split = ["mars_hirise"]

# Slice dimensions; unpacked later as (w, h).
slice_size = [500, 500]

# Toggles for the individual processing stages further down.
grayscale, slice_images = False, True
split_dataset, split_json = True, True

In [3]:
!python -m pip install split-folders
!pip install ujson



In [4]:
import pprint
import os
import splitfolders
from google.colab import drive

In [5]:
# Mount Google Drive; the image files and label JSONs are read from there.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
# Change working directory to drive so that everything saves to drive.
# NOTE(review): the project-local imports in the following cell (config_utils,
# data_processing) presumably resolve from this directory — confirm before
# changing the path.
%cd '/content/gdrive/MyDrive/__Shared/Andrew/working_directory'
%pwd

/content/gdrive/.shortcut-targets-by-id/1Bhg_-iEsxa3Aq4v-SLxPVOpKrY5TpKaq/Andrew/working_directory


'/content/gdrive/.shortcut-targets-by-id/1Bhg_-iEsxa3Aq4v-SLxPVOpKrY5TpKaq/Andrew/working_directory'

In [7]:
from config_utils import Config
from data_processing.data_load import get_anno_path, split_json_train_test_val
from data_processing.data_preprocess import slice_batch, grayscale_image

In [8]:
c = Config()

# Override every preprocess setting for which a notebook-level variable of the
# same name exists (e.g. slice_size), then resolve the paths from the config.
for setting in list(c.cfg['preprocess_settings'].keys()):
  if setting not in globals():
    continue
  c.cfg['preprocess_settings'][setting] = globals()[setting]

c.get_paths()

In [9]:
# Print the preprocessing and split path dictionaries held by the config so
# they can be sanity-checked before any files are written.
pprint.pp(c.preproc_paths)
pprint.pp(c.split_paths)

{'white_sands_heli': {'top_dir': 'output/data/white_sands_heli/preprocess__slice_images_grayscale',
                      'images_dir': 'output/data/white_sands_heli/preprocess__slice_images_grayscale/images',
                      'labels': 'output/data/white_sands_heli/preprocess__slice_images_grayscale/labels.json'},
 'white_sands_sat': {'top_dir': 'output/data/white_sands_sat/preprocess__slice_images_grayscale',
                     'images_dir': 'output/data/white_sands_sat/preprocess__slice_images_grayscale/images',
                     'labels': 'output/data/white_sands_sat/preprocess__slice_images_grayscale/labels.json'},
 'mars_hirise': {'top_dir': 'output/data/mars_hirise/preprocess__slice_images_grayscale',
                 'images_dir': 'output/data/mars_hirise/preprocess__slice_images_grayscale/images',
                 'labels': 'output/data/mars_hirise/preprocess__slice_images_grayscale/labels.json'}}
{'white_sands_heli': {'top_dir': 'output/data/white_sands_heli',
     

## Dataset Processing

### Grayscale

In [10]:
# Do not need to rerun - we saved these images to the drive
# 4 minutes to convert to grayscale
#
# `gs` tells the slicing step whether to read its input from the grayscale
# output directory. It follows the configured preprocess steps rather than
# being hard-coded, so it is correct both when the conversion is skipped
# (images already saved) and when grayscale is not part of the pipeline.
gs = 'grayscale' in c.cfg['preprocess_steps']

# Only redo the conversion when explicitly requested via the `grayscale` flag.
if grayscale and gs:
  for dataset in datasets_to_split:
    img_dir = c.raw_paths[dataset]['images_dir']
    out_dir = os.path.join(c.preproc_paths[dataset]['top_dir'], 'grayscale')
    for im in os.listdir(img_dir):
      grayscale_image(img_dir, out_dir, im)

### Slicing

In [11]:
# Slicing takes about 14 minutes and its results are already saved to the
# drive, so this only needs rerunning when the slice dimensions change.
# Chat with Gab before rerunning at a different size.

if ((c.cfg['preprocess_settings']['slice_size'] != [500,500] or slice_images)
    and 'slice_images' in c.cfg['preprocess_steps']):
  w, h = c.cfg['preprocess_settings']['slice_size']
  for dataset in datasets_to_split:
    # Read from the grayscale output when `gs` is set, otherwise from the raw images.
    img_dir = (os.path.join(c.preproc_paths[dataset]['top_dir'], 'grayscale')
               if gs else c.raw_paths[dataset]['images_dir'])
    print(img_dir)
    anno_path = get_anno_path(c.raw_paths[dataset]['labels_dir'], label_date_recent)
    out_dir = c.preproc_paths[dataset]['top_dir']

    # Slice the images and relabel the annotation JSON to match.
    # save_images is currently off; switch it to True to write the slices.
    sliced_anno = slice_batch(
        img_dir,
        anno_path,
        (w, h),
        out_dir,
        batch_size=None,
        save_images=False,
        save_dict=False,
    )

    # dummy_dir = os.path.join(c.preproc_paths[dataset]['top_dir'], 'dummy')
    # if (len(os.listdir(out_dir))==1) and (not os.path.exists(dummy_dir)):
    #     os.makedirs(dummy_dir)

output/data/mars_hirise/preprocess__slice_images_grayscale/grayscale
images to slice: 11
image number: 1
slice index: 0 of 64
slice index: 50 of 64
image number: 2
slice index: 0 of 64
slice index: 50 of 64
image number: 3
slice index: 0 of 64
slice index: 50 of 64
image number: 4
slice index: 0 of 64
slice index: 50 of 64
image number: 5
slice index: 0 of 64
slice index: 50 of 64
image number: 6
slice index: 0 of 64
slice index: 50 of 64
image number: 7
slice index: 0 of 64
slice index: 50 of 64
image number: 8
slice index: 0 of 64
slice index: 50 of 64
image number: 9
slice index: 0 of 56
slice index: 50 of 56
image number: 10
slice index: 0 of 62
slice index: 50 of 62
image number: 11
slice index: 0 of 64
slice index: 50 of 64
length images: 694
saving json


### Dataset Splitting

In [12]:
# Distribute each dataset's preprocessed image folders over 'train', 'val'
# and 'test' output folders. The seed is fixed so the split is repeatable.
if c.cfg['preprocess_settings']['slice_size'] != [500,500] or split_dataset:
  for dataset in datasets_to_split:
    splitfolders.ratio(
        c.preproc_paths[dataset]['top_dir'],
        output=c.split_paths[dataset]['top_dir'],
        seed=1420,
        ratio=(0.7, 0.2, 0.1),  # train, val, test
    )

Copying files: 705 files [00:19, 35.60 files/s]


In [13]:
# Split json labels
# For every dataset being split, divide its annotation JSON into train/val/test
# label files that correspond to the already-split image directories.
# The loop is explicit so this cell does not depend on a `dataset` variable
# left over from an earlier cell.
if split_json:
  for dataset in datasets_to_split:
    # Use the preprocessed labels when any preprocess step is configured;
    # otherwise fall back to the most recent raw annotation file.
    if len(c.cfg['preprocess_steps']) > 0:
      anno_path = c.preproc_paths[dataset]['labels']
    else:
      anno_path = get_anno_path(c.raw_paths[dataset]['labels_dir'], label_date_recent)

    split_json_train_test_val(json_filepath = anno_path,
                              image_dirs = [c.split_paths[dataset]['train_images_dir'],
                                            c.split_paths[dataset]['val_images_dir'],
                                            c.split_paths[dataset]['test_images_dir']],
                              anno_dirs = [c.split_paths[dataset]['train_labels'],
                                           c.split_paths[dataset]['val_labels'],
                                           c.split_paths[dataset]['test_labels']])

In [14]:
# Check that the train/val/test image directories exist for every dataset in
# the config. The assertion message names the dataset and the missing path so
# a failure can be acted on directly.
for dataset in c.cfg['datasets']:
  for split_key in ('train_images_dir', 'val_images_dir', 'test_images_dir'):
    split_path = c.split_paths[dataset][split_key]
    assert os.path.exists(split_path), f"{dataset}: {split_key} not found at {split_path}"

In [15]:
# This cell and the one below are meant to visually spot-check the split outputs.
# They do not work yet, so they are left commented out - ignore for now.
# import random
# import cv2
# from matplotlib import pyplot as plt
# from google.colab.patches import cv2_imshow
#
# train_dataset_dicts = DatasetCatalog.get(train_datasets[1])
# train_metadata = MetadataCatalog.get(train_datasets[1])

In [16]:
# for d in random.sample(train_dataset_dicts, 6):
#     img = cv2.imread(d["file_name"])
#     visualizer = Visualizer(img[:, :, ::-1], metadata=train_metadata, scale=0.5)
#     vis = visualizer.draw_dataset_dict(d)
#     plt.imshow(vis.get_image()[:, :, ::-1])
#     plt.show()

In [18]:
# Inference images (not annotated) only need the grayscale conversion;
# they are neither sliced nor split.
input_images_directory = '/content/gdrive/MyDrive/__Shared/Andrew/working_directory/data_sources/mars_hirise/not annotated/images'
output_dir = '/content/gdrive/MyDrive/__Shared/Andrew/working_directory/output/data/mars_hirise/not_annotated/grayscale'

img_dir, out_dir = input_images_directory, output_dir
for im in os.listdir(img_dir):
  grayscale_image(img_dir, out_dir, im)