/
package.py
91 lines (79 loc) · 3.4 KB
/
package.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# pylint: disable=too-many-locals, unused-argument, too-many-arguments
"""Generate an .npz file containing arrays for training machine learning algorithms"""
from os import path as op
from urllib.parse import urlparse
import numpy as np
from PIL import Image
def package_directory(dest_folder, classes, imagery, ml_type, seed=False, train_size=0.8, **kwargs):
"""Generate an .npz file containing arrays for training machine learning algorithms
Parameters
------------
dest_folder: str
Folder to save labels, tiles, and final numpy arrays into
classes: list
A list of classes for machine learning training. Each class is defined as a dict
with two required properties:
- name: class name
- filter: A Mapbox GL Filter.
See the README for more details
imagery: str
Imagery template to download satellite images from.
Ex: http://a.tiles.mapbox.com/v4/mapbox.satellite/{z}/{x}/{y}.jpg?access_token=ACCESS_TOKEN
ml_type: str
Defines the type of machine learning. One of "classification", "object-detection", or "segmentation"
train_percent: float
Portion of the data to use in training, the remainder is used as test data (default 0.8)
**kwargs: dict
Other properties from CLI config passed as keywords to other utility functions
"""
# if a seed is given, use it
if seed:
np.random.seed(seed)
# open labels file, create tile array
labels_file = op.join(dest_folder, 'labels.npz')
labels = np.load(labels_file)
tile_names = [tile for tile in labels.files]
tile_names.sort()
tiles = np.array(tile_names)
np.random.shuffle(tiles)
# find maximum number of features in advance so numpy shapes match
if ml_type == 'object-detection':
max_features = 0
for tile in tiles.files:
features = len(tiles[tile][0])
if features > max_features:
max_features = features
x_vals = []
y_vals = []
# open the images and load those plus the labels into the final arrays
o = urlparse(imagery)
_, image_format = op.splitext(o.path)
for tile in tiles:
image_file = op.join(dest_folder, 'tiles', '{}{}'.format(tile, image_format))
try:
img = Image.open(image_file)
except FileNotFoundError:
print('Couldn\'t open {}, skipping'.format(image_file))
continue
np_image = np.array(img)
img.close()
x_vals.append(np_image)
if ml_type == 'classification':
y_vals.append(labels[tile])
elif ml_type == 'object-detection':
# zero pad object-detection arrays
cl = labels[tile]
y_vals.append(np.concatenate((cl, np.zeros((max_features - len(cl), 5)))))
elif ml_type == 'segmentation':
y_vals.append(labels[tile][..., np.newaxis]) # Add grayscale channel
# split into train and test
split_index = int(len(x_vals) * train_size)
# convert lists to numpy arrays
x_vals = np.array(x_vals, dtype=np.uint8)
y_vals = np.array(y_vals, dtype=np.uint8)
print('Saving packaged file to {}'.format(op.join(dest_folder, 'data.npz')))
np.savez(op.join(dest_folder, 'data.npz'),
x_train=x_vals[:split_index, ...],
y_train=y_vals[:split_index, ...],
x_test=x_vals[split_index:, ...],
y_test=y_vals[split_index:, ...])