# [Dependencies](https://spacenetchallenge.github.io/#Dependencies)
> The [AWS Command Line Interface (CLI)](https://aws.amazon.com/cli/) must be installed with an active AWS account. Configure the AWS CLI using `aws configure`.

# [Accessing the SpaceNet Data on AWS](https://aws.amazon.com/public-datasets/spacenet/#Accessing_the_SpaceNet_Data_on_AWS)
> The SpaceNet dataset is being released in several Areas of Interest. All AOIs will follow a similar directory structure and data format. The imagery is GeoTIFF satellite imagery and corresponding GeoJSON building footprints...

> For more detailed information on how to access specific files within the dataset, see [here](https://github.com/SpaceNetChallenge/utilities/tree/master/content/download_instructions).

> _The spacenet-dataset S3 bucket is provided as a Requester Pays bucket, see [here](https://docs.aws.amazon.com/AmazonS3/latest/dev/RequesterPaysBuckets.html) for more information._

# Programmatically Download Rio Imagery and Building Footprints with [Boto](https://boto3.readthedocs.io/en/latest/index.html), the AWS SDK for Python
Since the bucket is Requester Pays, we use an [S3Transfer](https://boto3.readthedocs.io/en/latest/reference/customizations/s3.html#boto3.s3.transfer.S3Transfer) with the 'RequestPayer' download argument.

In [1]:
print("Setting up paths for download")

import os

# Name of the S3 bucket that holds the SpaceNet data.
bucket = "spacenet-dataset"

# S3 object keys always use "/" as the separator, independent of the local
# operating system, so the key prefixes are joined with "/" explicitly rather
# than with os.path.join (which would produce "\\" on Windows).
aoi_path = "AOI_1_Rio"
aoi_data_path = "/".join([aoi_path, "srcData"])
building_labels_path = "/".join([aoi_data_path, "buildingLabels"])
mosaic_3band_path = "/".join([aoi_data_path, "mosaic_3band"])

# Local filesystem locations: raw downloads land in tmp_path, and
# spacenet_data_path is created here for use by later cells.
tmp_path = "/tmp"
spacenet_data_path = os.path.join(tmp_path, "spacenet-data")
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(spacenet_data_path, exist_ok=True)

Setting up paths for download


In [2]:
import time
import boto3
# Import the submodule explicitly instead of relying on it having been loaded
# as a side effect of creating a client.
from boto3.s3.transfer import S3Transfer
from osgeo import gdal

print("Starting download")
start = time.time()

# The bucket is Requester Pays, so every download must carry this argument.
requester_pays_args = {"RequestPayer": "requester"}

client = boto3.client("s3")
transfer = S3Transfer(client)

print("Downloading Rio imagery")

# A single list_objects_v2 call returns at most 1000 keys, so paginate to be
# sure the whole prefix is listed. Pages without "Contents" (empty prefix) are
# tolerated, and "directory" placeholder keys ending in "/" are skipped because
# they have no file name to download to.
mosaic_3band_pages = client.get_paginator("list_objects_v2").paginate(
    Bucket=bucket, Prefix=mosaic_3band_path,
    RequestPayer='requester')
mosaic_3band_key_list = [
    obj["Key"]
    for page in mosaic_3band_pages
    for obj in page.get("Contents", [])
    if not obj["Key"].endswith("/")
]

for mosaic_3band_key in mosaic_3band_key_list:
    print("Downloading %s" % mosaic_3band_key)
    mosaic_3band_name = mosaic_3band_key.split("/")[-1]
    mosaic_3band_filename = os.path.join(tmp_path, mosaic_3band_name)
    translated_mosaic_3band_filename = os.path.join(spacenet_data_path, mosaic_3band_name)

    # Skip the (paid) download when the raw file is already present locally.
    if not os.path.isfile(mosaic_3band_filename):
        transfer.download_file(
            bucket=bucket, key=mosaic_3band_key, filename=mosaic_3band_filename,
            extra_args=requester_pays_args)

    # Translate independently of the download step, so that a run interrupted
    # between download and translation is repaired on the next run.
    if not os.path.isfile(translated_mosaic_3band_filename):
        # Decompressor.scala: "compression type JPEG is not supported by this reader."
        # The downstream reader rejects JPEG-compressed GeoTIFFs, so rewrite
        # each image with LZW compression instead.
        print("Translating without JPEG compression to %s" % translated_mosaic_3band_filename)
        translated = gdal.Translate(
            destName=translated_mosiac_3band_filename if False else translated_mosaic_3band_filename,
            srcDS=mosaic_3band_filename,
            creationOptions=['COMPRESS=LZW']
        )
        # Without gdal.UseExceptions(), a failed translate returns None
        # instead of raising; surface that rather than continuing silently.
        if translated is None:
            raise RuntimeError(
                "gdal.Translate failed for %s" % mosaic_3band_filename)
        # Drop the dataset reference so GDAL flushes and closes the output file.
        translated = None

print("Downloading Rio outline")
outline_name = "Rio_OUTLINE_Public_AOI.geojson"
# S3 keys are always "/"-separated, so do not use os.path.join for them.
outline_key = "/".join([building_labels_path, outline_name])
outline_filename = os.path.join(tmp_path, outline_name)

print("Downloading %s" % outline_key)
transfer.download_file(
    bucket=bucket, key=outline_key, filename=outline_filename,
    extra_args=requester_pays_args)

print("Downloading Rio building footprints")
buildings_name = "Rio_Buildings_Public_AOI_v2.geojson"
buildings_key = "/".join([building_labels_path, buildings_name])
# Use tmp_path (not a hard-coded "/tmp") so all downloads follow one setting.
buildings_filename = os.path.join(tmp_path, buildings_name)

print("Downloading %s" % buildings_key)
transfer.download_file(
    bucket=bucket, key=buildings_key, filename=buildings_filename,
    extra_args=requester_pays_args)

end = time.time()
seconds = end - start
minutes = seconds/60
# %d truncates, so the elapsed time is reported in whole minutes.
print("Finishing download after %d minutes" % minutes)

Starting download
Downloading Rio imagery
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223103.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223112.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223113.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223121.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223123.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223130.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223131.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223132.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223133.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223301.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223310.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022223311.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022232002.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022232003.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022232020.tif
Downloading AOI_1_Rio/srcData/mosaic_3band/013022232021.ti

In [3]:
print("Reading geojson of Rio outline to vector data")

from geonotebook.wrappers import VectorData

# Wrap the downloaded outline GeoJSON so it can be queried and drawn.
outline_vector = VectorData(outline_filename)

print("Centering map at centroid of Rio outline vector")
# Materialise the polygons and use the first one's centroid as the map centre.
outline_polygons = list(outline_vector.polygons)
outline_polygon = outline_polygons[0]
outline_centroid = outline_polygon.centroid
x, y = outline_centroid.x, outline_centroid.y
z = 11  # third argument to set_center; presumably the zoom level — TODO confirm
# NOTE(review): `M` is not defined in this notebook; presumably the map object
# supplied by the geonotebook kernel — confirm.
M.set_center(x, y, z)

print("Adding layer of Rio outline vector")
M.add_layer(outline_vector, name=outline_name);

Reading geojson of Rio outline to vector data
Centering map at centroid of Rio outline vector
Adding layer of Rio outline vector


In [4]:
# Building-footprint layer, currently disabled (reason not recorded here —
# TODO confirm before re-enabling). The vector assigned on the first line is
# the one that must be passed to add_layer on the second.
# buildings_vector = VectorData(buildings_filename)
# M.add_layer(buildings_vector, name=buildings_name);

In [4]:
print("Setting up Spark context for ingest of Rio imagery")
from pyspark import SparkContext
import geopyspark as gps

# Build the Spark configuration through geopyspark's helper, passing the
# master URL and application name positionally.
conf = gps.geopyspark_conf("local[*]", "spacenet-ingest")

# getOrCreate reuses an already-running context, so re-running this cell
# does not try to start a second one.
sc = SparkContext.getOrCreate(conf=conf)

Setting up Spark context for ingest of Rio imagery


In [None]:
print("Ingesting Rio imagery")
# Ingest takes X minutes.
# TODO: replace the "X" above with the measured duration once this cell has
# been run successfully.
import time
start = time.time()

from geopyspark.geotrellis.geotiff import get
from geopyspark.geotrellis.constants import LayerType
from geopyspark.geotrellis.catalog import write

# Read the GeoTiff locally
# This is the same directory the download cell writes the LZW-translated
# images to (spacenet_data_path), hard-coded here as a string.
catalog_uri = "/tmp/spacenet-data/"
rdd = get(LayerType.SPATIAL, catalog_uri)
# Error: https://github.com/locationtech/geotrellis/issues/2268
# NOTE(review): it is not clear from here whether the error above is raised by
# the read or by the metadata collection below — confirm before relying on
# this cell.
metadata = rdd.collect_metadata()

# tile the rdd to the layout defined in the metadata
laid_out = rdd.tile_to_layout(metadata)

# reproject the tiled rasters using a ZoomedLayoutScheme
# NOTE(review): no layout scheme argument is passed in this call, so the
# statement above depends on the library default — confirm.
# NOTE(review): cache() is applied before repartition(200), so the object
# bound to `reprojected` is the repartitioned result, not the cached one —
# confirm this ordering is intended.
reprojected = laid_out.reproject("EPSG:3857").cache().repartition(200)

# pyramid the TiledRasterRDD to create 12 new TiledRasterRDDs
# one for each zoom level
pyramided = reprojected.pyramid(start_zoom=12, end_zoom=1)

# Save each TiledRasterRDD locally
# Every level is written to the same catalog URI under the same layer name.
for tiled in pyramided:
    write("file:///tmp/spacenet-catalog", "spacenet-ingest", tiled)

end = time.time()
ingest_time = end - start
# Elapsed seconds are truncated to an int before dividing, and %d truncates
# again, so the report is in whole minutes.
minutes = int(ingest_time)/60
print("Ingest time: %d minutes" % minutes)