In [12]:
import os
import shutil
from pathlib import Path
import requests

import boto3
from botocore.config import Config

import asyncio
import aiohttp
import aiofiles

#import nest_asyncio 
#nest_asyncio.apply()

import zipfile
import py7zr
import multivolumefile

import uuid
import skimage

import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

In [13]:
# Root directory that holds the datasets; DATA / "inria" is used as the
# InriaETL root further down.
# NOTE(review): both paths are absolute and machine-specific; point DATA at a
# local directory before running this notebook on another machine.
#DATA = Path("/media/sambhav/30AC4696AC46568E/datasets/urban-feature-extraction")
DATA = Path("C:/Users/hp/Desktop/datasets/urban-feature-extraction")

In [14]:
class DatasetDownloader:
    """Concurrently download a set of files into one local directory.

    downloads : directory the files are written to (created if missing)
    urls      : dictionary such that urls[filename:str] = url:str
    """

    def __init__(self, downloads:Path, urls:dict):

        # init required directories
        if not (downloads.exists() and downloads.is_dir()):
            downloads.mkdir(exist_ok=True, parents=True)
            print(f"download directory at {downloads}")
        self.download_dir = downloads

        # src_urls is a dictionary such that
        # src_urls[filename:str]  = url:str
        self.src_urls = urls

    async def download_one_file(self, session, url:str, file_path:Path):
        """
        Download one file from url and save to disk at file_path\n
        session : an open aiohttp.ClientSession
        Raises aiohttp.ClientResponseError if the server answers with a 4xx/5xx status
        """
        #TODO: How to use tqdm for each coroutine in the notebook
        # NOTE(review): ssl=False turns off TLS certificate verification for this
        # request; confirm this is really required for the source host.
        async with session.get(url, ssl = False) as r:
            # Fail before touching the disk, otherwise the body of an HTTP error
            # page would be saved as if it were the requested file
            r.raise_for_status()
            #total_size = int(r.headers.get('content-length', 0))
            async with aiofiles.open(file_path, "wb") as f:
                #progress_bar = tqdm(total=total_size, unit="B", unit_scale=True, desc="Downloading")
                async for chunk in r.content.iter_any():
                    await f.write(chunk)
                    #progress_bar.update(len(chunk))

    async def download_files(self) -> None:
        """
        Download every file listed in self.src_urls into self.download_dir\n
        All downloads run concurrently inside one session. Files that already
        exist are NOT skipped: each target is opened in "wb" mode and overwritten.
        """
        #TODO: skip files that are already downloaded
        # total = None disables the overall per-request timeout (large archives)
        timeout = aiohttp.ClientTimeout(total = None)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            coroutines = [
                self.download_one_file(session, url, self.download_dir / file_name)
                for file_name, url in self.src_urls.items()
            ]
            await tqdm_asyncio.gather(*coroutines)

    def validate_download(self, downloaded_file_sizes: "dict | None" = None) -> None:
        """
        Placeholder for validating downloaded files, currently does nothing\n
        downloaded_file_sizes : optional dictionary of filename -> expected size;
        the default lets callers that pass no argument keep working
        """
        #TODO: Implement Validate Downloads
        pass

In [15]:
class DatasetExtractor:
    """Stateless helpers for unpacking and checking dataset archives."""

    @staticmethod
    def _is_under_dir(member_name: str, dir_name: str) -> bool:
        """Return True if archive member member_name lies inside a directory named dir_name"""
        wanted = [part for part in dir_name.replace("\\", "/").split("/") if part]
        if not wanted:
            # An empty name denotes the archive root, which contains everything
            return True
        # Zip member names use "/" as separator. The last component is the file
        # name (or "" for a directory entry), so only the parents are compared.
        parents = member_name.split("/")[:-1]
        span = len(wanted)
        return any(parents[i:i + span] == wanted for i in range(len(parents) - span + 1))

    @staticmethod
    def extract_zip_archive(zip_file_path:Path, target_dir:Path, dirs_to_be_extracted = None):
        """
        Extract specified contents from zip archive, extract all contents if not specified\n
        zip_file_path : zip archive to read
        target_dir : directory the members are extracted into
        dirs_to_be_extracted : directory names (e.g. "train" or "data/train"); a member
        is extracted when one of its parent directories matches, compared per path
        component so that "train" does not match a file such as "test/train_1.tif"
        """
        if isinstance(dirs_to_be_extracted, str):
            # A bare string would otherwise be iterated character by character
            dirs_to_be_extracted = [dirs_to_be_extracted]

        with zipfile.ZipFile(zip_file_path, 'r') as archive:
            #If dirs_to_be_extracted is empty / None, extract entire archive and exit
            if not dirs_to_be_extracted:
                archive.extractall(target_dir)
                return
            #Otherwise, extract each member exactly once if it's under any specified dir
            for member in archive.infolist():
                if any(DatasetExtractor._is_under_dir(member.filename, dir_name)
                       for dir_name in dirs_to_be_extracted):
                    #TODO: Add tqdm progress bar for extraction
                    archive.extract(member, target_dir)

    @staticmethod
    def extract_multivolume_archive(multivolume_file_path:Path, target_dir:Path) -> None:
        """
        Extract all contents of a multivolume 7zip archive\n
        multivolume_file_path : path handed to multivolumefile.open
        (presumably the volume name without the .001/.002/... suffix, verify against caller)
        target_dir : directory the contents are extracted into
        """

        with multivolumefile.open(multivolume_file_path, mode = 'rb') as multi_archive:
            with py7zr.SevenZipFile(multi_archive, 'r') as archive: # type: ignore
                archive.extractall(path = target_dir)

    @staticmethod
    def validate_extraction(val_dir:Path, val_files: list):
        """Check if val_dir contains all files listed under val_files, return list of missing files"""
        val_dir = Path(val_dir)
        return [file_name for file_name in val_files if not (val_dir / file_name).is_file()]

In [16]:
class CropUtil():
    """Helpers for reading scenes, padding them and cutting them into square patches."""

    def _read_image(self, path):
        """Read the image at path with skimage.io.imread"""
        return skimage.io.imread(path)

    def _read_mask(self, path):
        """Read the mask at path with skimage.io.imread"""
        return skimage.io.imread(path)

    def _get_pad_amount(self, dimension: int, window: int):
        """
        Calculate the no of pixels to add before and after dimension s.t. it
        becomes a multiple of window\n
        Returns (before, after); the extra pixel of an odd total goes to after.
        A dimension that already is a multiple of window gets (0, 0).
        """
        if window <= 0:
            raise ValueError(f"window must be a positive integer, got {window}")

        # (-dimension) % window is 0 for exact multiples, whereas
        # window - (dimension % window) would add a whole extra window there
        total_padding = (-dimension) % window
        before = total_padding // 2
        after = total_padding - before
        return (before, after)

    def _pad_3d_array(self, array: np.ndarray, window: int):
        """
        Pad image array s.t. divisible by window\n
        array.shape : (Height, Width, Channels)
        window : side length of square cropping window
        Height and Width are padded (np.pad default mode), Channels is left untouched
        """

        assert array.ndim == 3
        padded_array = np.pad(
            array = array,
            pad_width = (self._get_pad_amount(array.shape[0], window),
                         self._get_pad_amount(array.shape[1], window),
                         (0, 0))
        )
        return padded_array

    def _get_cropped_view(self, array: np.ndarray, window:int):
        """
        Split image array into non-overlapping (window x window) patches\n
        array.shape : (Height, Width, Channels), Height and Width divisible by window
        window : side length of square cropping window
        Returns an array of shape (num_patches, window, window, Channels)
        """

        assert array.ndim == 3
        channels = array.shape[2]
        # step == window_shape, so neighbouring windows never overlap
        cropped_view = skimage.util.view_as_windows(
            arr_in = array,
            window_shape = (window, window, channels),
            step =  (window, window, channels))

        # reshape alone flattens the window grid (including its size-1 axes)
        # into a single leading patch axis
        return cropped_view.reshape(-1, window, window, channels)

    def _crop_one_scene(self, tile_path: Path, window: int, read_scene):
        """
        Read one scene, pad it and cut it into patches\n
        tile_path : file to read
        window : side length of square cropping window
        read_scene : callable(path) returning a (Height, Width, Channels) array
        """
        scene = read_scene(tile_path)
        scene = self._pad_3d_array(scene, window)
        scene = self._get_cropped_view(scene, window)
        return scene

    def _save_as_jpeg_100(self, array: np.ndarray, out_path: Path) -> None:
        """
        Save array next to out_path as <out_path.stem>.jpg with quality 100\n
        NOTE(review): JPEG stays lossy at quality 100; if this is used for label
        masks the stored values may not round-trip exactly, confirm that is acceptable
        """
        jpeg_path = out_path.parent / f"{out_path.stem}.jpg"
        skimage.io.imsave(jpeg_path, array, check_contrast = False, quality = 100)

In [17]:
class DatasetUploader():
    """Upload image / mask files into fixed key prefixes of one S3 bucket."""

    def __init__(self, bucket_name):
        """Point boto3 at the AWS config file and open bucket_name as an S3 resource"""
        self.init_boto3()
        self.s3resource = boto3.resource('s3')
        self.bucket = self.s3resource.Bucket(bucket_name)

        # key prefixes inside the bucket, looked up as "<split>_<kind>"
        self.prefix = {
            "train_image": "labelled/patches/images",
            "train_mask": "labelled/patches/masks",

            "test_image": "labelled/scenes/images",
            "test_mask": "labelled/scenes/masks",
        }

    def init_boto3(self, aws_dir = None):
        """
        Set AWS_CONFIG_FILE to <aws_dir>/config for the current process\n
        aws_dir : str or Path, defaults to ~/.aws
        """
        # Path() so that a plain string works with the "/" operator below
        aws_dir = Path(aws_dir) if aws_dir else (Path.home() / ".aws")
        os.environ["AWS_CONFIG_FILE"] = (aws_dir / "config").as_posix()
        #os.environ["AWS_SHARED_CREDENTIALS_FILE"] = (aws_dir / "credentials").as_posix()

    def list_files_in_bucket(self):
        """Print the key of every object in the bucket"""
        for file_obj in self.bucket.objects.all():
            print(file_obj.key)

    def upload_patch(self, patch_path:Path, prefix:str):
        """
        Upload the file at patch_path to <prefix>/<file name> in the bucket\n
        Raises AssertionError if patch_path is not an existing file
        """
        patch_path = Path(patch_path)
        assert patch_path.is_file(), f"{patch_path} is not an existing file"

        patch_key = (Path(prefix) / patch_path.name).as_posix()
        # boto3 documents Filename as str, so don't rely on it accepting Path objects
        self.bucket.Object(patch_key).upload_file(str(patch_path))
        print(f"Uploaded to {self.bucket.name}/{patch_key}")

    def _upload_pair(self, file_name, image_dir, mask_dir, split):
        """Upload image_dir/file_name and mask_dir/file_name under the prefixes of split"""
        image_path = Path(image_dir) / file_name
        mask_path = Path(mask_dir) / file_name

        # Check both halves first so that a missing mask cannot leave an
        # image without its mask in the bucket
        for path in (image_path, mask_path):
            assert path.is_file(), f"{path} is not an existing file"

        self.upload_patch(patch_path = image_path, prefix = self.prefix[f"{split}_image"])
        self.upload_patch(patch_path = mask_path, prefix = self.prefix[f"{split}_mask"])

    def upload_train_pair(self, file_name, image_dir, mask_dir):
        """Upload the image / mask pair named file_name under the train prefixes"""
        self._upload_pair(file_name, image_dir, mask_dir, "train")

    def upload_test_pair(self, file_name, image_dir, mask_dir):
        """Upload the image / mask pair named file_name under the test prefixes"""
        self._upload_pair(file_name, image_dir, mask_dir, "test")

In [21]:
class InriaETL:
    """
    Download, extract, split, crop and upload the Inria aerial image labeling dataset\n
    Everything lives under root:
        root/downloads                  : downloaded archive volumes
        root/AerialImageDataset/train   : extracted scenes ("images") and masks ("gt")
        root/test/{images,masks}        : scenes moved aside as the test split
        root/{images,masks}             : cropped training patches
    """

    # volume file name -> source url
    urls = {
        "aerialimagelabeling.7z.001" : "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.001",
        "aerialimagelabeling.7z.002" :   "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.002",
        "aerialimagelabeling.7z.003" :  "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.003",
        "aerialimagelabeling.7z.004" :  "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.004",
        "aerialimagelabeling.7z.005" :  "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.005"
    }

    # volume file name -> expected size, handed to validate_download
    # NOTE(review): all values are 0, presumably placeholders until validation exists
    sizes = {
        "aerialimagelabeling.7z.001" : 0,
        "aerialimagelabeling.7z.002" : 0,
        "aerialimagelabeling.7z.003" : 0,
        "aerialimagelabeling.7z.004" : 0,
        "aerialimagelabeling.7z.005" : 0
    }

    # scenes 1-6 of every location form the test split, scenes 7-36 the train split
    locations = ["austin", "chicago", "kitsap", "tyrol-w", "vienna"]
    test_files_list = [f"{location}{num}.tif" for location in locations for num in range(1, 7)]
    train_files_list = [f"{location}{num}.tif" for location in locations for num in range(7, 37)]

    def __init__(self, root: Path):
        """Derive all working directories from root, nothing is created on disk here"""
        self.root_dir = root
        self.download_dir = root / "downloads"

        # Downloaded Images, Masks
        self.d_dataset_dir = root / "AerialImageDataset" / "train"
        self.d_image_dir = self.d_dataset_dir / "images"
        self.d_mask_dir = self.d_dataset_dir / "gt"

        # Test Images, Masks
        self.t_image_dir = self.root_dir / "test" / "images"
        self.t_mask_dir = self.root_dir / "test" / "masks"

        #Images, Masks
        self.image_dir = self.root_dir / "images"
        self.mask_dir = self.root_dir / "masks"

    async def download(self):
        """Download all archive volumes into self.download_dir, then run download validation"""
        downloader = DatasetDownloader(self.download_dir, self.urls)
        # awaiting the coroutine directly, wrapping it in a task adds nothing here
        await downloader.download_files()
        # validate_download expects the size table; calling it without one raised
        # TypeError only after the whole download had finished
        downloader.validate_download(self.sizes)

    def extract(self, low_storage_mode:bool):
        """
        Unpack the multi-volume 7zip archive, then the "train" part of the inner zip\n
        low_storage_mode : if True, delete each archive as soon as it has been unpacked
        """
        #Merge and Extract Dataset Zip
        multivolume_7zip_path = self.download_dir / "aerialimagelabeling.7z"
        # NOTE(review): assumes the 7zip archive unpacks to exactly this zip file, confirm
        dataset_zip_path = self.download_dir / "NEW2-AerialImageDataset.zip"

        DatasetExtractor.extract_multivolume_archive(multivolume_7zip_path, self.download_dir)
        print(f"Extracted Multivolume Archive to {dataset_zip_path}")

        if low_storage_mode:
            print("Deleting downloaded multi-volume to save storage space")
            for volume in self.download_dir.glob("aerialimagelabeling.7z.*"):
                volume.unlink()

        DatasetExtractor.extract_zip_archive(dataset_zip_path, self.root_dir, ["train"])
        print(f"Extracted Dataset Archive to {self.root_dir}")

        if low_storage_mode:
            print("Deleting extracted dataset archive to save storage space")
            dataset_zip_path.unlink()

        #extractor.validate_extraction(files_list, dataset_dir)

    def move_test_split(self):
        """
        Move the test scenes and masks out of the extracted dataset into root/test\n
        Files that were already moved by an earlier run are skipped, so the
        method can safely be re-run
        """
        self.t_image_dir.mkdir(exist_ok=True, parents=True)
        self.t_mask_dir.mkdir(exist_ok=True, parents=True)

        moves = ((self.d_image_dir, self.t_image_dir), (self.d_mask_dir, self.t_mask_dir))
        for file_name in self.test_files_list:
            for src_dir, dst_dir in moves:
                src_path = src_dir / file_name
                if not src_path.exists() and (dst_dir / file_name).exists():
                    continue
                # str(): shutil.move fails on Path sources before Python 3.9
                shutil.move(str(src_path), str(dst_dir))

    @staticmethod
    def read_mask(path:Path):
        """Read the mask at path and append a trailing axis of length 1"""
        return np.expand_dims(skimage.io.imread(path), -1)

    def crop(self, window: int):
        """
        Cut every training scene and its mask into (window x window) patches\n
        Image and mask patch of one location share a random uuid4 file name and are
        written to self.image_dir / self.mask_dir. Every call creates new names, so
        re-running adds duplicates instead of overwriting earlier patches.
        """
        self.image_dir.mkdir(exist_ok=True, parents=True)
        self.mask_dir.mkdir(exist_ok=True, parents=True)

        cropper = CropUtil()

        for file_name in self.train_files_list:

            cropped_image_view = cropper._crop_one_scene(
                tile_path = self.d_image_dir / file_name,
                window = window,
                read_scene = cropper._read_image
            )

            cropped_mask_view = cropper._crop_one_scene(
                tile_path = self.d_mask_dir / file_name,
                window = window,
                read_scene = self.read_mask
            )

            # zip() would silently truncate and mis-pair patches on a size mismatch
            if len(cropped_image_view) != len(cropped_mask_view):
                raise ValueError(
                    f"{file_name}: {len(cropped_image_view)} image patches "
                    f"but {len(cropped_mask_view)} mask patches"
                )

            for image_crop, mask_crop in zip(cropped_image_view, cropped_mask_view):
                crop_name = str(uuid.uuid4())
                cropper._save_as_jpeg_100(image_crop , (self.image_dir / crop_name))
                cropper._save_as_jpeg_100(mask_crop.squeeze(), (self.mask_dir / crop_name))

    def delete_downloaded_dataset(self):
        """Delete the extracted dataset directory, then its (expected to be empty) parent"""
        shutil.rmtree(self.d_dataset_dir)
        self.d_dataset_dir.parent.rmdir()

    def upload(self, bucket_name = 'urban-feature-extraction', limit = 10):
        """
        Upload cropped image / mask pairs to the training prefixes of an S3 bucket\n
        bucket_name : target bucket
        limit : maximum number of pairs to upload, None uploads every pair
        Pairs are taken in sorted file name order so repeated runs pick the same files
        """
        uploader = DatasetUploader(bucket_name)

        for count, file_path in enumerate(sorted(self.mask_dir.glob("*.jpg"))):
            if limit is not None and count >= limit:
                break
            uploader.upload_train_pair(file_path.name, self.image_dir, self.mask_dir)

In [22]:
# Build the Inria pipeline rooted at DATA / "inria". The constructor only
# derives paths, so running this cell touches neither disk nor network.
# The pipeline stages below are left commented out; uncomment the ones
# that still need to run.
inria = InriaETL(DATA / "inria")
#await inria.download()
#inria.extract(low_storage_mode=True)
#inria.crop(512)
#inria.move_test_split()
#inria.delete_downloaded_dataset()

In [23]:
# Uploads cropped image / mask pairs to the 'urban-feature-extraction' S3
# bucket; InriaETL.upload stops after 10 pairs.
inria.upload()

Uploaded to urban-feature-extraction/labelled/patches/images/00004249-2209-481b-b3fa-387b64e80bfe.jpg
Uploaded to urban-feature-extraction/labelled/patches/masks/00004249-2209-481b-b3fa-387b64e80bfe.jpg
Uploaded to urban-feature-extraction/labelled/patches/images/0002968f-c3a9-41be-b4d5-f4bcb9e36f90.jpg
Uploaded to urban-feature-extraction/labelled/patches/masks/0002968f-c3a9-41be-b4d5-f4bcb9e36f90.jpg
Uploaded to urban-feature-extraction/labelled/patches/images/0004b3fd-87dd-4619-b2b9-67ec654f637a.jpg
Uploaded to urban-feature-extraction/labelled/patches/masks/0004b3fd-87dd-4619-b2b9-67ec654f637a.jpg
Uploaded to urban-feature-extraction/labelled/patches/images/00082786-4816-49a1-b1dc-9e073408f3e8.jpg
Uploaded to urban-feature-extraction/labelled/patches/masks/00082786-4816-49a1-b1dc-9e073408f3e8.jpg
Uploaded to urban-feature-extraction/labelled/patches/images/00083190-9ff5-473d-a9c7-cd7108d14bb0.jpg
Uploaded to urban-feature-extraction/labelled/patches/masks/00083190-9ff5-473d-a9c7-cd