In [1]:
import shutil
import uuid
from pathlib import Path

import re
import requests
import boto3

import zipfile
import py7zr
import multivolumefile

from tqdm import tqdm

import numpy as np
import pandas as pd
import rasterio as rio
import skimage
import matplotlib.pyplot as plt

In [2]:
# Dataset and project locations used by the cells below.
# NOTE(review): these are machine-specific absolute Windows paths; the notebook
# will not run elsewhere without editing them. DATA is the parent directory of
# the per-dataset roots (DATA / "inria", DATA / "isprs", DATA / "city-osm").
DATA = Path("C:/Users/hp/Desktop/datasets/urban-feature-extraction")
# ROOT is not referenced anywhere else in the visible part of this notebook.
ROOT = Path("C:/Users/hp/Desktop/urban-feature-extraction")

### Base Class

In [36]:
class DatasetETL:
    """
    Base class for staging a remote raster dataset on disk.

    Subclasses override download(), extract() and catalog(). This class owns
    the shared directory layout (downloads, images, masks, cropped outputs)
    and provides helpers for downloading, archive extraction, file
    validation, raster metadata, padding and window cropping.
    """

    def __init__(self, root:Path, urls:dict, image_dir:Path, mask_dir:Path, low_storage_mode:bool = True):
        """
        root : dataset root directory, created (with parents) if missing
        urls : source URLs, stored unchanged as self.source_urls
        image_dir : directory holding image rasters, created if missing
        mask_dir : directory holding mask rasters, created if missing
        low_storage_mode : flag stored on the instance for subclasses to consult
        """
        download_dir = (root / "downloads")
        if not root.exists():
            print("Root not found, creating directories")
            #parents=True so that a missing parent tree does not abort construction
            root.mkdir(parents=True, exist_ok=True)
            download_dir.mkdir(exist_ok=True)
        self.root = root
        #Source URLs is Dict[file_name:str, url:str]
        self.source_urls:dict = urls

        self.low_storage_mode = low_storage_mode
        self.random_seed = 11235

        if not download_dir.exists():
            print("Downloads directory DNE, creating new directory")
            download_dir.mkdir(parents=True, exist_ok=True)
        self.download_dir = download_dir

        (image_dir).mkdir(exist_ok=True, parents=True)
        self.image_dir = image_dir

        (mask_dir).mkdir(exist_ok=True, parents=True)
        self.mask_dir = mask_dir

        #The intermediate "cropped" directory does not exist on a fresh root,
        #so parents=True is required here.
        self.cropped_image_dir = self.root / "cropped" / "images"
        (self.cropped_image_dir).mkdir(exist_ok=True, parents=True)
        self.cropped_mask_dir = self.root / "cropped" / "masks"
        (self.cropped_mask_dir).mkdir(exist_ok=True, parents=True)
        self.cropped_metadata_dir = self.root / "cropped" / "metadata"
        (self.cropped_metadata_dir).mkdir(exist_ok=True, parents=True)


    #TODO Raise error if these are not implemented in child function
    def download(self):
        """Hook: fetch the raw archives/files. No-op in the base class."""
        pass
    def extract(self):
        """Hook: unpack downloaded archives. No-op in the base class."""
        pass
    def catalog(self):
        """Hook: build the dataset catalog. No-op in the base class."""
        pass

    def Extract(self):
        """Run download(), extract() and catalog() in that order."""
        #Download and Stage Dataset on Disk.
        self.download()
        self.extract()
        self.catalog()

    def Transform(self):
        """Placeholder for the transform stage. No-op in the base class."""
        pass
    def Load(self):
        """Placeholder for the load stage. No-op in the base class."""
        pass

    def clear_downloads_directory(self):
        """Delete every regular file directly inside self.download_dir (sub-directories are left alone)."""
        #unlink() cannot remove directories, so only regular files are collected
        downloaded_files:list = [path for path in self.download_dir.iterdir() if path.is_file()]
        self._delete_files(downloaded_files)

    #Internal Methods

    def _get_source_urls(self, urls_list:list) -> dict:
        """Map each url to its final path component: List[url] -> Dict[file_name, url]."""
        #List[url:str] -> Dict[file_name:str, url:str]

        return {Path(url).name : url for url in urls_list}

    def _download_file(self, url:str, file_path:Path, chunk_size:int = 1024*1024):
        """
        Stream url to file_path in chunks of chunk_size bytes with a progress bar.

        The payload is written to a sibling "<name>.part" file which is renamed
        to file_path only after the whole body has been received, so that an
        interrupted transfer is not mistaken for a finished download.
        Raises requests.HTTPError for non-2xx responses (raise_for_status).
        """
        file_path = Path(file_path)
        partial_path = file_path.with_name(file_path.name + ".part")

        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            #content-length may be absent; tqdm then shows an open-ended bar
            total_size = int(r.headers.get('content-length', 0))
            with open(partial_path, "wb") as f, tqdm(total=total_size, unit="B", unit_scale=True, desc="Downloading") as progress_bar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))
        partial_path.replace(file_path)

    #TODO: Use async/await for concurrent downloads
    def _download_source_urls(self, download_dir:Path = Path("")):
        """
        Download every entry of self.source_urls into download_dir.

        download_dir defaults to self.download_dir (the empty Path and None
        are both treated as "not given"). Files that already exist are skipped.
        """
        if download_dir is None or download_dir == Path(""):
            download_dir = self.download_dir
        #Download files from self.source_urls, skip if already_downloaded
        print(download_dir)
        for file_name, url in self.source_urls.items():
            file_path = download_dir / file_name

            if file_path.exists():
                #TODO: Check for downloaded file size as well
                print(f"{file_name} Already Downloaded, Skipping")
                continue
            self._download_file(url, file_path)

    def _extract_zip(self, zip_file_path:Path, target_dir:Path, dirs_to_be_extracted = None):
        """
        Extract a zip archive into target_dir.

        dirs_to_be_extracted : optional list of name fragments. When empty or
        None the whole archive is extracted; otherwise a member is extracted
        when any fragment occurs anywhere in its archive path (substring match,
        not a strict directory-prefix match).
        """
        with zipfile.ZipFile(zip_file_path, 'r') as archive:
            #If dirs_to_be_extracted is empty, extract entire archive and exit
            if not dirs_to_be_extracted:
                archive.extractall(target_dir)
                return
            #Otherwise, extract each member once if it matches any fragment
            for member in archive.infolist():
                if any(foldername in member.filename for foldername in dirs_to_be_extracted):
                    #TODO: Add tqdm progress bar for extraction
                    archive.extract(member, target_dir)

    def _validate_files(self, dir:Path, files:list) -> list:
        """Return the entries of files that do not exist inside dir."""
        ##TODO: Check for downloaded file size on disk against actual size
        return [file_name for file_name in files if not (dir / file_name).exists()]

    def _validate_download(self) -> list:
        """Return the source file names missing from self.download_dir."""
        return self._validate_files(self.download_dir, list(self.source_urls.keys()))

    def _delete_files(self, file_paths:list):
        """Delete each path that exists; print a warning for each one that does not."""
        for file_path in file_paths:
            if file_path.exists():
                file_path.unlink()
            else:
                print(f"Error Deleting {file_path.name}")
                print(f"File Does Not Exist")

    def _get_raster_metadata(self, raster_path:Path):
        """Return (shape, crs as str, transform as tuple) of the raster at raster_path."""
        with rio.open(raster_path) as raster:
            return raster.shape, str(raster.crs), tuple(raster.transform)

    def _complete_catalog(self, df:pd.DataFrame) -> None:
        """Add metadata columns to catalog and assign self.downloaded_dataset"""

        metadata_columns = ["shape", "crs", "transform"]
        #Metadata is read from the mask raster of each catalog row
        metadata = df["name"].apply(lambda x: self._get_raster_metadata((self.mask_dir/x)))
        #Explicit column names keep the assignment valid for an empty catalog
        df[metadata_columns] = pd.DataFrame(metadata.tolist(), index = df.index, columns = metadata_columns)
        self.downloaded_dataset = df.copy(deep = True)

    def _read_image(self, path):
        """Read an image file into an array via skimage.io.imread."""
        return skimage.io.imread(path)

    def _read_mask(self, path):
        """Read a mask file into an array via skimage.io.imread."""
        return skimage.io.imread(path)

    def _get_pad_amount(self, dimension: int, window: int):
        """
        Calculate the no of pixels to add before and after a dimension so that
        it becomes a multiple of window. Returns (before, after); the extra
        pixel of an odd total goes to `after`. A dimension that is already a
        multiple of window gets (0, 0).
        """
        if window <= 0:
            raise ValueError("window must be a positive integer")
        #(-d) % w is 0 when d is already divisible, unlike w - (d % w)
        total_padding = (-dimension) % window
        before = total_padding // 2
        after = total_padding - before
        assert before+after == total_padding
        return (before, after)

    def _pad_3d_array(self, array: np.ndarray, window: int):
        """
        Pad image array s.t. divisible by window\n
        array.shape : (Height, Width, Channels)
        window : side length of square cropping window
        """

        assert array.ndim == 3
        #np.pad default mode fills the border with zeros; channels are not padded
        padded_array = np.pad(
            array = array,
            pad_width = (self._get_pad_amount(array.shape[0], window),
                         self._get_pad_amount(array.shape[1], window),
                         (0, 0))
        )
        return padded_array

    def _get_cropped_view(self, array: np.ndarray, window:int):
        """
        Split image array into non-overlapping window x window tiles\n
        array.shape : (Height, Width, Channels)
        window : side length of square cropping window
        Returns an array of shape (n_tiles, window, window, Channels)
        """

        assert array.ndim == 3
        channels = array.shape[2]
        cropped_view = skimage.util.view_as_windows(
            arr_in = array,
            window_shape = (window, window, channels),
            step =  (window, window, channels))

        #Flatten the tile grid (and the unit channel-window axis) in one reshape
        return cropped_view.reshape(-1, window, window, channels)

    def _crop_one_scene(self, tile_path: str, window: int, read_scene):
        """Read a scene with read_scene(tile_path), pad it and return its window crops."""
        scene = read_scene(tile_path)
        scene = self._pad_3d_array(scene, window)
        scene = self._get_cropped_view(scene, window)
        return scene

    def _save_as_jpeg_100(self, array: np.ndarray, out_path: Path) -> None:
        """Save array as "<out_path stem>.jpg" next to out_path with quality 100."""
        skimage.io.imsave((out_path.parent / f"{out_path.stem}.jpg"), array, check_contrast = False, quality = 100)

### Inria

In [39]:
class InriaETL(DatasetETL):
    """
    ETL pipeline for the Inria aerial image labeling dataset.

    Downloads the five 7z volumes, unpacks the inner dataset zip, catalogs
    the 36 tiles of each of the five locations and crops the non-test tiles
    into square windows.
    """

    def __init__(self, root:Path, low_storage_mode:bool = True):
        self.locations = ["austin", "chicago", "kitsap", "tyrol-w", "vienna"]

        # Tile names are "<location><1..36>.tif", grouped by location
        tile_names = []
        for location in self.locations:
            for num in range(1, 37):
                tile_names.append(f"{location}{num}.tif")
        self.files_list = tile_names

        train_dir = root / "AerialImageDataset" / "train"
        volume_urls = [
            "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.001",
            "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.002",
            "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.003",
            "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.004",
            "https://files.inria.fr/aerialimagelabeling/aerialimagelabeling.7z.005",
        ]

        super().__init__(
            root = root,
            urls = self._get_source_urls(volume_urls),
            image_dir = train_dir / "images",
            mask_dir = train_dir / "gt",
            low_storage_mode = low_storage_mode
        )

    def download(self):
        """Fetch all archive volumes into the downloads directory."""
        self._download_source_urls(self.download_dir)

    def extract(self):
        """
        Unpack the multivolume 7z archive, then the "train" part of the inner
        dataset zip, and delete that zip once every expected tile is present.
        Stops early (after printing the missing names) if a volume is absent.
        """
        # Guard: every volume must be on disk before merging
        absent_volumes = self._validate_download()
        if absent_volumes:
            print("Missing Volumes")
            print("Please Download Missing Volumes")
            print(absent_volumes)
            return
        print("Found All Volumes")

        # Merge the volumes and unpack them next to the downloads
        self._extract_multivolume_archive(
            self.download_dir / "aerialimagelabeling.7z",
            self.download_dir,
        )

        if self.low_storage_mode:
            print("Deleting downloaded volumes to save storage space")
            # NOTE: the actual deletion of the *.7z.* volumes is currently disabled

        dataset_zip_path = self.download_dir / "NEW2-AerialImageDataset.zip"
        print(f"Extracting Dataset Folder from {dataset_zip_path}")
        self._extract_zip(dataset_zip_path, self.root, ["train"])

        missing_images, missing_masks = self._validate_extraction()
        if not (missing_images or missing_masks):
            print("Extraction Complete")
            print("Deleting dataset archive")
            self._delete_files([dataset_zip_path])
            return
        print(f"Images Not Found: {missing_images}")
        print(f"Masks Not Found: {missing_masks}")

    def catalog(self):
        """Build the name/split catalog and let the base class add raster metadata."""
        splits = [self._get_split(file_name) for file_name in self.files_list]
        catalog_df = pd.DataFrame({"name": self.files_list, "split": splits})
        self._complete_catalog(catalog_df)

    def crop(self, window: int):
        """
        Cut every non-test tile and its mask into window x window crops and
        save each image/mask pair as JPEGs sharing one random UUID name.
        Requires catalog() to have populated self.downloaded_dataset.
        """
        for _, tile in self.downloaded_dataset.iterrows():
            if tile["split"] != "test":
                tile_name = tile["name"]

                image_crops = self._crop_one_scene(
                    tile_path = self.image_dir / tile_name,
                    window = window,
                    read_scene = self._read_image
                )
                mask_crops = self._crop_one_scene(
                    tile_path = self.mask_dir / tile_name,
                    window = window,
                    read_scene = self._read_mask
                )

                for image_crop, mask_crop in zip(image_crops, mask_crops):
                    crop_name = str(uuid.uuid4())
                    self._save_as_jpeg_100(image_crop, self.cropped_image_dir / crop_name)
                    # masks carry a trailing unit channel axis that is dropped before saving
                    self._save_as_jpeg_100(mask_crop.squeeze(), self.cropped_mask_dir / crop_name)

    #Internal Methods
    def _extract_multivolume_archive(self, multivolume_file_path:Path, target_dir:Path) -> None:
        """Open the split volumes as one stream and extract the 7zip archive into target_dir"""
        with multivolumefile.open(multivolume_file_path, mode = 'rb') as multi_archive, \
             py7zr.SevenZipFile(multi_archive, 'r') as archive:
            archive.extractall(path = target_dir)

    def _validate_extraction(self) -> tuple:
        """Return (missing image names, missing mask names) for the expected tiles."""
        missing_images = self._validate_files(self.image_dir, self.files_list)
        missing_masks = self._validate_files(self.mask_dir, self.files_list)
        return (missing_images, missing_masks)

    def _get_split(self, file_name:str):
        """Tiles numbered 1-6 (16.67%) of every region are "test", the rest "train" """
        tile_number = int(''.join(filter(str.isdigit, file_name)))
        return "test" if tile_number <= 6 else "train"

    def _read_mask(self, path:str):
        """Read a mask and append a trailing channel axis so it is 3-dimensional."""
        return np.expand_dims(skimage.io.imread(path), -1)

In [40]:
# Build the Inria pipeline rooted at DATA / "inria" and catalog it.
# Only catalog() is called here (not download()/extract()); catalog() reads
# raster metadata from the mask directory, so the tiles must already be on disk.
inria = InriaETL(DATA / "inria")
inria.catalog()

In [41]:
# Crop every non-test tile into 512 x 512 windows. Crop files get random UUID
# names, so re-running this cell adds new files rather than overwriting.
inria.crop(512)

### Massachusetts

In [8]:
class MassachussetsETL(DatasetETL):
    """
    ETL pipeline for the Massachusetts buildings dataset.

    File URLs are scraped from the index pages in mass_urls. Unlike the base
    class, self.source_urls is a tuple (image_urls, mask_urls) rather than a
    dict, so the base helpers that iterate a dict of source URLs are not used.
    """

    # split name -> (satellite image index page, building map index page)
    mass_urls = {
        "train": ("https://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/sat/index.html", "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/train/map/index.html"),
        "test" : ("https://www.cs.toronto.edu/~vmnih/data/mass_buildings/test/sat/index.html", "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/test/map/index.html"),
        "val": ("https://www.cs.toronto.edu/~vmnih/data/mass_buildings/valid/sat/index.html", "https://www.cs.toronto.edu/~vmnih/data/mass_buildings/valid/map/index.html"),
    }
    def __init__(self, root:Path):
        """Scrape the index pages (network access) and prepare root/images and root/masks."""
        # The base constructor requires image_dir and mask_dir and creates them
        super().__init__(
            root = root,
            urls = self._get_source_urls(),
            image_dir = root / "images",
            mask_dir = root / "masks",
        )
        # Expected file names are taken from the image URLs
        self.files_list = [url.split("/")[-1] for url in self.source_urls[0]]

    def _get_file_urls(self, url) -> list:
        """Return every href found in the anchor tags of the page at url ([] on a non-200 response)."""
        response = requests.get(url)
        if response.status_code == 200:
            pattern = r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"'
            matches = re.findall(pattern, response.text)
            return matches
        else:
            print("Error:", response.status_code)
            return []

    def _get_source_urls(self) -> tuple:
        """Return (image_urls, mask_urls) collected over all splits in mass_urls."""
        images = [file_url
                  for image_index, _ in self.mass_urls.values()
                  for file_url in self._get_file_urls(image_index)]
        masks = [file_url
                 for _, mask_index in self.mass_urls.values()
                 for file_url in self._get_file_urls(mask_index)]
        return images, masks

    def download(self, urls:list = None, target_dir_path:Path = None):
        """
        Download urls into target_dir_path, skipping files that already exist.

        Called with no arguments (as the base class Extract() does) it
        downloads all images and all masks into their own directories.
        Raises ValueError when only one of the two arguments is given.
        """
        if urls is None and target_dir_path is None:
            self.download_images()
            self.download_masks()
            return
        if urls is None or target_dir_path is None:
            raise ValueError("urls and target_dir_path must be given together")

        for url in urls:
            file_name = url.split("/")[-1]
            downloaded_file_path = target_dir_path / file_name
            if downloaded_file_path.exists():
                print(f"{file_name} Exists, Skipping")
                continue
            self._download_file(url, downloaded_file_path)

    def download_images(self) -> None:
        """Download every image URL into self.image_dir."""
        self.download(self.source_urls[0], self.image_dir)

    def download_masks(self) -> None:
        """Download every mask URL into self.mask_dir."""
        self.download(self.source_urls[1], self.mask_dir)

    def _validate_download_dir(self, target_dir_path:Path) -> list:
        """Return the expected file paths that are missing from target_dir_path."""
        return [target_dir_path / file_name
                for file_name in self.files_list
                if not (target_dir_path / file_name).exists()]

    def validate_images(self) -> list:
        """Return the image paths that have not been downloaded."""
        return self._validate_download_dir(self.image_dir)

    def validate_masks(self) -> list:
        """Return the mask paths that have not been downloaded."""
        return self._validate_download_dir(self.mask_dir)

    def create_dataframe(self):
        """Return a DataFrame with name, raster metadata (read from the mask) and image/mask paths per file."""
        metadata = [self._get_raster_metadata((self.mask_dir / x)) for x in self.files_list]
        # zip(*[]) yields nothing to unpack, so an empty file list is handled explicitly
        shapes, crses, transforms = zip(*metadata) if metadata else ((), (), ())
        data = {
            "name": self.files_list,
            "shape": shapes,
            "crs": crses,
            "transform": transforms,
            "image_path": [(self.image_dir / x) for x in self.files_list],
            "mask_path": [(self.mask_dir / x) for x in self.files_list]
        }
        return pd.DataFrame(data)

### ISPRS

In [10]:
class ISPRSSemanticLabelingETL(DatasetETL):
    """
    ETL pipeline for the ISPRS semantic labeling archives (Potsdam, Vaihingen).

    The archives sit behind a password-protected share, so _download_file is
    overridden to submit the password form before streaming the download.
    """

    isprs_urls = {
        "potsdam.zip": "https://seafile.projekt.uni-hannover.de/f/429be50cc79d423ab6c4/",
        #"toronto.zip": "https://seafile.projekt.uni-hannover.de/f/fc62f9c20a8c4a34aea1/",
        "vaihingen.zip": "https://seafile.projekt.uni-hannover.de/f/6a06a837b1f349cfa749/",
    }
    # NOTE(review): credential hardcoded in the notebook; consider reading it
    # from the environment instead of committing it.
    password = "CjwcipT4-P8g"
    cookie_name = "sfcsrftoken"

    def __init__(self, root:Path, low_storage_mode:bool = True):
        super().__init__(root = root,
                         urls = self.isprs_urls,
                         image_dir = root / "images",
                         mask_dir = root / "masks",
                         #forward the caller's choice instead of silently using the base default
                         low_storage_mode = low_storage_mode
        )

    def _download_file(self, url:str, file_path:str, chunk_size = 1024*1024) -> None:
        """
        Download a password-protected share link to file_path.

        A GET on url obtains the CSRF cookie; a POST of that token plus the
        password to url + "?dl=1" streams the file in chunk_size pieces.
        Raises RuntimeError if the CSRF cookie is not returned and
        requests.HTTPError for a non-2xx download response.
        """
        #The session is used as a context manager so its connections are released
        with requests.Session() as session:
            csrf_token = session.get(url).cookies.get(self.cookie_name)
            if csrf_token is None:
                raise RuntimeError(f"No {self.cookie_name} cookie returned by {url}")
            cookies = {self.cookie_name: csrf_token}
            payload:dict = {'csrfmiddlewaretoken': csrf_token,
                            'password': self.password}
            with session.post(url+"?dl=1", data = payload, cookies = cookies, stream = True) as r:
                r.raise_for_status()
                total_size = int(r.headers.get('content-length', 0))
                with open(file_path, "wb") as f, tqdm(total=total_size, unit="B", unit_scale=True, desc="Downloading") as progress_bar:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        if chunk:
                            f.write(chunk)
                            progress_bar.update(len(chunk))

    def download(self):
        """Download every archive in isprs_urls into the downloads directory."""
        self._download_source_urls()

    def extract(self):
        """Unpack Vaihingen, then Potsdam, into the image and mask directories."""
        self._extract_vaihingen()
        self._extract_potsdam()

    def catalog(self):
        """Print the mask file names that have no identically named image file."""
        mask_file_names = set([x.name for x in self.mask_dir.glob("*.tif")])
        image_file_names = set([x.name for x in self.image_dir.glob("*.tif")])
        print(mask_file_names - image_file_names)

    def _move_dir_contents(self, source_dir:Path, target_dir:Path) -> None:
        """Move every entry of source_dir directly into target_dir, then remove the emptied source_dir."""
        #Entries are listed up front because the directory is modified while moving
        for entry in list(source_dir.iterdir()):
            shutil.move(str(entry), str(target_dir))
        source_dir.rmdir()

    def _extract_vaihingen(self):
        """Unpack the nested Vaihingen archive and move its "top" and "gts_for_participants" files into image_dir / mask_dir."""
        vaihingen_zip_path = self.download_dir / "vaihingen.zip"
        dataset_zip_path = self.download_dir / "Vaihingen" / "ISPRS_semantic_labeling_Vaihingen.zip"
        self._extract_zip(vaihingen_zip_path, self.download_dir, [dataset_zip_path.name])
        if dataset_zip_path.exists():
            print("Dataset Zip Extracted")
        if self.low_storage_mode:
            #NOTE: deletion is currently disabled, only the message is printed
            #self._delete_files([vaihingen_zip_path])
            print("Downloaded Zip Deleted")

        self._extract_zip(dataset_zip_path, self.root, ["top", "gts_for_participants"])
        print("Dataset Extracted")
        if self.low_storage_mode:
            #NOTE: deletion is currently disabled, only the message is printed
            #self._delete_files([dataset_zip_path])
            #dataset_zip_path.parent.rmdir()
            print("Dataset Zip Deleted")
        #image_dir and mask_dir already exist, so moving the whole directory
        #would nest it (images/top/...). Move the files themselves so they
        #land where catalog() and the Potsdam files are.
        self._move_dir_contents(self.root / "top", self.image_dir)
        self._move_dir_contents(self.root / "gts_for_participants", self.mask_dir)

    def _extract_potsdam(self):
        """Move the Potsdam image and label .tif files from their temp directories into image_dir / mask_dir."""
        potsdam_zip_path = self.download_dir / "potsdam.zip"
        images_zip_path = self.download_dir / "Potsdam" / "2_Ortho_RGB.zip"
        masks_zip_path = self.download_dir / "Potsdam" / "5_Labels_all.zip"

        #NOTE: the extraction steps below are currently disabled, so the
        #temp directories must already contain the unpacked files
        #self._extract_zip(potsdam_zip_path, self.download_dir, [images_zip_path.name, masks_zip_path.name])

        images_temp_dir = images_zip_path.parent / images_zip_path.stem
        #self._extract_zip(images_zip_path, images_zip_path.parent)

        masks_temp_dir = masks_zip_path.parent / masks_zip_path.stem
        (masks_temp_dir).mkdir(exist_ok=True)
        #self._extract_zip(masks_zip_path, masks_temp_dir)

        #Copy Images, Masks to Correct Directories
        for image_path in images_temp_dir.glob("*.tif"):
            shutil.move(image_path, self.image_dir)
        for mask_path in masks_temp_dir.glob("*.tif"):
            shutil.move(mask_path, self.mask_dir)

In [11]:
# Build the ISPRS pipeline rooted at DATA / "isprs"; construction only creates
# the directory layout, nothing is downloaded or extracted here.
isprs = ISPRSSemanticLabelingETL(DATA / "isprs")

In [12]:
# Prints the mask file names that have no identically named image file.
isprs.catalog()

{'top_potsdam_2_12_label.tif', 'top_potsdam_6_13_label.tif', 'top_potsdam_4_12_label.tif', 'top_potsdam_3_13_label.tif', 'top_potsdam_6_9_label.tif', 'top_potsdam_4_13_label.tif', 'top_potsdam_6_12_label.tif', 'top_potsdam_7_8_label.tif', 'top_potsdam_3_11_label.tif', 'top_potsdam_5_15_label.tif', 'top_potsdam_4_15_label.tif', 'top_potsdam_4_10_label.tif', 'top_potsdam_5_10_label.tif', 'top_potsdam_7_9_label.tif', 'top_potsdam_3_10_label.tif', 'top_potsdam_6_11_label.tif', 'top_potsdam_3_14_label.tif', 'top_potsdam_7_7_label.tif', 'top_potsdam_5_12_label.tif', 'top_potsdam_7_10_label.tif', 'top_potsdam_2_14_label.tif', 'top_potsdam_4_11_label.tif', 'top_potsdam_7_13_label.tif', 'top_potsdam_7_12_label.tif', 'top_potsdam_6_8_label.tif', 'top_potsdam_5_11_label.tif', 'top_potsdam_6_14_label.tif', 'top_potsdam_6_7_label.tif', 'top_potsdam_5_14_label.tif', 'top_potsdam_7_11_label.tif', 'top_potsdam_2_13_label.tif', 'top_potsdam_5_13_label.tif', 'top_potsdam_4_14_label.tif', 'top_potsdam_6_

### City OSM

In [177]:
class CityOSMETL(DatasetETL):
    """
    ETL pipeline for the City-OSM archives listed in city_osm_urls.

    Each city zip is unpacked twice (members matching "image" into image_dir,
    members matching "labels" into mask_dir); the files are then flattened
    and renamed so an image and its mask share one file name.
    """

    city_osm_urls = {
        "berlin.zip": "https://zenodo.org/record/1154821/files/berlin.zip?download=1",
        "chicago.zip": "https://zenodo.org/record/1154821/files/chicago.zip?download=1",
        "paris.zip": "https://zenodo.org/record/1154821/files/paris.zip?download=1",
        "potsdam.zip": "https://zenodo.org/record/1154821/files/potsdam.zip?download=1",
        "tokyo.zip": "https://zenodo.org/record/1154821/files/tokyo.zip?download=1",
        "zurich.zip": "https://zenodo.org/record/1154821/files/zurich.zip?download=1"
    }
    def __init__(self, root:Path):
        super().__init__(root,
                         self.city_osm_urls,
                         image_dir = root / "images",
                         mask_dir = root / "masks")

    def download(self):
        """Download every city archive into the downloads directory."""
        self._download_source_urls()

    def extract(self):
        """Unpack all city archives, then flatten and rename the extracted PNGs."""
        #Extract All Files
        for zip_file_name in self.source_urls.keys():
            zip_file_path = self.download_dir / zip_file_name
            self._extract_zip(zip_file_path, self.image_dir, ["image"])
            self._extract_zip(zip_file_path, self.mask_dir, ["labels"])

        #Files already sitting directly in the target directory (e.g. from a
        #previous run) are skipped; moving them again would push them one
        #level up, out of the images/masks directory.
        self._move_and_rename_files(
            [path for path in self.image_dir.rglob("*.png") if path.parent != self.image_dir])
        self._move_and_rename_files(
            [path for path in self.mask_dir.rglob("*.png") if path.parent != self.mask_dir])

        #TODO: Remove Empty Directories
        #for dir_path in self.image_dir.glob('**/'):
            #if dir_path.exists() and not any(dir_path.iterdir()):
                #dir_path.rmdir()
        #for dir_path in self.mask_dir.glob('**/'):
            #if dir_path.exists() and not any(dir_path.iterdir()):
                #dir_path.rmdir()

    def catalog(self):
        """
        Build the name/split catalog from the PNGs under mask_dir and let the
        base class add raster metadata. Names are sorted and the split
        generator is re-seeded from self.random_seed, so repeated calls on
        the same files give the same split.
        """
        self.file_names = sorted(path.name for path in self.mask_dir.rglob("*.png"))
        self._split_rng = np.random.default_rng(self.random_seed)
        df = pd.DataFrame({"name": self.file_names})
        #One draw per file; a plain list also works for an empty catalog
        df["split"] = [self._get_split() for _ in self.file_names]
        self._complete_catalog(df)

    def _move_and_rename_files(self, file_paths) -> None:
        """Move each file two directory levels up-from-file (its grandparent dir) as "<stem before first '_'>.png"."""
        #Materialise first: file_paths may be a lazy glob over the tree being modified
        for file_path in list(file_paths):
            shutil.move(file_path, file_path.parents[1] / f"{file_path.stem.split('_')[0]}.png")

    def _get_split(self):
        """Draw "train" or "test" at random; "test" has probability 0.16"""
        rng = getattr(self, "_split_rng", None)
        if rng is None:
            #Lazily seeded so that direct calls are reproducible as well
            rng = np.random.default_rng(getattr(self, "random_seed", None))
            self._split_rng = rng
        return str(rng.choice(["train", "test"], p = [.84, .16]))

In [178]:
# Build the City-OSM pipeline rooted at DATA / "city-osm"; construction only
# creates the directory layout, nothing is downloaded or extracted here.
city_osm = CityOSMETL(DATA / "city-osm")