# Foundation of Deep Learning
Group Challenge

***
by: Oskar Girardin (B00792974), Lasse Schmidt (B00792989)

within: MS Data Sciences & Business Analytics

at: CentraleSupélec & ESSEC Business School
***

### 1. Import Packages

In [145]:
# parse & handle data
import os
import glob
import json
import numpy as np
from PIL import Image

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torchinfo import summary

# evaluation metrics
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

# visualization
import matplotlib.pyplot as plt
from tqdm import tqdm # Progress bar
from pprint import pprint

### 2. Definition of Useful Functions (to be used in other notebooks)

#### 2.1 get path of this notebook

In [146]:
def get_notebook_path():
    """
    Get the absolute path of the current working directory.

    Output
    -----
    working_dir: absolute path (String) obtained by resolving the empty relative path,
                 i.e. the directory the code is currently executed from
    """
    working_dir = os.path.abspath("")
    return working_dir

#### 2.2 get class values and labels

In [32]:
def get_class_dict(path_metadata):
    """
    Get the class labels and corresponding names within a Python dict.
    
    Input
    -----
    path_metadata: path (String) to provided metadata.json file (from downloaded data)
    
    Output
    -----
    class_dict: Python dict with class value as key and class label as value (e.g. 0: "Property Roof")
    """
    # the context manager closes the file even if parsing fails; the file is read as UTF-8
    # so that the result does not depend on the default encoding of the platform
    with open(path_metadata, 'r', encoding='utf-8') as fin:
        metadata = json.load(fin)

    # the class value of a label is its position in the options of the first "label:metadata" entry
    options = metadata["label:metadata"][0]["options"]
    class_dict = dict(enumerate(options))

    return class_dict

#### 2.3 get training & test data

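**Attention:** some images are 3000x4000, others are 3072x4592, so the images have to be resized to a common shape before they can be stacked into a single array!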

In [159]:
def get_train_test_doc_paths(X_path, y_path, notebook_path):
    """
    Function that retrieves the paths (as strings) of the training images, the training masks and the test images.
    
    An image is treated as training data if a mask with the same base name exists in y_path,
    otherwise it is treated as test data.
    
    Input
    -----
    X_path: string that identifies folder of input data (the .tif images in the raw folder)
    
    y_path: string that identifies folder of output data (.png masks are only available for training data)
    
    notebook_path: directory of this notebook (as string); not used by this function,
                   only kept so that existing calls keep working
    
    
    Output
    -----
    X_train_paths: Python dict of training data where each key refers to the input_path
    
    y_train_paths: Python dict of training data where each key refers to the output_path  
    
    X_test_paths: Python dict of test data where each key refers to the input_path
    
    """
    X_train_paths, y_train_paths, X_test_paths = {}, {}, {}

    # os.path.join works with and without a trailing separator on the folder, whereas plain
    # string concatenation would turn "folder" + "*.tif" into a pattern matching sibling files.
    # glob returns files in arbitrary order, so sort to keep the dicts (and anything built
    # from them) reproducible between runs.
    for f in sorted(glob.glob(os.path.join(X_path, '*.tif'))):

        X_filename = os.path.basename(f) # get filename (e.g. 6411.tif) of data
        key = os.path.splitext(X_filename)[0] # key that we will insert in the dicts (e.g. 6411)

        X_file = os.path.join(X_path, X_filename)
        y_file = os.path.join(y_path, key + '.png') # corresponding mask (e.g. 6411.png)

        if os.path.exists(y_file): # if this file has a mask, it's training data
            X_train_paths[key] = X_file
            y_train_paths[key] = y_file

        else: # otherwise test data
            X_test_paths[key] = X_file

    print(f"Number of images for training: {len(X_train_paths)}")
    print(f"Number of images for test: {len(X_test_paths)}")

    return X_train_paths, y_train_paths, X_test_paths


def get_data_as_np_array(paths, desired_shape):
    """
    Retrieve image / mask data based on a Python dict of the corresponding paths.
    
    Input
    -----
    paths: Python dict of paths of the image / mask data that should be retrieved (key is image name, value is path)
    
    desired_shape: tuple of int (width, height) that denotes the desired shape of the image
    
    Output
    -----
    original_shape: Python dict with image name as key and the size of the image before resizing as value
    
    data: numpy array that stacks all (resized) images along a new first axis, in the iteration order of paths
    
    Raises
    -----
    ValueError: if paths is empty (there is nothing to stack)
    """
    if not paths:
        raise ValueError("paths is empty: need at least one image / mask to build the array")

    # the image size is compared against a tuple below, so a list would never compare equal
    desired_shape = tuple(desired_shape)

    original_shape = {}
    data = []

    for key, path in paths.items():
        # the context manager releases the file handle even if resizing / conversion fails
        with Image.open(path) as img:
            # save the size of the image before any resizing
            original_shape[key] = img.size

            # if image not of desired shape, resize it; nearest-neighbour resampling does not
            # interpolate, so the values of a mask are never blended into new ones
            if img.size != desired_shape:
                resized = img.resize(desired_shape, resample = Image.Resampling.NEAREST)
            else:
                resized = img

            # convert while the file is still open, as the pixel data may not have been read yet
            data.append(np.asarray(resized))

    data = np.stack(data, axis = 0) # convert list of numpy arrays into numpy array

    return original_shape, data

### 3. How to run the functions defined above (examples)

In [160]:
# base directory from which all data paths in the examples below are built
notebook_path = get_notebook_path()

In [161]:
# build the path from its components so that it works on every operating system
# (the hand-written literal also contained the invalid escape sequences '\H', '\C' and '\m')
path_metadata = os.path.join(
    notebook_path,
    'Hurricane_Harvey', 'vectors', 'random-split-_2022_11_17-22_35_45', 'CSV', 'metadata.json'
)
get_class_dict(path_metadata)

{0: 'Property Roof',
 1: 'Secondary Structure',
 2: 'Swimming Pool',
 3: 'Vehicle',
 4: 'Grass',
 5: 'Trees / Shrubs',
 6: 'Solar Panels',
 7: 'Chimney',
 8: 'Street Light',
 9: 'Window',
 10: 'Satellite Antenna',
 11: 'Garbage Bins',
 12: 'Trampoline',
 13: 'Road / Highway',
 14: 'Under Construction / In Progress Status',
 15: 'Power Lines & Cables',
 16: 'Bridge',
 17: 'Water Tank / Oil Tank',
 18: 'Parking Area - Commercial',
 19: 'Sports Complex / Arena',
 20: 'Industrial Site',
 21: 'Dense Vegetation / Forest',
 22: 'Water Body',
 23: 'Flooded',
 24: 'Boat',
 25: 'Parking Area'}

In [162]:
# os.path.join keeps the paths portable across operating systems (and avoids the invalid
# escape sequence '\H' of the hand-written literal); the empty last component keeps the
# trailing separator of the folder paths
X_path = os.path.join(notebook_path, 'Hurricane_Harvey', 'rasters', 'raw', '')
y_path = os.path.join(notebook_path, 'Hurricane_Harvey', 'vectors', 'random-split-_2022_11_17-22_35_45', 'Masks', '')

In [163]:
# split the raw images into training data (a mask exists) and test data (no mask exists)
X_train_paths, y_train_paths, X_test_paths = get_train_test_doc_paths(X_path, y_path, notebook_path)

Number of images for training: 299
Number of images for test: 75


In [164]:
# load all training images, resized to a common shape of (width, height) = (400, 300)
original_shape, data = get_data_as_np_array(X_train_paths, (400, 300))

In [165]:
# layout: (number of images, height, width, channels) - the last axis presumably holds the colour channels (TODO confirm)
data.shape

(299, 300, 400, 3)