# Deep Learning homework

## Installing dependencies

In [None]:
%pip install requests
%pip install tqdm
%pip install sklearn

## Data fetching
We use a streaming solution to fetch the data so that we are able to track the download progress.

In [None]:
from tqdm import tqdm
import requests
import os

def download_file(uri, target_path):
    """Download ``uri`` to ``target_path`` in chunks, showing a progress bar.

    The parent directory of ``target_path`` is created when it is missing.
    After the transfer a one-line status message is printed; it reports an
    error when the server announced a ``content-length`` that differs from
    the number of bytes received.

    Args:
        uri: URL of the resource to fetch.
        target_path: Path of the local file to write.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """
    # Create directory path to target file. A bare file name has an empty
    # directory part, and os.makedirs("") would raise, so skip it in that case.
    target_dir = os.path.dirname(target_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    block_size = 1024 # Download in chunks for progress tracking
    downloaded_bytes = 0

    # Download file using streaming, so we can iterate over the response.
    # The context manager releases the connection even if writing fails.
    with requests.get(uri, stream=True) as response:
        # Fail early instead of saving an HTTP error page as if it were the data
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get('content-length', 0)) # Total size of data to download

        with open(target_path, 'wb') as file, \
                tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
            for data in response.iter_content(block_size):
                downloaded_bytes += len(data)
                progress_bar.update(len(data))
                file.write(data) # Write downloaded chunk to file

    # A size of 0 means the server did not announce a length, so nothing can be verified
    if total_size_in_bytes != 0 and downloaded_bytes != total_size_in_bytes:
        print(f"Error during download of {target_path}")
    else:
        print(f"Downloading {target_path} finished successfully")

First, we fetch the images from the corresponding Google Drive folder.

In [None]:
# Remote location of the image archive on Google Drive
uri_images = "https://drive.google.com/uc?export=download&id=1g7qNOZz9wC7OfOhcPqH1EZ5bk1UFGmlL&confirm=t&uuid=729c215d-4fa4-4799-b03f-aea00a016230&at=ALAFpqx7EciTPuBT0YNhhbYsVpML:1666561770553"

# Local destinations: where the ZIP file is stored and where it gets extracted to
images_file_path = "../data/fairface.zip"
dir_path = "../data/fairface"

download_file(uri=uri_images, target_path=images_file_path)

Then we fetch the CSV files containing the labels for the images.

In [None]:
# Local targets are already named after the subsets they will become:
# the original "train" labels are later split into train and valid,
# while the original "val" labels are used as the test set.
labels_train_valid_file_path = os.path.join(dir_path, "labels_train_valid.csv")
labels_test_file_path = os.path.join(dir_path, "labels_test.csv")

uri_labels_train = "https://drive.google.com/uc?export=download&id=1i1L3Yqwaio7YSOCj7ftgk8ZZchPG7dmH"
uri_labels_val = "https://drive.google.com/uc?export=download&id=1wOdja-ezstMEp81tX1a-EYkFebev4h7D"

# Fetch the two CSV files one after the other, train/valid labels first
for label_uri, label_path in (
    (uri_labels_train, labels_train_valid_file_path),
    (uri_labels_val, labels_test_file_path),
):
    download_file(label_uri, label_path)

### Data extraction
The downloaded ZIP archive needs to be uncompressed first. Afterwards, the labels are read from the downloaded CSV files.

In [None]:
from zipfile import ZipFile

# Unpack every member of the downloaded archive into the data directory.
# The handle is deliberately not called "zip": in a notebook that name would
# stay bound globally and shadow the built-in zip() for all later cells.
with ZipFile(images_file_path) as archive:
    archive.extractall(dir_path)

In [None]:
# The archive has been unpacked, so the ZIP file itself is no longer needed
os.unlink(images_file_path)

Next, we read the labels into memory.

In [None]:
import numpy as np

# Both CSV files are parsed the same way: comma-separated, every field kept
# as a string, and the header row skipped.
label_csv_options = {"delimiter": ",", "skiprows": 1, "dtype": "str"}

labels_train_valid = np.loadtxt(labels_train_valid_file_path, **label_csv_options)
labels_test = np.loadtxt(labels_test_file_path, **label_csv_options)

## Data segmentation
Finally, we split the data into train, validation and test datasets for further use by our model.

Data in the downloaded dataset is already split into *train* and *val* subsets (the latter makes up about 10% of all images). Since we need to split the dataset into train, validation and test subsets, we will turn the specified *val* subset into the test subset and split the specified *train* subset into train and validation subsets.

The resulting split ratios are as follows:
+ train: ~74%
+ validation: ~15%
+ test: ~11%

In [None]:
# Directory names used from here on
dir_train_valid_path = os.path.join(dir_path, "train_valid")
dir_test_path = os.path.join(dir_path, "test")

# Rename the extracted directories: "train" becomes "train_valid", "val" becomes "test"
for original_name, renamed_path in (
    ("train", dir_train_valid_path),
    ("val", dir_test_path),
):
    os.rename(os.path.join(dir_path, original_name), renamed_path)

In [None]:
from sklearn.model_selection import train_test_split

# Number of directory entries (presumably one per image) in each directory
train_valid_len = len(os.listdir(dir_train_valid_path))
test_len = len(os.listdir(dir_test_path))

# The validation subset should hold 15% of ALL images, so convert that share
# into a fraction of the "train_valid" subset, which is what actually gets split.
validation_size = (train_valid_len + test_len) * 0.15 / train_valid_len

# Fixed random_state keeps the split reproducible between runs
labels_train, labels_valid = train_test_split(
    labels_train_valid,
    test_size=validation_size,
    random_state=42,
)

We have successfully created the three subsets: *train*, *valid* and *test*. (Note that in the file system, only the *test* subset is in a separate directory, because it was already separated that way in the original dataset. Physically separating the other two subsets would be an unnecessary operation.)

In [33]:
# Size of each subset, taken from the number of label rows
train_images_count, valid_images_count, test_images_count = map(
    len, (labels_train, labels_valid, labels_test)
)
images_count = sum((train_images_count, valid_images_count, test_images_count))

# Report the absolute size and the share of the whole dataset for every subset
print(f"train: {train_images_count} images ({train_images_count / images_count * 100:.1f}%)")
print(f"valid: {valid_images_count} images ({valid_images_count / images_count * 100:.1f}%)")
print(f"test:  {test_images_count} images ({test_images_count / images_count * 100:.1f}%)")

train: 72089 images (73.8%)
valid: 14655 images (15.0%)
test:  10954 images (11.2%)
