# Deep Learning homework

## Installing dependencies

In [None]:
%pip install requests
%pip install tqdm

## Data fetching
First, we download the image dataset as a single ZIP archive from Google Drive.

In [None]:
import os

import requests
from tqdm import tqdm

uri = "https://drive.google.com/uc?export=download&id=1g7qNOZz9wC7OfOhcPqH1EZ5bk1UFGmlL&confirm=t&uuid=729c215d-4fa4-4799-b03f-aea00a016230&at=ALAFpqx7EciTPuBT0YNhhbYsVpML:1666561770553"
file_path = "../data/fairface.zip" # Path of downloaded ZIP file

# Make sure the target directory exists; open() below fails otherwise
os.makedirs(os.path.dirname(file_path), exist_ok=True)

block_size = 1024 # Download in chunks for progress tracking

# Download file using streaming, so we can iterate over the response.
# The context manager releases the connection when done, and the timeout
# keeps the cell from hanging forever on a stalled connection.
with requests.get(uri, stream=True, timeout=60) as response:
    # Fail loudly on an HTTP error instead of saving an error page as the ZIP
    response.raise_for_status()

    # Total size of data to download (0 if the server does not report it)
    total_size_in_bytes = int(response.headers.get('content-length', 0))

    with open(file_path, 'wb') as file:
        # Use a progress bar to track progress; closed even if the loop raises
        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data) # Write downloaded chunk to file

# The size check is only possible when the server reported a content length
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
    print("Error during download")
else:
    print("Download finished successfully")

## Data extraction
The downloaded archive needs to be uncompressed first. The training labels are then extracted from the file names.

In [None]:
from zipfile import ZipFile

dir_path = "../data/fairface" # Path of directory to extract into

# Unpack the whole archive into dir_path. The handle is deliberately not
# named `zip`: in a notebook that would replace the built-in zip() for every
# cell executed afterwards.
with ZipFile(file_path) as archive:
    archive.extractall(dir_path)

In [None]:
# The ZIP file is no longer needed once its contents are extracted, so remove it
import os

os.unlink(file_path)

## Data segmentation
Finally, we split the data into train, validation and test datasets for further use by our model.

Data in the downloaded dataset is already split into *train* and *val* subsets (the latter makes up about 11% of all images). Since we need to split the dataset into train, validation and test subsets, we will turn the specified *val* subset into the test subset and split the specified *train* subset into train and validation subsets.

The resulting split ratios are as follows:
+ train: ~74%
+ validation: ~15%
+ test: ~11%

In [None]:
# Rename the extracted subset directories in place:
#   "val"   becomes "test"
#   "train" becomes "train_valid"
for old_name, new_name in (("val", "test"), ("train", "train_valid")):
    source_dir = os.path.join(dir_path, old_name)
    target_dir = os.path.join(dir_path, new_name)
    os.rename(source_dir, target_dir)