<a href="https://colab.research.google.com/github/emely3h/Geospatial_ML/blob/feature%2Fadd-data-generators-to-fix-ram-problem/combine_npz_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Combine npz files

This is a notebook for the last step in the prepare data pipeline as we did not have enough RAM to run it locally. To train the model on the entire dataset it is more convenient to have all tile-arrays of all images in one .npz file. 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#! ls
%cd drive/MyDrive/MachineLearning
%cd Geospatial_ML
! ls

/content/drive/.shortcut-targets-by-id/15HUD3sGdfvxy5Y_bjvuXgrzwxt7TzRfm/MachineLearning
/content/drive/.shortcut-targets-by-id/15HUD3sGdfvxy5Y_bjvuXgrzwxt7TzRfm/MachineLearning/Geospatial_ML
architecture.drawio  colab.py	       experiments   __pycache__
colab-new.py	     data_exploration  models	     README.md
colab_new.py	     evaluation        prepare_data  requirements.txt


In [3]:
import numpy as np
import os
import pickle
import datetime
data_path = "../data_colab/256_200"

uncompressed file is 2GB, 50MB and compressed 274 MB
=> loading/ decompressing all arrays takes ~ 15 * 2,04 GB = 31 GB
=> loading all images into RAM still works but compressing them fails

all 5 images decompressed in memory ~ 18 GB RAM

- combining 8 images with savez() takes 20 GB < 5min, < 20GB System RAM
- combining 8 images with savez_compressed() takes 1,84 GB > 10min, ~ 30 GB System RAM
- trying to combine 11 images with savez_compressed crashed during loading 9th image


In [None]:
# Problem: running out of ram when trying to save more than 5 images in one compressed npz, crashing always just at the savez_compressed() step
# => combining only 5 images into one file and then trying to combine those 2 files if possible
# => better way? Why does savez_compressed() consume most RAM?
# 50GB not enough for saving 10 images => loading + decompressing images takes ~ 30 GB why does last step, saving take so much RAM?
def combine_npz_arrays(data_path):
    count = 0
    arrays_dict = {}
    print(f'Started at: {datetime.datetime.now()}')
    for file in os.listdir(data_path):
        if file != '2022_08_09.npz' and count < 9: # Todo: find out problem with image 2022_08_09 => crashes when trying to access x_input
            print(f'Adding image {file}')
            array = np.load(f'{data_path}/{file}')
            x_input = array['x_input']
            y_mask = array['y_mask']
            if len(arrays_dict) < 1:
                arrays_dict['x_input'] = x_input
                arrays_dict['y_mask'] = y_mask
            else:
                arrays_dict['x_input'] = np.concatenate((arrays_dict['x_input'], x_input), axis=0)
                arrays_dict['y_mask'] = np.concatenate((arrays_dict['y_mask'], y_mask), axis=0)
        print(arrays_dict['x_input'].shape)
        print(arrays_dict['y_mask'].shape)
        print()
        count += 1
    # test time to execute and file size of np.savez and np.savez_compressed
    np.savez_compressed(f'{data_path}/all_images_1', **arrays_dict) #savez_compressed
    print('Combined all compressed numpy images into one single file.')
    print(f'Finished at: {datetime.datetime.now()}')

combine_npz_arrays("../data_colab/256_200")

Started at: 2023-03-28 17:01:23.368041
Adding image 2022_10_13.npz
(889, 256, 256, 5)
(889, 256, 256)

Adding image 2022_07_15.npz
(1753, 256, 256, 5)
(1753, 256, 256)

Adding image 2022_09_18.npz
(2927, 256, 256, 5)
(2927, 256, 256)

Adding image 2022_06_20.npz
(4178, 256, 256, 5)
(4178, 256, 256)

Adding image 2022_10_23.npz
(5342, 256, 256, 5)
(5342, 256, 256)

Adding image 2022_07_25.npz
(6600, 256, 256, 5)
(6600, 256, 256)

Adding image 2022_08_04.npz
(7919, 256, 256, 5)
(7919, 256, 256)

Adding image 2022_07_10.npz
(9242, 256, 256, 5)
(9242, 256, 256)

Adding image 2022_07_30.npz


In [7]:
array = np.load(f'{data_path}/2022_06_20.npz')
x_input = array['x_input']
y_mask = array['y_mask']
print(x_input.shape)
print(y_mask.shape)
print(array.files)

(1251, 256, 256, 5)
(1251, 256, 256)
['y_mask', 'x_input']


In [None]:
data_path = "../data_colab/256_200"

total_tiles = 0
for file in os.listdir(data_path):
  if not os.path.isdir(os.path.join(data_path, file)):
   
    print(f'Image: {file}')
    array = np.load(f'{data_path}/{file}')
    total_tiles += array['x_input'].shape[0]
    print(array['x_input'].shape)
    print(array['y_mask'].shape)
    print()

print(f'Total amount of tiles {total_tiles}')

Image: 2022_10_13.npz
(889, 256, 256, 5)
(889, 256, 256)

Image: 2022_07_15.npz
(864, 256, 256, 5)
(864, 256, 256)

Image: 2022_09_18.npz
(1174, 256, 256, 5)
(1174, 256, 256)

Image: 2022_06_20.npz
(1251, 256, 256, 5)
(1251, 256, 256)

Image: 2022_10_23.npz
(1164, 256, 256, 5)
(1164, 256, 256)

Image: 2022_07_25.npz
(1258, 256, 256, 5)
(1258, 256, 256)

Image: 2022_08_04.npz
(1319, 256, 256, 5)
(1319, 256, 256)

Image: 2022_07_10.npz
(1323, 256, 256, 5)
(1323, 256, 256)

Image: 2022_07_30.npz
(1183, 256, 256, 5)
(1183, 256, 256)

Image: 2022_08_14.npz
(1179, 256, 256, 5)
(1179, 256, 256)

Image: 2022_08_24.npz
(1306, 256, 256, 5)
(1306, 256, 256)

Image: 2022_09_03.npz
(1196, 256, 256, 5)
(1196, 256, 256)

Image: 2022_12_12.npz
(957, 256, 256, 5)
(957, 256, 256)

Image: 2022_09_08.npz
(927, 256, 256, 5)
(927, 256, 256)

Image: 2022_12_02.npz
(1142, 256, 256, 5)
(1142, 256, 256)

Image: 2022_09_13.npz
(1175, 256, 256, 5)
(1175, 256, 256)

Image: 2022_08_09.npz
(1181, 256, 256, 5)
(1181,

In [24]:
# Determine the shape of the output array
output_shape = (2574, 256, 256)

# Create a memory-mapped array to hold the output data
output_file = np.memmap(os.path.join(data_path, "combined_y_mask.npy"), mode="w+", shape=output_shape, dtype=np.float32)
file_count = 0
# Iterate over each compressed numpy array
for file in os.listdir(data_path):
  if not os.path.isdir(os.path.join(data_path, file)) and not file.startswith('combined') and not file.startswith('compressed'):
    file_count += 1
    print(f'loading file {file_count}: {file}')
    # Load the compressed numpy array in chunks using np.memmap
    with np.load(os.path.join(data_path, file), mmap_mode="r") as data:
        chunk_size = 50  # Number of samples to load per chunk
        num_chunks = data["y_mask"].shape[0] // chunk_size
        for j in range(num_chunks):
            print(f'Chunk {j}')
            start_idx = (file_count - 1) * num_chunks * chunk_size + j * chunk_size
            end_idx = start_idx + chunk_size
            # Write the chunk to the output file using the memory-mapped array
            print(f'output file indexes: {start_idx} : {end_idx}  Chunk shape {data["y_mask"][j * chunk_size:(j + 1) * chunk_size, ...].shape}')
            output_file[start_idx:end_idx, ...] = data["y_mask"][j * chunk_size:(j + 1) * chunk_size, ...]
print('finished concatenating arrays')
output_file.flush()
print('finished flushing')

# Problem: file not saved in drive...? => takes time to sync

loading file 1: 2022_10_13.npz
Chunk 0
output file indexes: 0 : 50  Chunk shape (50, 256, 256)
Chunk 1
output file indexes: 50 : 100  Chunk shape (50, 256, 256)
Chunk 2
output file indexes: 100 : 150  Chunk shape (50, 256, 256)
Chunk 3
output file indexes: 150 : 200  Chunk shape (50, 256, 256)
Chunk 4
output file indexes: 200 : 250  Chunk shape (50, 256, 256)
Chunk 5
output file indexes: 250 : 300  Chunk shape (50, 256, 256)
Chunk 6
output file indexes: 300 : 350  Chunk shape (50, 256, 256)
Chunk 7
output file indexes: 350 : 400  Chunk shape (50, 256, 256)
Chunk 8
output file indexes: 400 : 450  Chunk shape (50, 256, 256)
Chunk 9
output file indexes: 450 : 500  Chunk shape (50, 256, 256)
Chunk 10
output file indexes: 500 : 550  Chunk shape (50, 256, 256)
Chunk 11
output file indexes: 550 : 600  Chunk shape (50, 256, 256)
Chunk 12
output file indexes: 600 : 650  Chunk shape (50, 256, 256)
Chunk 13
output file indexes: 650 : 700  Chunk shape (50, 256, 256)
Chunk 14
output file indexes: 7

In [31]:
np.savez_compressed(os.path.join(data_path, "compressed_combined_y_mask.npz"), x_input=output_file)

# Delete the memory-mapped array to free up resources
del output_file

In [25]:
array = np.load(f'{data_path}/compressed_combined_x_input.npz', allow_pickle=True)
x_input = array['x_input']
print(array.files)
#y_mask = array['y_mask']
print(x_input.shape)
#print(y_mask.shape)

['x_input']
(19488, 256, 256, 5)


In [26]:
array = np.load(f'{data_path}/combined_x_input.npy')
x_input = array['x_input']
y_mask = array['y_mask']
print(x_input.shape)
print(y_mask.shape)

ValueError: ignored

In [34]:
array_y.files
array_y['x_input'].shape

(2574, 256, 256)

time to execute npy: 50 min

system ram needed: ~10 GB

crashed on last img 2022_08_09: ValueError                                Traceback (most recent call last)

<ipython-input-19-2ced60a8ab90> in <module>
     19             end_idx = start_idx + chunk_size
     20             # Write the chunk to the output file using the memory-mapped array
---> 21             output_file[start_idx:end_idx, ...] = data["x_input"][j * chunk_size:(j + 1) * chunk_size, ...]
     22 
     23 # Delete the memory-mapped array to free up resources

ValueError: could not broadcast input array from shape (50,256,256,5) into shape (38,256,256,5)



file size npz
time to execute npz

=> saved as npy can not be read but when npy is then saved as compressed npz readin/ loading works, takes just more time but files are smaller