In [1]:
def get_optimal_chunk_size(shape, expected_memory_usage, client):
    """Suggest a chunk shape whose predicted memory use fits in one worker.

    The heuristic assumes that memory use scales linearly with the number of
    elements in a chunk. When the whole array is predicted to need more memory
    than the average worker limit, every axis is scaled by the same factor so
    that the chunk *volume* (not each axis) shrinks by the memory ratio. This
    yields the largest chunk, and therefore the fewest chunks, that still fit.

    Args:
        shape: Sequence with the length of each axis of the array.
        expected_memory_usage: Predicted memory needed to process the whole
            array as one chunk, in KB (1 KB = 1024 bytes).
        client: Object exposing ``scheduler_info()``; the returned mapping
            must contain a ``'workers'`` mapping whose values provide
            ``'memory_limit'`` in bytes.

    Returns:
        ``shape`` itself when the predicted usage fits in the average worker
        memory (or when ``shape`` has no axes); otherwise a tuple of ints with
        the reduced chunk length for every axis (each at least 1).

    Raises:
        ValueError: If the scheduler reports no workers.
    """
    # Get scheduler information to retrieve worker details
    workers = client.scheduler_info()['workers']
    num_workers = len(workers)
    if num_workers == 0:
        raise ValueError("Cannot compute a chunk size: the scheduler reports no workers.")

    # Average memory limit per worker, kept in bytes so that both sides of the
    # comparison below use the same unit.
    total_memory_bytes = sum(details['memory_limit'] for details in workers.values())
    memory_per_worker_bytes = total_memory_bytes / num_workers

    print(f"Total Workers: {num_workers}")
    print(f"Memory per Worker: {memory_per_worker_bytes / 1e9:.2f} GB")

    # Predicted memory usage is in KB, so convert it to bytes
    expected_memory_usage_bytes = expected_memory_usage * 1024

    ndim = len(shape)
    if ndim > 0 and expected_memory_usage_bytes > memory_per_worker_bytes:
        print("Expected memory usage exceeds memory per worker. Reducing chunk size.")
        chunk_size_ratio = memory_per_worker_bytes / expected_memory_usage_bytes
        # Spread the reduction evenly over the axes: scaling each of the ndim
        # axes by ratio ** (1 / ndim) scales the chunk volume by ratio.
        per_axis_scale = chunk_size_ratio ** (1 / ndim)
        # int() truncates, so the chunk never exceeds the budget; the lower
        # bound of 1 avoids invalid zero-length chunk dimensions.
        chunk_size = tuple(max(1, int(dim * per_axis_scale)) for dim in shape)
    else:
        # If memory usage is within limits, use full shape as chunk
        chunk_size = shape

    print(f"Optimal Chunk Size: {chunk_size}")
    return chunk_size

Baseado na heurística acima, temos 2 cenários:
1. Quando o consumo de memória esperado é superior à memória disponível por worker
2. Quando o consumo de memória esperado é inferior à memória disponível por worker

No caso 1, estamos hoje encontrando a razão entre a memória disponível por worker e a memória esperada, e usando isso como fator para definir o chunk_size. Desse modo, nós geramos a menor quantidade possível de chunks de modo que eles caibam na memória dos workers

No caso 2, nós basicamente estamos usando o tamanho do dado como chunk_size

In [2]:
import os
import sys

# Absolute locations of the project-local libraries, resolved relative to
# the notebook's working directory.
helpers_path = os.path.abspath('../libs/helpers')
traceq_path = os.path.abspath('../libs/traceq')

# Add each directory to the import path only when it is not there yet, so
# re-running this cell does not create duplicate entries.
if helpers_path not in sys.path:
    sys.path.append(helpers_path)
if traceq_path not in sys.path:
    sys.path.append(traceq_path)

print(sys.path)

['/home/delucca/.pyenv/versions/3.10.14/lib/python310.zip', '/home/delucca/.pyenv/versions/3.10.14/lib/python3.10', '/home/delucca/.pyenv/versions/3.10.14/lib/python3.10/lib-dynload', '', '/home/delucca/.pyenv/versions/3.10.14/envs/dask-auto-chunking/lib/python3.10/site-packages', '/home/delucca/src/unicamp/msc/dask-auto-chunking/libs/helpers', '/home/delucca/src/unicamp/msc/dask-auto-chunking/libs/traceq']


In [3]:
import uuid
import os

from datetime import datetime

# Unique id for this run: notebook number, launch timestamp and a short
# random suffix, joined with dashes.
EXPERIMENT_ID = '-'.join((
    '009',
    datetime.now().strftime("%Y%m%d%H%M%S"),
    uuid.uuid4().hex[:6],
))
OUTPUT_DIR = './output/' + EXPERIMENT_ID

# No exist_ok here: a fresh directory is expected for every run.
os.makedirs(OUTPUT_DIR)

OUTPUT_DIR

'./output/009-20241008165725-edc3ae'

In [4]:
import dask

from bokeh.io import output_notebook

# Load Bokeh's notebook integration so Bokeh figures render inline in cell outputs
# (presumably required by the profiler visualizations displayed later — confirm).
output_notebook()

# Switch off the "distributed.diagnostics.nvml" option, i.e. Dask's GPU diagnostics.
dask.config.set({"distributed.diagnostics.nvml": False})

<dask.config.set at 0x7f27801b7280>

In [5]:

import time

import dask.array as da
from dask.diagnostics import ResourceProfiler
from dask.distributed import Client
from helpers.dask_operators import envelope_from_ndarray, load_segy


def run_envelope(chunk_size='auto', n_workers=1, n_threads=1, max_memory=16):
    """Run the envelope operator on the synthetic volume and report its cost.

    Starts a local Dask ``Client``, loads the volume referenced by the
    notebook-level ``synthetic_data_path``, wraps it in a Dask array with the
    requested chunking and applies ``envelope_from_ndarray``. Afterwards the
    elapsed wall-clock time is printed and the resource profile is displayed.

    Args:
        chunk_size: Value forwarded to ``da.from_array(..., chunks=...)``.
        n_workers: Number of workers of the local cluster.
        n_threads: Threads per worker.
        max_memory: Total memory budget in GB; it is split evenly, so each
            worker gets ``max_memory / n_workers`` GB as its memory limit.

    Notes:
        - Relies on the global ``synthetic_data_path`` and on IPython's
          ``display`` being available in the notebook namespace.
        - The measured time starts before ``load_segy`` and therefore
          includes data loading and Dask array creation.
    """
    client = Client(n_workers=n_workers, threads_per_worker=n_threads, memory_limit=f'{max_memory / n_workers}GB')

    # Use Dask Profiler to monitor resource usage
    resource_profiler = ResourceProfiler()

    with resource_profiler:
        start_time = time.time()
        try:
            synthetic_data = load_segy(synthetic_data_path)
            print("Data shape: ", synthetic_data.shape)

            X = da.from_array(synthetic_data, chunks=chunk_size)
            print("Chunks: ", X.chunks)
            print("Number of chunks along each axis:", [len(c) for c in X.chunks])

            # NOTE(review): the result is not used afterwards; whether the
            # computation is triggered here depends on the helper — confirm.
            result = envelope_from_ndarray(X)
        finally:
            # Always stop the clock and release the cluster, even on failure.
            end_time = time.time()
            client.close()

    execution_time = end_time - start_time
    print(f"Execution time: {execution_time:.2f} seconds")

    resource_visualization = resource_profiler.visualize()
    display(resource_visualization)

## Caso 2

In [6]:
from helpers.datasets import generate_seismic_data

# Directory passed to the generator as the output location for this experiment.
DATA_OUTPUT_DIR = OUTPUT_DIR + '/experiment'

# 300 x 300 x 300 synthetic volume; the returned value is kept in a global
# that the run_* helpers later hand to load_segy.
synthetic_data_path = generate_seismic_data(
    output_dir=DATA_OUTPUT_DIR,
    inlines=300,
    xlines=300,
    samples=300,
)

2024-10-08 16:57:26 - generate-seismic-data - INFO - Generating synthetic data for shape (300, 300, 300)


### 1 worker

In [7]:
# Baseline: default chunk_size='auto' with the default single worker and single thread.
run_envelope()

Data shape:  (300, 300, 300)
Chunks:  ((300,), (300,), (300,))
Number of chunks along each axis: [1, 1, 1]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 0.81 seconds


In [8]:
# One single chunk covering the whole (300, 300, 300) volume.

run_envelope((300, 300, 300))

Data shape:  (300, 300, 300)
Chunks:  ((300,), (300,), (300,))
Number of chunks along each axis: [1, 1, 1]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 0.76 seconds


In [9]:
# 3 chunks per axis (3 x 3 x 3 = 27 chunks of 100 x 100 x 100).

run_envelope((100, 100, 100))

Data shape:  (300, 300, 300)
Chunks:  ((100, 100, 100), (100, 100, 100), (100, 100, 100))
Number of chunks along each axis: [3, 3, 3]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 1.08 seconds


In [10]:
# 6 chunks per axis (6 x 6 x 6 = 216 chunks of 50 x 50 x 50).

run_envelope((50, 50, 50))

Data shape:  (300, 300, 300)
Chunks:  ((50, 50, 50, 50, 50, 50), (50, 50, 50, 50, 50, 50), (50, 50, 50, 50, 50, 50))
Number of chunks along each axis: [6, 6, 6]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 2.30 seconds


### Múltiplos workers

In [11]:
# Cluster size for the multi-worker runs; n_threads is the number of threads per worker.
n_workers = 3
n_threads = 3

# ~5 GB per worker with run_envelope's default max_memory=16 (16 GB / 3 workers).

In [12]:
# Baseline (auto-chunking) on the multi-worker cluster.

run_envelope(n_workers=n_workers, n_threads=n_threads)

Data shape:  (300, 300, 300)
Chunks:  ((300,), (300,), (300,))
Number of chunks along each axis: [1, 1, 1]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 0.78 seconds


In [13]:
# One single chunk covering the whole volume.

run_envelope(chunk_size=(300, 300, 300), n_workers=n_workers, n_threads=n_threads)

Data shape:  (300, 300, 300)
Chunks:  ((300,), (300,), (300,))
Number of chunks along each axis: [1, 1, 1]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 0.75 seconds


In [14]:
# Chunks per axis == n_workers (3 per axis, 27 chunks in total).

run_envelope(chunk_size=(100, 100, 100), n_workers=n_workers, n_threads=n_threads)

Data shape:  (300, 300, 300)
Chunks:  ((100, 100, 100), (100, 100, 100), (100, 100, 100))
Number of chunks along each axis: [3, 3, 3]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 0.88 seconds


In [15]:
# Chunks per axis > n_workers (5 per axis, 125 chunks in total).

run_envelope(chunk_size=(60, 60, 60), n_workers=n_workers, n_threads=n_threads)

Data shape:  (300, 300, 300)
Chunks:  ((60, 60, 60, 60, 60), (60, 60, 60, 60, 60), (60, 60, 60, 60, 60))
Number of chunks along each axis: [5, 5, 5]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 1.02 seconds


In [16]:
# Chunks per axis much larger than n_workers (10 per axis, 1000 chunks in total).

run_envelope(chunk_size=(30, 30, 30), n_workers=n_workers, n_threads=n_threads)

Data shape:  (300, 300, 300)
Chunks:  ((30, 30, 30, 30, 30, 30, 30, 30, 30, 30), (30, 30, 30, 30, 30, 30, 30, 30, 30, 30), (30, 30, 30, 30, 30, 30, 30, 30, 30, 30))
Number of chunks along each axis: [10, 10, 10]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 2.88 seconds


Quanto mais chunks, mais lento fica. Pode ser que isso aconteça por conta da simplicidade do operador. Vamos tentar com algo mais complexo

### GST3D

In [17]:
import time
import dask.array as da
from dask.diagnostics import ResourceProfiler
from dask.distributed import Client
from helpers.dask_operators import gradient_structure_tensor_from_dask_array, load_segy


def run_gst3d(chunk_size='auto', n_workers=1, n_threads=1, max_memory=16):
    """Time the gradient-structure-tensor operator on the synthetic volume.

    A local Dask ``Client`` is created with ``n_workers`` workers,
    ``n_threads`` threads each and ``max_memory / n_workers`` GB per worker.
    The volume referenced by the global ``synthetic_data_path`` is loaded,
    chunked according to ``chunk_size`` and passed to
    ``gradient_structure_tensor_from_dask_array``. The elapsed time (data
    loading included) is printed and the resource profile is displayed.
    """
    per_worker_limit = f'{max_memory / n_workers}GB'
    client = Client(n_workers=n_workers, threads_per_worker=n_threads, memory_limit=per_worker_limit)

    # Collects resource usage samples while the block below runs
    profiler = ResourceProfiler()

    with profiler:
        started_at = time.time()
        try:
            volume = load_segy(synthetic_data_path)
            print("Data shape: ", volume.shape)

            dask_volume = da.from_array(volume, chunks=chunk_size)
            print("Chunks: ", dask_volume.chunks)
            chunks_per_axis = [len(axis_chunks) for axis_chunks in dask_volume.chunks]
            print("Number of chunks along each axis:", chunks_per_axis)

            gst_result = gradient_structure_tensor_from_dask_array(dask_volume)
        finally:
            # Stop the clock and shut the cluster down whether or not the run succeeded
            finished_at = time.time()
            client.close()

    print(f"Execution time: {finished_at - started_at:.2f} seconds")

    display(profiler.visualize())

In [19]:
# Baseline (auto-chunking); max_memory=32 is split evenly across the workers.

run_gst3d(n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((300,), (300,), (300,))
Number of chunks along each axis: [1, 1, 1]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 14.68 seconds


In [20]:
# Chunks per axis (2) < n_workers (3); 8 chunks in total.

run_gst3d(chunk_size=(150, 150, 150), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((150, 150), (150, 150), (150, 150))
Number of chunks along each axis: [2, 2, 2]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 5.37 seconds


In [21]:
# Chunks per axis (3) == n_workers (3); 27 chunks in total.

run_gst3d(chunk_size=(100, 100, 100), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((100, 100, 100), (100, 100, 100), (100, 100, 100))
Number of chunks along each axis: [3, 3, 3]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 7.35 seconds


In [22]:
# Chunks per axis (6) > n_workers (3); 216 chunks in total.

run_gst3d(chunk_size=(50, 50, 50), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((50, 50, 50, 50, 50, 50), (50, 50, 50, 50, 50, 50), (50, 50, 50, 50, 50, 50))
Number of chunks along each axis: [6, 6, 6]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 30.04 seconds


In [23]:
# Chunks per axis (10) much larger than n_workers (3); 1000 chunks in total.

run_gst3d(chunk_size=(30, 30, 30), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((30, 30, 30, 30, 30, 30, 30, 30, 30, 30), (30, 30, 30, 30, 30, 30, 30, 30, 30, 30), (30, 30, 30, 30, 30, 30, 30, 30, 30, 30))
Number of chunks along each axis: [10, 10, 10]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 135.62 seconds


Aparentemente, um pouco de paralelismo é positivo, mas não podemos chegar a ter um chunk por worker

## Caso 1

In [24]:
# Larger cluster for "Caso 1"; this rebinds the n_workers / n_threads used by the cells below.
n_workers = 10
n_threads = 10

# ~3 GB per worker: the runs below pass max_memory=32 (32 GB / 10 workers).

In [25]:
# Baseline (auto-chunking): the whole volume ends up in a single chunk.
# In the recorded run this cell failed with KilledWorker.

run_gst3d(n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((300,), (300,), (300,))
Number of chunks along each axis: [1, 1, 1]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
2024-10-08 17:06:07,436 - distributed.scheduler - ERROR - Task ('compute_3d_dip-6840eb8334a2f75450dedb0dda82d0ac', 0, 0, 0) marked as failed because 4 workers died while trying to run it


KilledWorker: Attempted to run task ('compute_3d_dip-6840eb8334a2f75450dedb0dda82d0ac', 0, 0, 0) on 4 different workers, but all those workers died while running it. The last worker that attempt to run the task was tcp://127.0.0.1:44617. Inspecting worker logs is often a good next step to diagnose what went wrong. For more information see https://distributed.dask.org/en/stable/killed.html.

In [26]:
# Chunks per axis (2) much smaller than n_workers (10); 8 chunks in total.

run_gst3d(chunk_size=(150, 150, 150), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((150, 150), (150, 150), (150, 150))
Number of chunks along each axis: [2, 2, 2]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 5.74 seconds


In [27]:
# Chunks per axis (6) < n_workers (10); 216 chunks in total.

run_gst3d(chunk_size=(50, 50, 50), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((50, 50, 50, 50, 50, 50), (50, 50, 50, 50, 50, 50), (50, 50, 50, 50, 50, 50))
Number of chunks along each axis: [6, 6, 6]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 26.05 seconds


In [28]:
# Chunks per axis (10) == n_workers (10); 1000 chunks in total.

run_gst3d(chunk_size=(30, 30, 30), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((30, 30, 30, 30, 30, 30, 30, 30, 30, 30), (30, 30, 30, 30, 30, 30, 30, 30, 30, 30), (30, 30, 30, 30, 30, 30, 30, 30, 30, 30))
Number of chunks along each axis: [10, 10, 10]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 114.18 seconds


In [29]:
# Chunks per axis (20) > n_workers (10); 8000 chunks in total.

run_gst3d(chunk_size=(15, 15, 15), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), (15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), (15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15))
Number of chunks along each axis: [20, 20, 20]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Execution time: 954.76 seconds


In [None]:
# Chunks per axis (30) much larger than n_workers (10); 27000 chunks in total.
# No execution time was recorded for this cell in the saved notebook.

run_gst3d(chunk_size=(10, 10, 10), n_workers=n_workers, n_threads=n_threads, max_memory=32)

Data shape:  (300, 300, 300)
Chunks:  ((10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), (10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), (10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10))
Number of chunks along each axis: [30, 30, 30]


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


Com base nisso, acredito que a melhor estratégia é definir a menor quantidade de chunks possível