In [12]:
import os
from   concurrent.futures import ThreadPoolExecutor
import opendal

In [13]:
def upload_file(remote, root_path, file_path):
    """Upload a single file to remote storage.

    The file at ``os.path.join(root_path, file_path)`` is read fully into
    memory and written to ``remote`` under the key ``file_path``.

    Args:
        remote: Object exposing ``write(path, data)`` (an opendal operator in
            this notebook).
        root_path: Local directory that ``file_path`` is relative to.
        file_path: Path of the file relative to ``root_path``; also used
            unchanged as the remote key.

    Returns:
        True if the file was read and written to ``remote``, False if any
        exception occurred. Errors are logged, never raised, so one bad file
        does not abort a batch running in a thread pool.
    """
    try:
        # Each message is printed together with its newline in one write
        # (end=""), so lines coming from concurrent worker threads are far
        # less likely to be glued together mid-line.
        print(f"Uploading: {root_path}/{file_path}\n", end="")
        with open(os.path.join(root_path, file_path), 'rb') as local_file:
            content = local_file.read()
        remote.write(file_path, content)
        print(f"Uploaded: {file_path}\n", end="")
        return True
    except Exception as e:
        # Best-effort by design: report the failure and let the caller decide.
        print(f"Error uploading {file_path}: {e}\n", end="")
        return False

def download_file(remote, root_path, file_path):
    """Download a single file from remote storage.

    The object stored under ``file_path`` is read fully into memory and
    written to ``os.path.join(root_path, file_path)``; missing parent
    directories are created first.

    Args:
        remote: Object exposing ``read(path)`` that returns the file content
            (an opendal operator in this notebook).
        root_path: Local directory the file is written under.
        file_path: Remote key; also used as the path relative to
            ``root_path``.

    Returns:
        True if the file was fetched and written locally, False if any
        exception occurred. Errors are logged, never raised, so one bad file
        does not abort a batch running in a thread pool.
    """
    try:
        # Each message is printed together with its newline in one write
        # (end=""), so lines coming from concurrent worker threads are far
        # less likely to be glued together mid-line.
        print(f"Downloading: {file_path}\n", end="")
        content = remote.read(file_path)
        local_file_path = os.path.join(root_path, file_path)
        parent_dir = os.path.dirname(local_file_path)
        # dirname is "" when the target sits directly in the current
        # directory; os.makedirs("") raises, so only create real parents.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(local_file_path, "wb") as local_file:
            local_file.write(content)
        print(f"Downloaded: {file_path}\n", end="")
        return True
    except Exception as e:
        # Best-effort by design: report the failure and let the caller decide.
        print(f"Error downloading {file_path}: {e}\n", end="")
        return False

def sync_storage_with_threading(root_path, local, remote, max_workers=4):
    """Copy files missing on either side between local and remote storage.

    Both stores are listed with ``scan("/")``. Only entries whose last path
    segment contains a dot are considered, so directory entries and names
    without an extension are ignored. Comparison is by path only: a file
    present on both sides is never transferred.

    Files found only on ``remote`` are downloaded first; once every download
    task has finished, files found only on ``local`` are uploaded. Transfers
    run on a thread pool of ``max_workers`` threads.

    Args:
        root_path: Local directory handed to the download/upload workers.
        local: Operator used to list the local side.
        remote: Operator used to list the remote side and passed to the
            workers for the actual transfers.
        max_workers: Size of the thread pool.
    """
    def _file_paths(operator):
        # Collect the paths whose final segment looks like "name.ext".
        found = set()
        for entry in operator.scan("/"):
            if '.' in entry.path.rsplit('/', 1)[-1]:
                found.add(entry.path)
        return found

    local_paths = _file_paths(local)
    remote_paths = _file_paths(remote)

    missing_locally = remote_paths.difference(local_paths)
    missing_remotely = local_paths.difference(remote_paths)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Phase 1: pull what only the remote has.
        if not missing_locally:
            print("No files to download.")
        else:
            print("Downloading files...")
            pending = []
            for path in missing_locally:
                pending.append(pool.submit(download_file, remote, root_path, path))
            for task in pending:
                task.result()  # block until this transfer is done

        # Phase 2: push what only the local side has.
        if not missing_remotely:
            print("No files to upload.")
        else:
            print("Uploading files...")
            pending = []
            for path in missing_remotely:
                pending.append(pool.submit(upload_file, remote, root_path, path))
            for task in pending:
                task.result()  # block until this transfer is done


In [14]:
# Folder that is mirrored; passed as `root` to every operator below, so the
# same path is used on the local filesystem and on both remote stores.
root_path = "/lakehouse/default/Files/tpcds/1"

# NOTE(review): endpoint_azure, account_key, endpoint_r2, key_id and secret are
# not defined in the cells visible here - they must already exist in the
# kernel (ideally loaded from the environment or a secret store) before this
# cell runs.
local = opendal.Operator("fs", root=root_path)
azure = opendal.Operator(
    "azdls",
    root=root_path,
    filesystem="aemo",
    endpoint=endpoint_azure,
    account_key=account_key,
)
r2 = opendal.Operator(
    "s3",
    root=root_path,
    bucket="aemo",
    region="APAC",
    endpoint=endpoint_r2,
    access_key_id=key_id,
    secret_access_key=secret,
)

In [18]:
# Two-way sync between the local folder and the Azure Data Lake filesystem.
sync_storage_with_threading(
    root_path,
    local,
    remote=azure,
    max_workers=4,
)

Downloading files...
Downloading: date_dim/data_0.parquet
Downloading: catalog_sales/data_1.parquet
Downloading: household_demographics/data_0.parquet
Downloading: inventory/data_5.parquet
Downloading: time_dim/data_0.parquet
Downloading: inventory/data_0.parquet
Downloaded: catalog_sales/data_1.parquetDownloading: store_sales/data_1.parquet
Downloaded: household_demographics/data_0.parquet

Downloading: web_sales/data_4.parquet
Downloaded: date_dim/data_0.parquet
Downloading: call_center/data_0.parquet
Downloaded: inventory/data_5.parquet
Downloading: catalog_sales/data_3.parquet
Downloaded: time_dim/data_0.parquet
Downloading: customer/data_0.parquet
Downloading: customer_demographics/data_2.parquet
Downloaded: inventory/data_0.parquet
Downloading: inventory/data_1.parquet
Downloading: customer_demographics/data_0.parquet
Downloaded: call_center/data_0.parquet
Downloading: income_band/data_0.parquet
Downloaded: web_sales/data_4.parquet
Downloaded: store_sales/data_1.parquet
Downloadi

In [15]:
# Two-way sync between the local folder and the R2 (S3-compatible) bucket.
sync_storage_with_threading(
    root_path,
    local,
    remote=r2,
    max_workers=4,
)

No files to download.
Uploading files...
Uploading: /lakehouse/default/Files/tpcds/1/date_dim/data_0.parquet
Uploading: /lakehouse/default/Files/tpcds/1/catalog_sales/data_1.parquet
Uploading: /lakehouse/default/Files/tpcds/1/household_demographics/data_0.parquet
Uploading: /lakehouse/default/Files/tpcds/1/inventory/data_5.parquet
Uploaded: household_demographics/data_0.parquet
Uploading: /lakehouse/default/Files/tpcds/1/time_dim/data_0.parquet
Uploaded: date_dim/data_0.parquet
Uploading: /lakehouse/default/Files/tpcds/1/inventory/data_0.parquet
Uploaded: catalog_sales/data_1.parquet
Uploading: /lakehouse/default/Files/tpcds/1/store_sales/data_1.parquet
Uploaded: inventory/data_5.parquetUploaded: inventory/data_0.parquet
Uploading: /lakehouse/default/Files/tpcds/1/web_sales/data_4.parquet
Uploaded: store_sales/data_1.parquet
Uploaded: time_dim/data_0.parquet
Uploading: /lakehouse/default/Files/tpcds/1/call_center/data_0.parquet
Uploading: /lakehouse/default/Files/tpcds/1/catalog_sales/