# Upload/Download Data to/from Cloud Storage

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import os
from glob import glob
from typing import Dict
from zipfile import ZipFile

from azure.storage.blob import BlobServiceClient
from dotenv import find_dotenv, load_dotenv

## About

Processed data will be uploaded to or downloaded from [Azure blob storage](https://azure.microsoft.com/en-us/services/storage/blobs/).

## User Inputs

In [None]:
# Current working directory at the time this cell runs.
# NOTE(review): presumably the project root, as the name suggests — confirm
# the notebook is launched from there.
PROJ_ROOT_DIR = os.getcwd()

In [None]:
# Upload inputs
# Suffixes appended to the BLOB_NAME_PREFIX environment variable to form a blob
# name. Only the first entry is used when the upload mapping is built below.
blob_name_suffixes = [84]

# Download inputs
# Suffix appended to BLOB_NAME_PREFIX to name the blob to download.
dload_blob_suffix = 84
# Local file path the downloaded blob is written to.
dload_fpath = (
    "data/processed/filtered_transformed_filledmissing_data__20211209_120056.zip"
)

# Action to perform
# "upload" runs the upload cell and "download" runs the download cell. The
# blob-name mapping treats any value other than "upload" as a download.
action = "download"

In [None]:
# Load variables from the .env file located by find_dotenv() into the
# process environment.
load_dotenv(find_dotenv())

# Azure storage connection string assembled from environment variables.
# NOTE(review): os.getenv returns None for an unset variable, which is then
# formatted into the string as the literal text "None" — confirm all three
# variables are defined before running.
conn_str = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={os.getenv('AZURE_STORAGE_ACCOUNT')};"
    f"AccountKey={os.getenv('AZURE_STORAGE_KEY')};"
    f"EndpointSuffix={os.getenv('ENDPOINT_SUFFIX')}"
)

In [None]:
def upload_az_file_blobs(blob_names_dict: Dict, conn_str: str) -> None:
    """Upload local files to Azure blob storage, skipping blobs that exist.

    The container name is read from the ``AZURE_BLOB_CONTAINER_NAME``
    environment variable. A message is printed for every entry stating
    whether the file was uploaded or skipped.

    Args:
        blob_names_dict: Mapping of blob name to the path of the local file
            whose contents should be uploaded under that name.
        conn_str: Azure storage connection string.
    """
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)
    az_container_name = os.getenv("AZURE_BLOB_CONTAINER_NAME")
    # The container client does not depend on the blob, so build it once.
    container_client = blob_service_client.get_container_client(az_container_name)
    for az_blob_name, local_file_path in blob_names_dict.items():
        # name_starts_with is only a prefix filter: without the exact name
        # comparison, an existing blob "x840" would block the upload of "x84".
        blob_exists = any(
            blob.name == az_blob_name
            for blob in container_client.list_blobs(name_starts_with=az_blob_name)
        )
        if blob_exists:
            print(f"Blob {az_blob_name} found. Did not upload {local_file_path}.")
            continue

        blob_client = blob_service_client.get_blob_client(
            container=az_container_name, blob=az_blob_name
        )
        with open(local_file_path, "rb") as data:
            blob_client.upload_blob(data)
        print(f"Blob {az_blob_name} not found. Uploaded {local_file_path}.")

In [None]:
def download_az_file_blobs(blob_names_dict: Dict, conn_str: str) -> list:
    """Download blobs from Azure blob storage to local files.

    The container name is read from the ``AZURE_BLOB_CONTAINER_NAME``
    environment variable. A blob is downloaded only when its local file path
    does not exist yet; a message is printed for every entry either way.

    Args:
        blob_names_dict: Mapping of blob name to the local file path the blob
            should be written to.
        conn_str: Azure storage connection string.

    Returns:
        The local file paths of all entries in ``blob_names_dict``, in
        iteration order, whether they were downloaded now or already present.
        An empty list is returned for an empty mapping.
    """
    downloaded_blobs = []
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)
    az_container_name = os.getenv("AZURE_BLOB_CONTAINER_NAME")
    for az_blob_name, local_file_path in blob_names_dict.items():
        downloaded_blobs.append(local_file_path)
        if os.path.exists(local_file_path):
            print(f"Blob {az_blob_name} found at {local_file_path}. Did not download.")
            continue

        blob_client = blob_service_client.get_blob_client(
            container=az_container_name, blob=az_blob_name
        )
        # Fetch the content before opening the target file so that a failed
        # download does not leave an empty file behind, which a later run
        # would mistake for an already-downloaded blob.
        blob_content = blob_client.download_blob().readall()
        with open(local_file_path, "wb") as download_file:
            download_file.write(blob_content)
        print(f"Blob {az_blob_name} not found at {local_file_path}. Downloaded.")
    # Return only after every entry has been processed (the return used to sit
    # inside the loop, ending it after the first blob).
    return downloaded_blobs

In [None]:
def unarchive(file_name, data_dir, search_str, flatten_filepaths=True):
    """Extract a zip archive into a directory and list matching files there.

    Args:
        file_name: Archive to extract. When ``flatten_filepaths`` is true this
            is indexed with ``[0]`` and that element is used as the archive.
            A falsy value skips extraction.
        data_dir: Directory the archive contents are extracted into, and the
            directory that is searched afterwards.
        search_str: Glob pattern, joined onto ``data_dir``, selecting the
            paths to return.
        flatten_filepaths: Whether to take the first element of ``file_name``
            instead of using ``file_name`` itself.

    Returns:
        The paths matching ``search_str`` inside ``data_dir``. The search is
        performed even when nothing was extracted.
    """
    if not file_name:
        print("Got empty archive name. Did not unarchive")
    else:
        archive_path = file_name[0] if flatten_filepaths else file_name
        with ZipFile(archive_path) as archive:
            archive.extractall(data_dir)
        archive_label = os.path.basename(archive_path)
        target_label = os.path.basename(data_dir)
        print(f"Unarchived contents of {archive_label} to {target_label}")

    search_pattern = os.path.join(data_dir, search_str)
    return glob(search_pattern)

In [None]:
# Build the {blob name: local file path} mapping for the selected action.
if action == "upload":
    # glob() returns matches in arbitrary order, so sort them to make the
    # choice of the "last" archive deterministic.
    zip_archives = sorted(glob("*.zip"))
    if not zip_archives:
        # Fail with an explicit message instead of a bare IndexError.
        raise FileNotFoundError(
            f"No .zip archive found in {os.getcwd()} to upload."
        )
    blob_names = {
        f"{os.getenv('BLOB_NAME_PREFIX')}{list(blob_name_suffixes)[0]}": zip_archives[
            -1
        ]
    }
else:
    blob_names = {f"{os.getenv('BLOB_NAME_PREFIX')}{dload_blob_suffix}": dload_fpath}
print(blob_names)

## Upload to Blob Storage

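Upload the local archive to Azure blob storage. The upload is skipped when a matching blob already exists in the container.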

In [None]:
%%time
# Run the upload only when it is the selected action; otherwise this cell
# does nothing.
if action == "upload":
    upload_az_file_blobs(blob_names, conn_str)

## Download from Blob Storage

In [None]:
%%time
# Run the download only when it is the selected action; otherwise this cell
# does nothing.
if action == "download":
    downloaded_archives = download_az_file_blobs(blob_names, conn_str)
    print(downloaded_archives)

    # Extract each downloaded archive into data/processed. unarchive() returns
    # the paths in that directory matching search_str; the trailing [0] keeps
    # only the result for the first archive.
    unarchived_archives = [
        unarchive(
            downloaded_archive,
            data_dir="data/processed",
            search_str="filtered_transformed_filledmissing_data__*.zip",
            # Each item is already a single path, so it must not be indexed.
            flatten_filepaths=False,
        )
        for downloaded_archive in downloaded_archives
    ][0]
    print(unarchived_archives)