# Azure Storage Data Import Pipeline Utility

This notebook sends a list of URLs to an Azure Storage queue, then processes the queue: for each message it downloads the URL's HTML and saves it to an Azure Storage blob container.

## Setup

In [None]:
import json
import time

import requests
from azure.identity import DefaultAzureCredential
from azure.storage.blob import ContainerClient
from azure.storage.queue import QueueClient, QueueMessage

In [None]:
def send_message_to_queue(
    queue_client: QueueClient, message: dict | str, request_wait_seconds: float = 0.01
) -> bool:
    """Send one JSON-encoded message to the queue and report success.

    Failures are printed and swallowed so that one bad message does not abort
    a batch; the caller decides what to do with the returned flag.

    Args:
        queue_client: Client for the destination queue.
        message: JSON-serializable payload (callers in this notebook pass a
            dict). It is always passed through ``json.dumps``, so a ``str``
            is sent as a JSON string literal, not as raw text.
        request_wait_seconds: Pause applied after every attempt, successful
            or not, to space out consecutive requests.

    Returns:
        True if the message was sent, False if encoding or sending raised.
    """
    try:
        queue_client.send_message(json.dumps(message))
    except Exception as e:
        # Best-effort by design: report the skipped message and keep going.
        print(f"Error: Unable to send message to queue, skipped: {message}, exception: {e}")
        return False
    else:
        return True
    finally:
        # Runs on both paths so failed sends are throttled too.
        time.sleep(request_wait_seconds)


def get_message_from_queue(
    queue_client: QueueClient,
    message_visibility_timeout_seconds: int,
) -> QueueMessage | None:
    """Receive the next message from the queue.

    Args:
        queue_client: Client for the queue to read from.
        message_visibility_timeout_seconds: Forwarded to ``receive_message``
            as ``visibility_timeout``.

    Returns:
        Whatever ``receive_message`` returns: a message, or ``None``.

    Raises:
        Exception: If the receive call fails; the original error is chained
            as ``__cause__``.
    """
    # No retry loop here: queue calls have a built-in 10x retry policy
    # ref: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/storage/azure-storage-queue#optional-configuration
    try:
        return queue_client.receive_message(
            visibility_timeout=message_visibility_timeout_seconds
        )
    except Exception as receive_error:
        raise Exception("Request Error: Unable to Get Queue Message") from receive_error


def delete_message_from_queue(queue_client: QueueClient, queue_message: QueueMessage) -> None:
    """Delete ``queue_message`` from the queue.

    Raises:
        Exception: If the delete call fails. The failure is printed first and
            the original error is chained as ``__cause__``.
    """
    try:
        queue_client.delete_message(queue_message)
    except Exception as delete_error:
        failure_text = f"Unable to delete message, {queue_message}"
        print(f"{failure_text}, {delete_error}")
        raise Exception(failure_text) from delete_error


def check_if_queue_empty_peek_message(queue_client: QueueClient) -> bool:
    """Return True when peeking at the queue yields no messages.

    NOTE(review): per the Azure Queue documentation, peek only returns
    messages that are currently visible, so True does not rule out messages
    that are temporarily hidden by a visibility timeout. Confirm before
    using this as a "work is finished" signal.

    Args:
        queue_client: Client for the queue to inspect.

    Returns:
        True if ``peek_messages()`` returned an empty result, else False.

    Raises:
        Exception: If the peek call fails. The failure is printed first and
            the original error is chained as ``__cause__``.
    """
    try:
        return not queue_client.peek_messages()
    except Exception as e:
        print(f"Unable to peek at queue, {e}")
        raise Exception("Unable to peek at queue") from e


def upload_blob(
    container_client: ContainerClient,
    filename: str,
    data: bytes | str,
    metadata: dict | None = None,
    azure_storage_connection_timeout_fix_seconds: int = 600,
) -> None:
    """Upload ``data`` as blob ``filename`` in the container, overwriting any existing blob.

    Args:
        container_client: Client for the destination container.
        filename: Blob name within the container.
        data: Blob content.
        metadata: Metadata to attach to the blob. ``None`` (the default)
            uploads with an empty metadata dict.
        azure_storage_connection_timeout_fix_seconds: Passed to the SDK as
            ``connection_timeout``.

    Raises:
        Exception: If obtaining the blob client or uploading fails; the
            original error is chained as ``__cause__``.
    """
    # Build the default per call: a literal ``{}`` default is a single dict
    # shared by every call, so a mutation anywhere would leak into later uploads.
    blob_metadata = {} if metadata is None else metadata
    # note: need to use undocumented param connection_timeout to avoid timeout errors
    # ref: https://stackoverflow.com/questions/65092741/solve-timeout-errors-on-file-uploads-with-new-azure-storage-blob-package
    try:
        blob_client = container_client.get_blob_client(filename)
        blob_client.upload_blob(
            data=data,
            metadata=blob_metadata,
            connection_timeout=azure_storage_connection_timeout_fix_seconds,
            overwrite=True,
        )
    except Exception as e:
        raise Exception(f"Unable to upload, {filename}") from e


def send_urls_to_queue(
    credential: DefaultAzureCredential,
    queue_url_and_name: str,
    queue_messages: list[dict],
) -> None:
    """Send every entry of ``queue_messages`` to the queue and print a summary.

    Queue properties are printed before and after sending. Messages that
    fail to send are skipped (see ``send_message_to_queue``) and only show
    up as a lower success count.

    Args:
        credential: Credential used to open the queue connection.
        queue_url_and_name: Queue URL handed to ``QueueClient.from_queue_url``.
        queue_messages: Payloads to send, one queue message per entry.
    """
    client = QueueClient.from_queue_url(queue_url_and_name, credential)
    print(f"Queue Status: {client.get_queue_properties()}")
    # Send in order, keeping one success flag per message.
    sent_flags = [send_message_to_queue(client, msg) for msg in queue_messages]
    sent_total = sum(1 for sent_ok in sent_flags if sent_ok)
    print(f"Messages Successfully Sent: {sent_total}/{len(queue_messages)}")
    print(f"Queue Status: {client.get_queue_properties()}")


def process_queue_message(
    queue_message: QueueMessage,
    container_client: ContainerClient,
    request_timeout_seconds: float = 30,
) -> None:
    """Download the URL named in a queue message and store the response as a blob.

    The message content must be a JSON object with at least the keys ``"id"``
    and ``"url"``. The response body is uploaded as ``<id>.html`` and the
    whole decoded message is attached as the blob's metadata.

    Args:
        queue_message: Message whose ``content`` holds the JSON payload.
        container_client: Client for the destination container.
        request_timeout_seconds: Timeout passed to the HTTP GET, so that a
            server that stops responding cannot block the worker forever.

    Raises:
        Exception: On any failure (invalid JSON, missing key, HTTP error or
            timeout, upload error); the original error is chained as
            ``__cause__``.
    """
    try:
        # extract content
        message_content = json.loads(queue_message.content)
        message_id = message_content["id"]
        message_url = message_content["url"]
        # download html content
        with requests.Session() as ses:
            # requests applies no timeout unless one is given explicitly
            response = ses.get(message_url, timeout=request_timeout_seconds)
            response.raise_for_status()
            blob_content = response.content
        # upload to blob
        blob_filename = f"{message_id}.html"
        upload_blob(
            container_client,
            filename=blob_filename,
            data=blob_content,
            metadata=message_content,
        )
    except Exception as e:
        raise Exception(f"Unable to process, {queue_message.content}") from e


def get_and_process_queue_message(
    credential: DefaultAzureCredential,
    storage_queue_url: str,
    storage_blob_url: str,
    storage_container_name: str,
    message_visibility_timeout_seconds: int,
) -> bool:
    """Receive one queue message, process it, and delete it on success.

    Fresh queue and container clients are created on every call.

    Args:
        credential: Credential used for both the queue and blob connections.
        storage_queue_url: Queue URL handed to ``QueueClient.from_queue_url``.
        storage_blob_url: Account URL handed to ``ContainerClient``.
        storage_container_name: Container that receives the blob.
        message_visibility_timeout_seconds: Visibility timeout used when
            receiving the message.

    Returns:
        True if a message was received, processed and deleted; False if no
        message was received.
    """
    queue = QueueClient.from_queue_url(storage_queue_url, credential)
    container = ContainerClient(storage_blob_url, storage_container_name, credential)

    received = get_message_from_queue(queue, message_visibility_timeout_seconds)
    if not received:
        return False

    process_queue_message(received, container)
    # Reached only when processing did not raise, so a failed message is not deleted.
    delete_message_from_queue(queue, received)
    return True


def process_queue_loop(
    credential: DefaultAzureCredential,
    storage_queue_url: str,
    storage_blob_url: str,
    storage_container_name: str,
    message_visibility_timeout_seconds: int = 60,
    delay_between_urls_seconds: int = 1,
) -> None:
    """Process queue messages one at a time until the queue looks empty.

    Each iteration sleeps ``delay_between_urls_seconds`` and then tries to
    process one message. When no message is received, the loop waits
    ``message_visibility_timeout_seconds + 60`` seconds and tries once more;
    it stops only if that second attempt also receives nothing.

    Args:
        credential: Credential used for the queue and blob connections.
        storage_queue_url: URL of the queue to drain.
        storage_blob_url: Account URL of the blob service.
        storage_container_name: Container that receives the blobs.
        message_visibility_timeout_seconds: Visibility timeout applied to
            each received message.
        delay_between_urls_seconds: Pause before each regular attempt.
    """

    def poll_once() -> bool:
        # One receive/process/delete attempt; True means a message was handled.
        return get_and_process_queue_message(
            credential,
            storage_queue_url,
            storage_blob_url,
            storage_container_name,
            message_visibility_timeout_seconds,
        )

    print("Processing messages in queue...")
    while True:
        time.sleep(delay_between_urls_seconds)
        if poll_once():
            continue
        # First empty poll: wait past the visibility timeout before concluding.
        print("Waiting on visibility timeout...")
        time.sleep(message_visibility_timeout_seconds + 60)
        if poll_once():
            continue
        # Second consecutive empty poll: treat the queue as drained.
        print("No messages in queue")
        return

## Authentication 

In [None]:
# Sign in with the Azure CLI (IPython shell escape).
# NOTE(review): presumably this login is what DefaultAzureCredential picks up below; confirm.
!az login

In [None]:
# Credential shared by the queue and blob clients created in the functions above.
# note: needs blob and queue contributor roles
credential = DefaultAzureCredential()

## Parameters 

In [None]:
# Queue URL including the queue name; passed to QueueClient.from_queue_url.
storage_queue_url = "https://XXXXXXXXXXXXXXXXXXXXXXX.queue.core.windows.net/XXXXXXXXXXXXXXXXXXXXXX"
# Blob service account URL; passed to ContainerClient together with the container name.
storage_blob_url = "https://XXXXXXXXXXXXXXXXXXXXXX.blob.core.windows.net/"
# Container that receives the downloaded HTML blobs.
storage_container_name = "XXXXXXXXXXXXXXXXXXXXX"

In [None]:
# Sample messages, one dict per URL to import.
# "id" and "url" are required: process_queue_message names the blob "<id>.html"
# and downloads "url". The whole dict is also attached to the blob as metadata.
data = [
    {
        "id": "111111",
        "source": "Wikipedia",
        "title": "University of Notre Dame Football",
        "url": "https://en.wikipedia.org/wiki/Notre_Dame_Fighting_Irish_football",
    },
    {
        "id": "222222",
        "source": "Wikipedia",
        "title": "Northwestern University Football",
        "url": "https://en.wikipedia.org/wiki/Northwestern_Wildcats_football",
    },
    {
        "id": "333333",
        "source": "Wikipedia",
        "title": "Notre Dame in Paris, France",
        "url": "https://en.wikipedia.org/wiki/Notre-Dame_de_Paris",
    },
    {
        "id": "444444",
        "source": "Wikipedia",
        "title": "Van Halen",
        "url": "https://en.wikipedia.org/wiki/Van_Halen",
    },
    {
        "id": "555555",
        "source": "Wikipedia",
        "title": "Python",
        "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    },
]

## Run

In [None]:
# send to queue: one queue message per entry in `data`; prints queue status before and after
send_urls_to_queue(credential, storage_queue_url, data)

In [None]:
# process queue: download each queued URL and upload it to the container,
# using the default visibility timeout (60 s) and delay between URLs (1 s)
process_queue_loop(
    credential,
    storage_queue_url,
    storage_blob_url,
    storage_container_name,
)