# Azure Storage and Event Hubs 

In [None]:
import pandas as pd
import time
import json
import uuid

## Azure Authentication

In [None]:
# install azure cli tools: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli
# install python packages: pip install azure-cli azure-identity azure-storage-blob azure-eventhub

In [None]:
# service principal: https://learn.microsoft.com/en-us/azure/developer/python/sdk/authentication-local-development-service-principal
# import os
# os.environ["AZURE_CLIENT_ID"] = "YOUR_CLIENT_ID"
# os.environ["AZURE_TENANT_ID"] = "YOUR_TENANT_ID"
# os.environ["AZURE_CLIENT_SECRET"] = "YOUR_CLIENT_SECRET"

In [None]:
# alternative to service principal: interactive web login via cli
!az login

In [None]:
# show the subscription the cli session is currently using
# to inspect or switch subscriptions from a shell cell: !az account show / list / set
from azure.cli.core import get_default_cli
# runs the `account show` cli command in-process through the azure-cli python package
get_default_cli().invoke(["account", "show"])

In [None]:
# azure credential object, passed to the storage clients further down
# NOTE(review): presumably resolves to the service principal env vars or the `az login` session set up above — confirm
from azure.identity import DefaultAzureCredential
credential = DefaultAzureCredential()

## Pull Data from Azure Storage 

In [None]:
# note: in azure portal, assign the 'Storage Blob Data Contributor' role to the identity behind `credential`
STORAGE_ACCOUNT = "XXXXXXXXXXXXXX"
CONTAINER_NAME = "XXXXXXXXXXXXXX"
FILE_NAME = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX.csv"
# BlobServiceClient talks to the Blob service, which is served from the account's
# `blob` endpoint (the `dfs` endpoint belongs to the Data Lake Storage API);
# this also matches the endpoint used by the upload section of this notebook
storage_url = f"https://{STORAGE_ACCOUNT}.blob.core.windows.net/"

In [None]:
# pull blob
# account-level client first, then a client scoped to the one blob that will be downloaded
from azure.storage.blob import BlobServiceClient
blob_service_client = BlobServiceClient(account_url=storage_url, credential=credential)
blob_client = blob_service_client.get_blob_client(container=CONTAINER_NAME, blob=FILE_NAME)

In [None]:
# save file locally
# temp.csv is opened in binary mode and the downloaded blob content is written into it
with open("temp.csv", "wb") as my_blob:
    blob_data = blob_client.download_blob()
    blob_data.readinto(my_blob)

In [None]:
# convert to dataframe (reads the local copy written by the download cell)
df = pd.read_csv("temp.csv")
# bare expression so the notebook shows the frame as the cell output
df

## Azure Event Hub Producer

In [None]:
# get connection string via azure portal -> eventhubs namespace -> shared access policies -> add
# note: use asyncio patterns for production: https://pypi.org/project/azure-eventhub
# samples: https://learn.microsoft.com/en-us/samples/azure/azure-sdk-for-python/eventhub-samples/
CONNECTION_STRING = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
EVENTHUB_NAME = "XXXXXXXXXXX"

In [None]:
from azure.eventhub import EventHubProducerClient, EventData

# produce events and send to event hub
with EventHubProducerClient.from_connection_string(CONNECTION_STRING, eventhub_name=EVENTHUB_NAME) as producer:

    # Create a batch of events to send
    event_data_batch = producer.create_batch()

    try:
        # add messages to batch
        # note: a batch is size-capped (256 KB on basic, 1 MB on standard) and add()
        # raises ValueError once the cap is reached, so a full batch is sent and a
        # fresh one started instead of letting the whole cell fail
        for each_row in df.to_dict(orient="records"):
            each_json_message = json.dumps(each_row)
            # NOTE(review): the row is JSON-encoded twice, so the event body is a JSON
            # string whose content is the row's JSON text. Left unchanged because the
            # consumer cell writes body_as_json() straight into data.jsonl and depends
            # on getting text back — drop the second dumps only together with that cell.
            each_event = EventData(json.dumps(each_json_message))
            try:
                event_data_batch.add(each_event)
            except ValueError:
                # current batch is full: flush it, then retry the event on a new batch
                # (an event too large for an empty batch raises again and is reported below)
                producer.send_batch(event_data_batch)
                event_data_batch = producer.create_batch()
                event_data_batch.add(each_event)

        # send whatever is left in the last (possibly only) batch
        if len(event_data_batch) > 0:
            producer.send_batch(event_data_batch)
        print(f"sent sucessfully {time.ctime()}")
    except Exception as e:
        print(f"failed {time.ctime()}, exception: {e}")

## Azure Event Hub Consumer

In [None]:
# get connection string via azure portal -> eventhubs namespace -> shared access policies -> add
CONNECTION_STRING = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
EVENTHUB_NAME = "XXXXXXXXXXXX"

In [None]:
from azure.eventhub import EventHubConsumerClient

# callback function to process events
def on_event_function(partition_context, events):
    """Append a received batch of events to ``data.jsonl`` and checkpoint.

    Args:
        partition_context: object exposing ``update_checkpoint()``; called once
            after the whole batch has been written.
        events: iterable of events, each exposing ``body_as_json()``.

    Each event contributes exactly one line to ``data.jsonl`` (append mode).
    """

    # save events to jsonl file
    with open("data.jsonl", "a") as f:
        for each_event in events:
            each_event_body_json = each_event.body_as_json()
            # A str body is written unchanged (the producer in this notebook
            # double-encodes, so the decoded body is itself JSON text). Any other
            # decoded value (dict, list, number...) must be serialised with
            # json.dumps: interpolating it would write its Python repr, which
            # json.loads cannot read back from the file.
            if not isinstance(each_event_body_json, str):
                each_event_body_json = json.dumps(each_event_body_json)
            f.write(f"{each_event_body_json}\n")

    # Update the checkpoint
    partition_context.update_checkpoint()

# receive events: hand every incoming batch to on_event_function until the
# cell is interrupted from the keyboard
try:
    with EventHubConsumerClient.from_connection_string(
        CONNECTION_STRING,
        consumer_group="$Default",
        eventhub_name=EVENTHUB_NAME,
    ) as consumer:
        # NOTE(review): "-1" is understood to mean "start of the stream" — confirm against the SDK docs
        consumer.receive_batch(on_event_batch=on_event_function, starting_position="-1")
except KeyboardInterrupt:
    print('Stopped receiving')

## Save Data to Azure Storage 

In [None]:
# read jsonl file
with open("data.jsonl", "r") as f:
    # parse each line as a JSON object; blank lines (e.g. a stray trailing
    # newline) are skipped instead of raising JSONDecodeError
    data = [json.loads(line) for line in f if line.strip()]

# the frame only needs the parsed list, so it is built after the file is closed
raw_df = pd.DataFrame(data)

In [None]:
# transformations: drop duplicate rows, order by timestamp, derive column_6 as
# column_3 + column_4, then keep only rows where column_6 is greater than 1
temp_df = (
    raw_df.drop_duplicates()
    .sort_values(by="timestamp")
    .assign(column_6=lambda frame: frame["column_3"] + frame["column_4"])
)
filtered_df = temp_df.loc[temp_df["column_6"] > 1]
filtered_df

In [None]:
# convert to csv
# no path is passed, so to_csv returns the csv text itself; index=False leaves the row index out
output_csv = filtered_df.to_csv(index=False)

In [None]:
# note: in azure portal, add the 'Storage Blob Data Contributor' role
STORAGE_ACCOUNT = "XXXXXXXXXXXXXXXX"
CONTAINER_NAME = "XXXXXXXXXXXXXXX"
# a fresh uuid1 value is used as the file name of each export
export_file_name = f"{uuid.uuid1()}.csv"
# despite the name, this is the account-level blob endpoint; the container is named separately
container_url = f"https://{STORAGE_ACCOUNT}.blob.core.windows.net/"

In [None]:
# upload blob
# container-scoped client -> blob client for the export file name -> upload of the csv text
from azure.storage.blob import ContainerClient
container_client = ContainerClient(account_url=container_url, container_name=CONTAINER_NAME, credential=credential)
# note: rebinds blob_client, which the download section also assigns
blob_client = container_client.get_blob_client(export_file_name)
# no overwrite argument is passed here
blob_client.upload_blob(data=output_csv)