# Azure Data Explorer Kusto Query Queue 

In [None]:
from azure.identity import DefaultAzureCredential
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.helpers import dataframe_from_result_table
from typing import List, Dict

import pandas as pd
import logging 
import time
import os

class KustoQueryQueue:
    """
    Queue that runs an ADX (Kusto) query or async management command once per
    one-hour slice of a datetime range.

    Submits commands while fewer than ``max_concurrent_queries`` are in
    progress, polls ``.show operations`` for each submitted operation, and
    loops until every slice has either completed or failed.
    """

    # Operation states that move a query from "in progress" to "failed".
    _FAILED_STATES = ("Throttled", "Failed")

    def __init__(
        self,
        start_datetime: str,
        end_datetime: str,
        query_template: str,
        kusto_client: "KustoClient",
        kusto_database: str,
        max_concurrent_queries: int = 10,
        time_between_requests_sec: int = 1,
        time_between_loops_sec: int = 60,
    ):
        """
        Args:
            start_datetime: time range start, example: "2007-01-01 00:00:00"
            end_datetime: time range end, example: "2007-01-02 00:00:00"
            query_template: kql query or management command to be used; the
                placeholders <START_DATETIME_STRING> and <END_DATETIME_STRING>
                are replaced with each slice's boundaries
            kusto_client: azure kusto cluster client object
            kusto_database: name of data explorer database
            max_concurrent_queries: max number of queries allowed in progress at once
            time_between_requests_sec: time between adx requests
            time_between_loops_sec: time between loops when all queries submitted or at limit
        Returns:
            KustoQueryQueue object
        """
        self.start_datetime = start_datetime
        self.end_datetime = end_datetime
        self.query_template = query_template
        self.kusto_client = kusto_client
        self.kusto_database = kusto_database
        self.max_concurrent_queries = max_concurrent_queries
        self.time_between_requests_sec = time_between_requests_sec
        self.time_between_loops_sec = time_between_loops_sec
        # Pending [start, end] slices still waiting to be submitted.
        self.query_datetime_ranges = self.get_datetime_ranges()
        # Operation ids that were submitted and have not reached a final state.
        self.in_progress_queries = []
        # Query-info dicts (one per operation) for finished operations.
        self.completed_queries = []
        self.failed_queries = []
        # operation_id -> "start -> end" label of the slice it covers.
        self.operation_id_lookup = {}

    def __str__(self) -> str:
        """
        Print method
        Returns:
            multi-line summary of the template, time range, pending queue,
            completed queries and failed queries
        """
        output_str = f"Query Template: {self.query_template}\n"
        output_str += f"Start Time: {self.start_datetime}\n"
        output_str += f"End Time: {self.end_datetime}\n"
        output_str += f"Queue: {self.query_datetime_ranges}\n"
        output_str += f"Completed Queries: {self.completed_queries}\n"
        output_str += f"Failed Queries: {self.failed_queries}"
        return output_str

    def get_datetime_ranges(self) -> List[List[str]]:
        """
        Converts start and end dates to consecutive one-hour time range pairs
        Returns:
            list of [start, end] timestamp-string pairs; empty when the range
            contains fewer than two hourly points
        """
        time_ranges = pd.date_range(self.start_datetime, self.end_datetime, freq='1h')
        # Pair every hourly point with its successor.
        return [
            [str(range_start), str(range_end)]
            for range_start, range_end in zip(time_ranges[:-1], time_ranges[1:])
        ]

    def generate_query_from_template(self, start_datetime: str, end_datetime: str) -> str:
        """
        Generates query using template and provided datetimes
        Args:
            start_datetime: time range start, example: "2007-01-01 00:00:00"
            end_datetime: time range end, example: "2007-01-02 00:00:00"
        Returns:
            query text string
        """
        query = self.query_template.replace("<START_DATETIME_STRING>", start_datetime)
        return query.replace("<END_DATETIME_STRING>", end_datetime)

    def get_queue_status(self) -> Dict:
        """
        Get status of the query queue
        Returns:
            Dictionary with the number of queued, in-progress, completed and
            failed queries
        """
        return {
            "Queue": len(self.query_datetime_ranges),
            "In Progress": len(self.in_progress_queries),
            "Completed": len(self.completed_queries),
            "Failed": len(self.failed_queries),
        }

    def submit_query(self, query: str) -> str:
        """
        Submits a query through this queue's own kusto client
        Args:
            query: query / command text to execute
        Returns:
            operation_id read from the "OperationId" column of the first
            result row
        """
        # Use the injected client, not a module-level global.
        response = self.kusto_client.execute(self.kusto_database, query)
        df = dataframe_from_result_table(response.primary_results[0])
        return df.iloc[0]["OperationId"]

    def get_query_info(self, operation_id: str) -> Dict:
        """
        Get query info from operation_id
        Args:
            operation_id: id returned by submit_query
        Returns:
            dictionary built from the first row of `.show operations <id>`
        """
        query = f".show operations {operation_id}"
        # Use the injected client, not a module-level global.
        response = self.kusto_client.execute(self.kusto_database, query)
        records = dataframe_from_result_table(response.primary_results[0]).to_dict(orient="records")
        return records[0]

    def _poll_in_progress(self, logger: logging.Logger) -> None:
        """Check every in-progress operation once and file finished ones as completed or failed."""
        # Iterate over a snapshot: finished operations are removed from the
        # live list inside the loop, which would otherwise skip entries.
        for operation_id in list(self.in_progress_queries):
            try:
                query_info = self.get_query_info(operation_id)
                query_info["QueryDatetimeRange"] = self.operation_id_lookup[operation_id]
                query_state = query_info["State"]
                if query_state == "Completed":
                    self.in_progress_queries.remove(operation_id)
                    self.completed_queries.append(query_info)
                    logger.info(query_info)
                elif query_state in self._FAILED_STATES:
                    self.in_progress_queries.remove(operation_id)
                    self.failed_queries.append(query_info)
                    logger.error(query_info)
                # Any other state: leave the operation in progress and poll again later.
            except Exception as e:
                # Best effort: a failed status check keeps the operation in
                # progress so it is polled again on the next pass.
                logger.error(f"Unable to Get Query Info for {operation_id}, Exception: {e}")
            time.sleep(self.time_between_requests_sec)

    def _submit_next(self, logger: logging.Logger) -> None:
        """Take one pending slice off the queue and submit it; re-queue it if the submit fails."""
        datetime_range = self.query_datetime_ranges.pop()
        start_datetime, end_datetime = datetime_range
        try:
            query = self.generate_query_from_template(start_datetime, end_datetime)
            operation_id = self.submit_query(query)
        except Exception as e:
            # unsuccessful submit, add back to the far end of the queue
            self.query_datetime_ranges.insert(0, datetime_range)
            logger.error(f"Unable to Submit Query: {start_datetime} -> {end_datetime}, Exception: {e}")
        else:
            self.in_progress_queries.append(operation_id)
            self.operation_id_lookup[operation_id] = f"{start_datetime} -> {end_datetime}"
        time.sleep(self.time_between_requests_sec)

    def run(self) -> str:
        """
        Runs kusto query queue:
        1. checks status of in-progress queries
        2. submits one query if below the concurrent max limit
        3. loops until all queries completed or failed

        Configures the root logger (INFO level) to write to a
        ./kqq-<timestamp>.log file via logging.basicConfig.

        Returns:
            "Run Complete" string
        """
        # logging
        logging.basicConfig(
            filename=f"./kqq-{pd.Timestamp.today().value}.log",
            format='%(asctime)s %(levelname)s %(message)s',
            filemode='w'
        )
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)

        # run while the pending queue or the in-progress list isn't empty
        while self.query_datetime_ranges or self.in_progress_queries:
            # queue status
            logger.info(self.get_queue_status())
            # check query status
            self._poll_in_progress(logger)

            # submit query if queue isn't empty and below the concurrent query limit
            if self.query_datetime_ranges and len(self.in_progress_queries) < self.max_concurrent_queries:
                self._submit_next(logger)
            else:
                # wait time when at concurrent query limit or everything is submitted
                time.sleep(self.time_between_loops_sec)

        # display final status
        logger.info(self.get_queue_status())
        return "Run Complete"

## Authenticate with Azure and ADX

In [None]:
# Service principal settings, exported as environment variables before the
# credential object is created.
# docs: https://learn.microsoft.com/en-us/azure/developer/python/sdk/authentication-local-development-service-principal?tabs=azure-portal
#   1. azure portal -> app registrations -> client_id and tenant_id
#   2. add certificate & secrets -> new client secret -> secret value (only shown once)
#   3. *some applications may also need roles added (i.e. storage blob contrib)
os.environ.update(
    {
        "AZURE_CLIENT_ID": "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
        "AZURE_TENANT_ID": "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
        "AZURE_CLIENT_SECRET": "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
    }
)
# NOTE(review): `credential` is not passed to anything below; the connection
# string is built with az-cli authentication instead. Confirm which
# authentication method is actually intended.
credential = DefaultAzureCredential()

# ADX connection: cluster URI, target database, and the client used by the cells below.
kusto_uri = "https://XXXXXXXXXXXXXXXXXXXXXX.eastus.kusto.windows.net"
kusto_database = "XXXXXXXXXXXXXXXXXX"
kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri)
kusto_client = KustoClient(kcsb)

## Inputs

In [None]:
# Inputs: the overall time window to process and the command template that is
# run once per slice of that window.
start_datetime = "2007-01-01 00:00:00"
end_datetime = "2007-01-01 10:00:00"
# <START_DATETIME_STRING> / <END_DATETIME_STRING> are substituted per slice.
# The filter is a half-open interval [start, end): KQL `between` is inclusive
# on both ends, so consecutive slices that share a boundary would export rows
# stamped exactly on that boundary twice. As a consequence, rows stamped
# exactly at the overall end_datetime are not exported.
# Plain (non-f) string: the template contains no Python placeholders.
query_template = """
    .export  
        async 
        to csv ( h@"https://XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" )
        with ( includeHeaders="all" )
    <|
    set truncationmaxsize = 1000000000; 
    let start_time = "<START_DATETIME_STRING>";
    let end_time = "<END_DATETIME_STRING>";
    sampledata 
    | where StartTime >= todatetime(start_time) and StartTime < todatetime(end_time)"""

## Run

In [None]:
# Build the queue from the inputs and ADX connection defined in the cells
# above; concurrency and wait times are left at their defaults.
test_query_queue = KustoQueryQueue(
    start_datetime=start_datetime,
    end_datetime=end_datetime,
    query_template=query_template,
    kusto_client=kusto_client,
    kusto_database=kusto_database,
)

In [None]:
# Process the whole queue: loops until no slice is left queued or in progress,
# then returns "Run Complete".
test_query_queue.run()