# Altyca - Utility

In [0]:
import logging
from datetime import datetime, timedelta
import sys
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
class InMemoryLogHandler(logging.Handler):
    """
    A logging handler that collects logs in memory for later bulk processing.

    Every emitted record is appended to ``self.logs`` as a dict with the keys
    ``timestamp``, ``level``, ``message`` and ``task``.

    Args:
        task_name: Value stored under the ``task`` key of every log entry.
    """

    # Layout of the ``timestamp`` string stored in each log entry.
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    # Used when no formatter has been attached via setFormatter(); without
    # this fallback ``self.formatter`` is None and formatting would fail.
    _DEFAULT_FORMATTER = logging.Formatter()

    def __init__(self, task_name):
        super().__init__()
        self.logs = []
        self.task_name = task_name

    def emit(self, record):
        """
        Convert ``record`` into a dict and append it to ``self.logs``.

        Failures while building the entry (e.g. a message whose arguments do
        not match its format string) are routed to ``handleError`` so that a
        broken log call does not raise into the caller.
        """
        try:
            log_entry = {
                "timestamp": self.formatTime(record),
                "level": record.levelname,
                "message": record.getMessage(),
                "task": self.task_name
            }
            self.logs.append(log_entry)
        except Exception:
            self.handleError(record)

    def formatTime(self, record):
        """
        Return the creation time of ``record`` as a ``YYYY-MM-DD HH:MM:SS`` string.

        Uses the attached formatter when one is set, otherwise a default
        ``logging.Formatter``.
        """
        formatter = self.formatter
        if formatter is None:
            formatter = self._DEFAULT_FORMATTER
        return formatter.formatTime(record, self._TIME_FORMAT)

In [0]:
class Utility:
    """
    Collection of static helpers for workspace/environment lookup, logging
    and DataFrame clean-up.

    Several methods rely on the names ``spark`` and ``dbutils``, which are not
    defined in this file (presumably injected by the Databricks notebook
    runtime - confirm before using this class elsewhere).
    """

    # Environment short name -> workspace URL host, matched against the
    # "spark.databricks.workspaceUrl" Spark conf value in get_environment().
    ENVIRONMENTS = {
        'dev':  'adb-503590620570631.11.azuredatabricks.net',
        'sbx':  'adb-69788370609352.12.azuredatabricks.net',
        'uat':  'adb-2874730485941614.14.azuredatabricks.net',
        'prd':  'adb-392658947501145.5.azuredatabricks.net',
        'tf':   'adb-3544198369222802.2.azuredatabricks.net',
        'uc':   'adb-1870340185830510.10.azuredatabricks.net',
    }

    # Entra groups
    ADMINS =        'DTCH-A-APP-U-BI-Admin'
    ANALYSTS =      'DTCH-A-APP-U-BI-Analyst'
    DEVELOPERS =    'DTCH-A-APP-U-BI-Developer'

    # Asset Bundles Service Principals
    DAB_SRV_DEV = 'DTCH - BI - DEV - DAB-Srv-DevOp'
    DAB_SRV_UAT = 'DTCH - BI - UAT - DAB-Srv-DevOp'
    DAB_SRV_PRD = 'DTCH - BI - PRD - DAB-Srv-DevOp'

    # IDs of the Asset Bundles Service Principals, keyed by environment
    DAB_SRV_IDS = {
        'dev': '2fedf703-49dc-42e7-95f3-23ab3bd14b4a',
        'uat': '352a4a40-ef97-43bc-8c03-e81302510fca',
        'prd': '801bcf2a-4c2b-486a-b26d-cb7c150cb733'
    }

    @staticmethod
    def get_workspace_id():
        """
        Retrieves the Databricks workspace ID.

        Returns:
            str: The workspace ID.
        """
        workspace_id = dbutils.notebook.entry_point.getDbutils().notebook().getContext().workspaceId().get()
        return workspace_id

    @staticmethod
    def get_current_user():
        """
        Retrieves the current user by running ``SELECT current_user()``.

        Returns:
            str: The current user.
        """
        current_user = spark.sql('SELECT current_user() as user').collect()[0]['user']
        return current_user

    @staticmethod
    def get_environment():
        """
        Retrieves the environment name based on the current Databricks workspace URL.

        Returns:
            str: The environment name if the URL matches one in the ENVIRONMENTS dictionary, otherwise, None.
        """
        url = spark.conf.get("spark.databricks.workspaceUrl")
        for env, env_url in Utility.ENVIRONMENTS.items():
            if url == env_url:
                return env
        return None

    @staticmethod
    def get_dab_srv_id(environment):
        """
        Retrieves the Service Principal ID for DAB Deployment based on the specified environment.

        Args:
            environment (str): The environment for which to retrieve the Service Principal ID.

        Returns:
            str: The Service Principal ID for the specified environment, or None if not found.
        """
        return Utility.DAB_SRV_IDS.get(environment)

    @staticmethod
    def get_logger(name=__name__, level=logging.INFO, task_name=None):
        """
        Creates a logger that logs to both the notebook stdout and in-memory (for Delta write).

        Any handlers already attached to the named logger are dropped first, so
        calling this repeatedly with the same name does not duplicate output.

        Args:
            name (str): Logger name passed to ``logging.getLogger``.
            level (int): Logging level set on the logger.
            task_name (str): Task label stored with every in-memory entry;
                defaults to ``name``.

        Returns:
            logging.Logger: The configured logger. Its in-memory handler is
            exposed as ``logger.memory_handler``.
        """
        logger = logging.getLogger(name)

        if task_name is None:
            task_name = name

        # Clear previous handlers
        if logger.hasHandlers():
            logger.handlers.clear()

        logger.setLevel(level)
        # Do not forward records to ancestor loggers.
        logger.propagate = False

        # Stream Handler (stdout)
        stream_handler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

        # In-memory Handler
        memory_handler = InMemoryLogHandler(task_name=task_name)
        memory_handler.setFormatter(formatter)
        logger.addHandler(memory_handler)

        # Attach to logger for access later
        logger.memory_handler = memory_handler

        return logger

    @staticmethod
    def flush_logs_to_delta(logger, pipeline, task, job_run_id, table_name):
        """
        Writes collected in-memory logs to the given Delta Table.

        Does nothing when the logger has no ``memory_handler`` attribute or
        when no entries have been collected.

        Args:
            logger: Logger carrying a ``memory_handler`` attribute (as set by
                ``get_logger``).
            pipeline: Value written to the ``pipeline`` column of every row.
            task: Value written to the ``task`` column of every row (the
                ``task`` stored in the individual log entries is not used).
            job_run_id: Value written to the ``job_run_id`` column.
            table_name (str): Target table; rows are appended.

        NOTE(review): the in-memory buffer is not cleared after writing, so a
        second call on the same logger appends the same entries again -
        confirm whether callers rely on this.
        """
        memory_handler = getattr(logger, "memory_handler", None)
        if memory_handler is None or not memory_handler.logs:
            return

        # One tuple per log entry, in the column order of the schema below.
        rows = [
            (
                # Entries store the timestamp as a "%Y-%m-%d %H:%M:%S" string.
                datetime.strptime(entry["timestamp"], "%Y-%m-%d %H:%M:%S"),
                entry["level"],
                entry["message"],
                pipeline,
                task,
                job_run_id,
            )
            for entry in memory_handler.logs
        ]

        # Schema of the rows built above
        schema = StructType([
            StructField("timestamp", TimestampType(), True),
            StructField("level", StringType(), True),
            StructField("message", StringType(), True),
            StructField("pipeline", StringType(), True),  # from the `pipeline` argument
            StructField("task", StringType(), True),  # from the `task` argument
            StructField("job_run_id", StringType(), True)
        ])

        # Create DataFrame with the correct schema
        df = spark.createDataFrame(rows, schema=schema)

        # Write to Delta Table
        df.write.format("delta").mode("append").saveAsTable(table_name)

    @staticmethod
    def close_logger(logger):
        """
        Close and detach all handlers attached to the logger.

        Args:
            logger (logging.Logger): Logger whose handlers are closed and removed.
        """
        # Iterate over a copy: removeHandler() mutates logger.handlers, and
        # removing while iterating the live list skips every second handler.
        for handler in list(logger.handlers):
            handler.close()
            logger.removeHandler(handler)

    @staticmethod
    def cast_all_ntz_timestamps_to_classic(df):
        """
        Casts all timestamp_ntz columns to classic Spark 'timestamp' type
        (UTC-based with time zone support) to avoid Delta feature errors.

        Only top-level columns are inspected; other columns are left untouched.

        Args:
            df: DataFrame to convert.

        Returns:
            The DataFrame with every timestamp_ntz column cast to 'timestamp'.
        """
        for field in df.schema.fields:
            # Use simpleString() to catch timestamp_ntz
            if field.dataType.simpleString() == "timestamp_ntz":
                df = df.withColumn(field.name, col(field.name).cast("timestamp"))
        return df

In [0]:
# Confirmation message printed when this cell has run.
print("class Utility sucessfully loaded")