### API Requests

In [36]:
import os
import json
import base64
import requests

from typing import List, Optional
from dotenv import load_dotenv
from urllib.parse import urlencode
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from datetime import datetime, timedelta, timezone

# Spark
from pyspark.sql.types import StringType
from delta import configure_spark_with_delta_pip
from pyspark.sql import DataFrame, SparkSession, functions as F

In [37]:
# Populate the process environment (via python-dotenv) before reading from it
load_dotenv()

# Both values are None when the corresponding variable is not set
api_key = os.environ.get("API_KEY")
encryption_key = os.environ.get("ENCRYPTION_KEY")

### BambooHR API Client

In [49]:
class BambooHRClient:
    """A client for the BambooHR API.

    The client builds the Basic-auth header from an API key, owns a
    ``requests`` session with a retry policy, and exposes ``get``/``post``
    helpers that address endpoints by the keys of ``ENDPOINTS``.
    """

    DOMAIN = "muttclip"
    BASE_URL = f"https://api.bamboohr.com/api/gateway.php/{DOMAIN}/v1"
    ENDPOINTS = {
        "company_information": "/company_information",
        "employees": "/employees/directory",
        "employees_changed": "/employees/changed",
        "employees_supervisors": "/reports/custom",
        "account_info_fields": "/meta/fields",
        "account_info_tab_fields": "/meta/tables",
        "account_list_fields": "/meta/lists",
        "account_list_users": "/meta/users",
        "time_off_types": "/meta/time_off/types",
        "time_off_policies": "/meta/time_off/policies",
        "time_off_requests": "/time_off/requests",
        "time_off_whos_out": "/time_off/whos_out",
        "custom_report_test": "/reports/101",
    }

    # Seconds to wait on a request. ``requests`` applies no timeout unless one
    # is passed, so without this a stalled connection would block forever.
    DEFAULT_TIMEOUT = 30

    def __init__(self, api_key, timeout: float = DEFAULT_TIMEOUT):
        """Initialize the BambooHRClient with an API key.

        Args:
            api_key (str): The BambooHR API key used for Basic authentication.
            timeout (float, optional): Timeout in seconds applied to every
                request. Defaults to ``DEFAULT_TIMEOUT``.

        Raises:
            ValueError: If ``api_key`` is empty or None.
        """
        # Fail fast: a missing key would otherwise be encoded as "None:x"
        # and only surface later as an authentication error from the API.
        if not api_key:
            raise ValueError("api_key must be a non-empty string")

        self.api_key = api_key
        self.base_url = self.BASE_URL
        self.timeout = timeout
        self.session = self._create_session()
        self.headers = {
            "Content-Type": "application/json",
            "accept": "application/json",
            "Authorization": self._encode_auth_header(api_key),
        }

    def _encode_auth_header(self, api_key: str) -> str:
        """Encode the API key in Base64 for the Authorization header.

        The credentials are sent as ``<api_key>:x`` (key as user name, a
        literal ``x`` as password) in HTTP Basic form.
        """
        auth_string = f"{api_key}:x"
        encoded_bytes = base64.b64encode(auth_string.encode("utf-8"))
        return f"Basic {encoded_bytes.decode('utf-8')}"

    def _build_query_string(self, params: dict) -> str:
        """Construct a URL-encoded query string from a dictionary.

        Keys and values are percent-encoded so that characters such as
        ``&``, ``=`` or spaces inside a value cannot corrupt the query.
        """
        return urlencode(params)

    def _create_session(self) -> "requests.Session":
        """Create a session with a retry strategy for handling transient errors."""
        # Up to 5 retries with exponential backoff, only for 429/503 responses
        retry_strategy = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 503],
            allowed_methods=["GET", "POST"],
        )

        # Create an adapter with the retry strategy
        adapter = HTTPAdapter(max_retries=retry_strategy)

        # Create a session and mount the adapter (HTTPS URLs only)
        session = requests.Session()
        session.mount("https://", adapter)

        return session

    def _resolve_url(self, endpoint_key: str) -> str:
        """Return the full URL for ``endpoint_key`` or raise ValueError."""
        endpoint = self.ENDPOINTS.get(endpoint_key)
        if not endpoint:
            raise ValueError(f"Invalid endpoint key: {endpoint_key}")
        return f"{self.base_url}{endpoint}"

    def _handle_response(self, response) -> dict:
        """Raise on an HTTP error status, otherwise return the decoded JSON body."""
        try:
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Request failed with status {response.status_code}: {e}")
            raise

        return response.json()

    def get(self, endpoint_key: str, params: Optional[dict] = None) -> dict:
        """
        Fetch data from a specified endpoint using the endpoint key.

        Args:
            endpoint_key (str): The key for the desired endpoint (e.g., "employees").
            params (dict, optional): Query parameters to include in the request.

        Returns:
            dict: The JSON response from the API.

        Raises:
            ValueError: If an invalid endpoint key is provided.
            requests.exceptions.RequestException: If the request fails
                (including a timeout or an HTTP error status).
        """
        url = self._resolve_url(endpoint_key)
        response = self.session.get(
            url, headers=self.headers, params=params, timeout=self.timeout
        )
        return self._handle_response(response)

    def get_employees_changed(
        self, days_offset: int = 1, change_type: Optional[str] = None
    ) -> dict:
        """
        Fetch the list of employees changed since a given number of days ago.

        Args:
            days_offset (int, optional): The number of days prior to the current date
                to use as the starting point for fetching changes. Defaults to 1.
            change_type (str, optional): Type of change to filter for ("inserted", "updated", "deleted").
                If not provided, all change types will be included.

        Returns:
            dict: The JSON response from the API, containing the list of employees
                who have changed since the specified timestamp.

        """
        # The cutoff is computed in UTC and sent as an ISO-8601 "Z" timestamp
        since = datetime.now(timezone.utc) - timedelta(days=days_offset)
        params = {"since": since.strftime("%Y-%m-%dT%H:%M:%SZ")}
        if change_type:
            params["type"] = change_type

        return self.get("employees_changed", params=params)

    def get_time_off_request(self, days_offset: int = 1) -> dict:
        """
        Fetch time-off requests within a specified date range.

        The range runs from ``days_offset`` days ago up to today (UTC dates).

        Args:
            days_offset (int, optional): The number of days prior to the current date
                to use as the start of the date range. Defaults to 1.

        Returns:
            dict: The JSON response from the API, containing the list of time-off
                requests within the specified date range.
        """
        # Sample the clock once so both ends of the range share one reference
        now = datetime.now(timezone.utc)
        params = {
            "start": (now - timedelta(days=days_offset)).strftime("%Y-%m-%d"),
            "end": now.strftime("%Y-%m-%d"),
        }

        return self.get("time_off_requests", params=params)

    def post(
        self,
        endpoint_key: str,
        data: Optional[dict] = None,
        query_params: Optional[dict] = None,
    ) -> dict:
        """
        Send data to a specified endpoint using the POST method.

        Args:
            endpoint_key (str): The key for the desired endpoint.
            data (dict, optional): The JSON payload to send in the POST request.
            query_params (dict, optional): Query parameters to include in the URL.

        Returns:
            dict: The JSON response from the API.

        Raises:
            ValueError: If an invalid endpoint key is provided.
            requests.exceptions.RequestException: If the request fails
                (including a timeout or an HTTP error status).
        """
        url = self._resolve_url(endpoint_key)
        if query_params:
            url = f"{url}?{self._build_query_string(query_params)}"

        response = self.session.post(
            url, headers=self.headers, json=data, timeout=self.timeout
        )
        return self._handle_response(response)

    def create_employees_supervisors(self) -> dict:
        """
        Generate a custom report of employees and their supervisors.

        This method creates a custom report that includes fields for employees' first
        names, last names, supervisors, and supervisor IDs.

        Returns:
            dict: The JSON response from the API with the generated report.
        """
        payload = {
            "title": "employees-supervisors",
            "fields": ["firstName", "lastName", "supervisor", "supervisorEid"],
        }
        q_params = {"format": "json", "onlyCurrent": "true"}

        return self.post(
            "employees_supervisors",
            data=payload,
            query_params=q_params,
        )

In [51]:
# Make sure the output directory for the JSON dumps exists
os.makedirs("data", exist_ok=True)

# Build the API client from the key read out of the environment
client = BambooHRClient(api_key=api_key)

def save_json_data(data: dict, filename: str, output_dir: str = "data") -> None:
    """Save JSON data to a file.

    Args:
        data (dict): JSON-serializable object to write.
        filename (str): Name of the file to create inside ``output_dir``.
        output_dir (str, optional): Directory the file is written to.
            Defaults to "data".
    """
    path = os.path.join(output_dir, filename)
    # Create the parent directory here so the function does not depend on a
    # separate setup step having already run.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    # Explicit encoding keeps the output independent of the platform default
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    print(f"{filename} file saved successfully")

In [53]:
# Each endpoint key maps either to an output filename (fetched with a plain
# client.get) or to a zero-argument callable that performs a custom fetch.
report_mappings = {
    "company_information": "company_information.json",
    "employees": "employees.json",
    "employees_changed": lambda: client.get_employees_changed(days_offset=7),
    "employees_supervisors": lambda: client.create_employees_supervisors(),
    "account_info_fields": "account_info_fields.json",
    "account_info_tab_fields": "account_info_tab_fields.json",
    "account_list_fields": "account_list_fields.json",
    "account_list_users": "account_list_users.json",
    "time_off_types": "time_off_types.json",
    "time_off_policies": "time_off_policies.json",
    "time_off_requests": lambda: client.get_time_off_request(days_offset=7),
    "time_off_whos_out": "time_off_whos_out.json",
    "custom_report_test": "custom_report_test.json",
}

# Walk the mapping in order, fetching each report and writing it to disk
for report_key, source in report_mappings.items():
    uses_custom_fetch = callable(source)
    # Callables fetch their own data; string entries go through client.get
    data = source() if uses_custom_fetch else client.get(report_key)
    # Callable entries carry no filename, so derive one from the key
    filename = f"{report_key}.json" if uses_custom_fetch else source
    save_json_data(data, filename)

company_information.json file saved successfully
employees.json file saved successfully
employees_changed.json file saved successfully
employees_supervisors.json file saved successfully
account_info_fields.json file saved successfully
account_info_tab_fields.json file saved successfully
account_list_fields.json file saved successfully
account_list_users.json file saved successfully
time_off_types.json file saved successfully
time_off_policies.json file saved successfully
time_off_requests.json file saved successfully
time_off_whos_out.json file saved successfully
custom_report_test.json file saved successfully


### Transform and save data in Delta format

In [7]:
# class DeltaFileManager:
#     """Handles the creation, transformation, and storage of data in Delta format."""

#     def __init__(self, app_name: str = "MyApp", encryption_key: str = None):
#         """
#         Initializes the Spark session with Delta Lake configurations and optionally sets the encryption key.

#         Args:
#             app_name (str): Name of the Spark application.
#             encryption_key (str): Encryption key for AES encryption (default: None).
#         """
#         builder = (
#             SparkSession.builder.appName(app_name)
#             .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
#             .config(
#                 "spark.sql.catalog.spark_catalog",
#                 "org.apache.spark.sql.delta.catalog.DeltaCatalog",
#             )
#         )
#         self.spark = configure_spark_with_delta_pip(builder).getOrCreate()
#         self.encryption_key = encryption_key

#     def convert_to_json_string(self, data: List[dict]) -> list:
#         """
#         Converts a list of dictionaries into a list of JSON strings.

#         This method serializes each dictionary in the input list into a JSON-formatted
#         string. The `default=str` parameter ensures that non-serializable types
#         (e.g., datetime objects) are converted to strings during serialization.
#         """
#         return [json.dumps(record, default=str) for record in data]

#     def create_schemaless_df(self, json_strings: list) -> DataFrame:
#         """Converts a list of JSON strings into a schemaless DataFrame."""
#         return self.spark.createDataFrame(json_strings, StringType())

#     def add_processed_dt(self, df: DataFrame) -> DataFrame:
#         """Adds a processed timestamp column to a DataFrame."""
#         return df.withColumn("processed_at", F.current_timestamp())

#     def encrypt_columns(
#         self,
#         df: DataFrame,
#         columns: list,
#         encryption_mode: str = "ECB",
#     ) -> DataFrame:
#         """
#         Encrypts the specified columns in the DataFrame using AES encryption and Base64 encoding.

#         Args:
#             df (DataFrame): The input DataFrame.
#             columns (list): List of column names to encrypt.
#             encryption_mode (str): The encryption mode for AES (default: "ECB").

#         Returns:
#             DataFrame: The DataFrame with encrypted columns.
#         """
#         if not self.encryption_key:
#             raise ValueError("Encryption key is not set. Please provide an encryption key.")

#         for col_name in columns:
#             encrypted_col = F.expr(f"aes_encrypt({col_name}, '{self.encryption_key}', '{encryption_mode}')")
#             base64_encoded_col = F.base64(encrypted_col)
#             df = df.withColumn(col_name, base64_encoded_col)
#         return df

#     def save_to_delta(
#         self, df: DataFrame, path: str, repartition: int = 1, mode: str = "append"
#     ):
#         """Saves a DataFrame to a Delta table."""
#         df.repartition(repartition).write.format("delta").mode(mode).save(path)

In [None]:
# # Initialize the manager
# manager = DeltaFileManager(encryption_key=encryption_key)

# # Prepare input files into json_strings
# posts_string = manager.convert_to_json_string(posts)
# albums_string = manager.convert_to_json_string(albums)
# users_string = manager.convert_to_json_string(users)

# # Create schemaless DataFrames
# posts_df = manager.create_schemaless_df(posts_string)
# albums_df = manager.create_schemaless_df(albums_string)
# users_df = manager.create_schemaless_df(users_string)

# # Add a timestamp column
# posts_df = manager.add_processed_dt(posts_df)
# albums_df = manager.add_processed_dt(albums_df)
# users_df = manager.add_processed_dt(users_df)

# # Encrypt data
# posts_df = manager.encrypt_columns(posts_df, columns=["value"])
# albums_df = manager.encrypt_columns(albums_df, columns=["value"])
# users_df = manager.encrypt_columns(users_df, columns=["value"])

# # Save to Delta tables
# manager.save_to_delta(posts_df, "data/delta_tables/posts", mode="overwrite")
# manager.save_to_delta(albums_df, "data/delta_tables/albums", mode="overwrite")
# manager.save_to_delta(users_df, "data/delta_tables/users", mode="overwrite")

In [None]:
# # Schemas
# posts_df.printSchema()
# albums_df.printSchema()
# users_df.printSchema()

In [None]:
# # Display results
# posts_df.show(1, truncate=False)
# albums_df.show(1, truncate=False)
# users_df.show(1, truncate=False)

### Decrypt value

In [11]:
# def decrypt_columns(
#         df: DataFrame, 
#         columns: list, 
#         encryption_key: str = encryption_key, 
#         encryption_mode: str = "ECB") -> DataFrame:
#     """
#     Decrypts the specified columns in the DataFrame using AES decryption and Base64 decoding.

#     Args:
#         df (DataFrame): The input DataFrame.
#         columns (list): List of column names to decrypt.
#         encryption_key (str): The decryption key for AES decryption.
#         encryption_mode (str): The decryption mode for AES (default: "ECB").

#     Returns:
#         DataFrame: The DataFrame with decrypted columns.
#     """
#     for col_name in columns:
#         decrypted_col = F.expr(
#             f"aes_decrypt(unbase64({col_name}), '{encryption_key}', '{encryption_mode}')"
#         ).cast("string")
#         df = df.withColumn(col_name, decrypted_col)

#     return df

In [None]:
# # Decrypt data
# posts_df = decrypt_columns(posts_df, columns=["value"])
# albums_df = decrypt_columns(albums_df, columns=["value"])
# users_df = decrypt_columns(users_df, columns=["value"])

# # Display results
# posts_df.show(1, truncate=False)
# albums_df.show(1, truncate=False)
# users_df.show(1, truncate=False)