### API Requests

In [1]:
import json
import os
from typing import List, Optional, Union

import requests
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Spark
from delta import configure_spark_with_delta_pip
from pyspark.sql import DataFrame, SparkSession, functions as F
from pyspark.sql.types import StringType

In [2]:
# Read the local .env file (if any) into the process environment
load_dotenv()

# Both values are None when the corresponding variable is not defined
api_key = os.environ.get("API_KEY")
encryption_key = os.environ.get("ENCRYPTION_KEY")

### BambooHR API Client

In [3]:
# NOTE: https://jsonplaceholder.typicode.com/ is used as a stand-in API to build and test the client logic.

In [4]:
class BambooHRClient:
    """A client for the BambooHR API.

    This client builds the authentication headers, owns a ``requests`` session
    configured with automatic retries, and issues GET requests against the
    BambooHR API or a placeholder API used for demonstration purposes.
    """

    # BASE_URL = "https://api.bamboohr.com/api/gateway.php/{company_domain}/v1"
    BASE_URL = "https://jsonplaceholder.{company_domain}.com"
    # Maps the short keys accepted by ``get`` to the path appended to the base URL.
    ENDPOINTS = {
        "posts": "/posts",
        "albums": "/albums",
        "users": "/users",
    }

    def __init__(self, api_key, company_domain, timeout: float = 30.0):
        """
        Initialize the BambooHRClient with an API key and company domain.

        Args:
            api_key (str): The API key for authenticating requests.
            company_domain (str): The domain of the company for API requests.
            timeout (float): Seconds passed to ``requests`` as the per-request
                timeout (default: 30). Without it a stalled connection would
                block the caller indefinitely.
        """
        self.api_key = api_key
        self.company_domain = company_domain
        self.timeout = timeout
        self.base_url = self.BASE_URL.format(company_domain=company_domain)
        self.session = self._create_session()
        # NOTE(review): the key is sent verbatim after "Basic ". HTTP Basic auth
        # normally expects base64("user:password") -- confirm that API_KEY is
        # already stored in that encoded form before pointing at the real API.
        self.headers = {
            "Content-Type": "application/json",
            "accept": "application/json",
            "Authorization": f"Basic {self.api_key}",
        }

    def _create_session(self) -> requests.Session:
        """Create a session with a retry strategy for handling transient errors.

        GET requests answered with HTTP 429 or 503 are retried up to 5 times
        with a backoff factor of 2. The adapter is mounted for ``https://``
        URLs only.
        """
        retry_strategy = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 503],
            allowed_methods=["GET"],
        )

        # Create an adapter with the retry strategy
        adapter = HTTPAdapter(max_retries=retry_strategy)

        # Create a session and mount the adapter
        session = requests.Session()
        session.mount("https://", adapter)

        return session

    def get(self, endpoint_key: str, params: Optional[dict] = None) -> Union[dict, list]:
        """
        Fetch data from a specified endpoint using the endpoint key.

        Args:
            endpoint_key (str): The key for the desired endpoint (e.g., "posts").
            params (dict, optional): Query parameters to include in the request.

        Returns:
            dict or list: The decoded JSON response from the API. The
            placeholder endpoints configured in ``ENDPOINTS`` return a list
            of records.

        Raises:
            ValueError: If an invalid endpoint key is provided.
            requests.exceptions.RequestException: If the request fails, times
                out, or returns an error status code.

        Example:
            >>> client = BambooHRClient(api_key="my_api_key", company_domain="typicode")
            >>> posts = client.get("posts")
            >>> print(posts)
        """
        endpoint = self.ENDPOINTS.get(endpoint_key)
        # Only a missing key is invalid; compare against None explicitly.
        if endpoint is None:
            raise ValueError(f"Invalid endpoint key: {endpoint_key}")

        url = f"{self.base_url}{endpoint}"
        response = self.session.get(
            url,
            headers=self.headers,
            params=params,
            timeout=self.timeout,  # never wait forever on a stalled server
        )
        # Turn 4xx/5xx answers into exceptions instead of parsing an error body.
        response.raise_for_status()
        return response.json()


In [5]:
COMPANY_DOMAIN = "typicode"

# Build the API client for the placeholder domain
client = BambooHRClient(api_key=api_key, company_domain=COMPANY_DOMAIN)

# Fetch each resource in turn: posts, then albums, then users
posts, albums, users = (
    client.get(resource) for resource in ("posts", "albums", "users")
)

# Show only the first record of each response as a sanity check
for _sample in (posts, albums, users):
    print(_sample[:1])

[{'userId': 1, 'id': 1, 'title': 'sunt aut facere repellat provident occaecati excepturi optio reprehenderit', 'body': 'quia et suscipit\nsuscipit recusandae consequuntur expedita et cum\nreprehenderit molestiae ut ut quas totam\nnostrum rerum est autem sunt rem eveniet architecto'}]
[{'userId': 1, 'id': 1, 'title': 'quidem molestiae enim'}]
[{'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}}]


In [6]:
# Optional debugging aid: create a data directory and save the JSON responses to inspect them.

# os.makedirs("data", exist_ok=True)

# def save_json_data(data: dict, filename: str) -> None:
#     with open(f"data/{filename}", "w") as f:
#         json.dump(data, f, indent=4)
#     print(f"{filename} file saved successfully")

# for data, data_name in zip([posts, albums, users], ["posts", "albums", "users"]):
#     save_json_data(data, f"{data_name}.json")

### Transform and save data in Delta format

In [7]:
class DeltaFileManager:
    """Handles the creation, transformation, and storage of data in Delta format."""

    def __init__(self, app_name: str = "MyApp", encryption_key: Optional[str] = None):
        """
        Initializes the Spark session with Delta Lake configurations and optionally sets the encryption key.

        Args:
            app_name (str): Name of the Spark application.
            encryption_key (str): Encryption key for AES encryption (default: None).
                It is only required by ``encrypt_columns``.
        """
        builder = (
            SparkSession.builder.appName(app_name)
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config(
                "spark.sql.catalog.spark_catalog",
                "org.apache.spark.sql.delta.catalog.DeltaCatalog",
            )
        )
        # getOrCreate() reuses an already running session when there is one.
        self.spark = configure_spark_with_delta_pip(builder).getOrCreate()
        self.encryption_key = encryption_key

    def convert_to_json_string(self, data: List[dict]) -> List[str]:
        """
        Converts a list of dictionaries into a list of JSON strings.

        This method serializes each dictionary in the input list into a JSON-formatted
        string. The `default=str` parameter ensures that non-serializable types
        (e.g., datetime objects) are converted to strings during serialization.

        Args:
            data (List[dict]): Records to serialize.

        Returns:
            List[str]: One JSON string per input record, in the same order.
        """
        return [json.dumps(record, default=str) for record in data]

    def create_schemaless_df(self, json_strings: list) -> DataFrame:
        """Converts a list of JSON strings into a schemaless DataFrame.

        Every string becomes one row of a single string column.
        """
        return self.spark.createDataFrame(json_strings, StringType())

    def add_timestamp_column(self, df: DataFrame) -> DataFrame:
        """Adds a ``processed_at`` column holding the current timestamp."""
        return df.withColumn("processed_at", F.current_timestamp())

    def encrypt_columns(
        self,
        df: DataFrame,
        columns: list,
        encryption_mode: str = "ECB",
    ) -> DataFrame:
        """
        Encrypts the specified columns in the DataFrame using AES encryption and Base64 encoding.

        Args:
            df (DataFrame): The input DataFrame.
            columns (list): List of column names to encrypt. A single column
                name given as a plain string is accepted as well.
            encryption_mode (str): The encryption mode for AES (default: "ECB").

        Returns:
            DataFrame: The DataFrame with encrypted columns.

        Raises:
            ValueError: If no encryption key was provided to the manager.
        """
        if not self.encryption_key:
            raise ValueError("Encryption key is not set. Please provide an encryption key.")

        # A bare string would otherwise be iterated character by character.
        if isinstance(columns, str):
            columns = [columns]

        # Pass key and mode as literal columns instead of splicing them into a
        # SQL string: quotes or backslashes in the key can no longer break or
        # alter the generated expression.
        key_literal = F.lit(self.encryption_key)
        mode_literal = F.lit(encryption_mode)

        for col_name in columns:
            encrypted_col = F.aes_encrypt(F.col(col_name), key_literal, mode_literal)
            # Base64 keeps the binary ciphertext storable in a string column.
            df = df.withColumn(col_name, F.base64(encrypted_col))
        return df

    def save_to_delta(
        self, df: DataFrame, path: str, repartition: int = 1, mode: str = "append"
    ) -> None:
        """Saves a DataFrame to a Delta table.

        Args:
            df (DataFrame): The DataFrame to persist.
            path (str): Target location of the Delta table.
            repartition (int): Number of partitions the data is repartitioned
                into before writing (default: 1).
            mode (str): Spark write mode (default: "append").
        """
        df.repartition(repartition).write.format("delta").mode(mode).save(path)

In [8]:
# Build the Delta manager with the AES key read from the environment
manager = DeltaFileManager(encryption_key=encryption_key)

# Stage 1: serialize each API payload into a list of JSON strings
posts_string, albums_string, users_string = (
    manager.convert_to_json_string(payload) for payload in (posts, albums, users)
)

# Stage 2: wrap the JSON strings in single-column (schemaless) DataFrames
posts_df, albums_df, users_df = (
    manager.create_schemaless_df(serialized)
    for serialized in (posts_string, albums_string, users_string)
)

# Stage 3: stamp every row with the processing time
posts_df, albums_df, users_df = (
    manager.add_timestamp_column(frame)
    for frame in (posts_df, albums_df, users_df)
)

# Stage 4: encrypt the JSON payload column
posts_df, albums_df, users_df = (
    manager.encrypt_columns(frame, columns=["value"])
    for frame in (posts_df, albums_df, users_df)
)

# Stage 5: persist each DataFrame as a Delta table, replacing earlier runs
for _frame, _target in (
    (posts_df, "data/delta_tables/posts"),
    (albums_df, "data/delta_tables/albums"),
    (users_df, "data/delta_tables/users"),
):
    manager.save_to_delta(_frame, _target, mode="overwrite")

25/01/17 11:39:43 WARN Utils: Your hostname, emif-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.136 instead (on interface en0)
25/01/17 11:39:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/emif/Documents/python-tests/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/emif/.ivy2/cache
The jars for the packages stored in: /Users/emif/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a3b31a97-ae42-4ec0-b6e9-766d73f7bbe5;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.3.0 in central
	found io.delta#delta-storage;3.3.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 307ms :: artifacts dl 16ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.3.0 from central in [default]
	io.delta#delta-storage;3.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0  

In [9]:
# Print the schema of each DataFrame (posts, albums, users)
for _frame in (posts_df, albums_df, users_df):
    _frame.printSchema()

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)



In [10]:
# Show the first (encrypted) row of each DataFrame without truncating it
for _frame in (posts_df, albums_df, users_df):
    _frame.show(1, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|value                                                                                                                                                                                                                                                                                                                                                                                                               |processed_at           |
+-----------------------------------------------------------------------------------------------------------------------------------------

### Decrypt value

In [11]:
def decrypt_columns(
        df: DataFrame, 
        columns: list, 
        encryption_key: str = encryption_key, 
        encryption_mode: str = "ECB") -> DataFrame:
    """
    Decrypts the specified columns in the DataFrame using AES decryption and Base64 decoding.

    Each column is Base64-decoded, AES-decrypted and cast back to a string.

    Args:
        df (DataFrame): The input DataFrame.
        columns (list): List of column names to decrypt. A single column name
            given as a plain string is accepted as well.
        encryption_key (str): The decryption key for AES decryption. The default
            is the module-level ``encryption_key`` as it was when this function
            was defined.
        encryption_mode (str): The decryption mode for AES (default: "ECB").

    Returns:
        DataFrame: The DataFrame with decrypted columns.

    Raises:
        ValueError: If no encryption key is available.
    """
    # Fail early: a missing key would otherwise be formatted as the literal
    # text 'None' and only surface later as an opaque Spark error.
    if not encryption_key:
        raise ValueError("Encryption key is not set. Please provide an encryption key.")

    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    # Pass key and mode as literal columns instead of splicing them into a SQL
    # string, so quotes or backslashes in the key cannot break the expression.
    key_literal = F.lit(encryption_key)
    mode_literal = F.lit(encryption_mode)

    for col_name in columns:
        decrypted_col = F.aes_decrypt(
            F.unbase64(F.col(col_name)), key_literal, mode_literal
        ).cast("string")
        df = df.withColumn(col_name, decrypted_col)

    return df

In [12]:
# Decrypt the payload column of every DataFrame
posts_df, albums_df, users_df = (
    decrypt_columns(frame, columns=["value"])
    for frame in (posts_df, albums_df, users_df)
)

# Show the first decrypted row of each DataFrame without truncating it
for _frame in (posts_df, albums_df, users_df):
    _frame.show(1, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|value                                                                                                                                                                                                                                                                                     |processed_at           |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|{"userId": 1, "id": 1, "title": "sunt aut facere repellat provident occa