### API Requests

In [17]:
import os
import json
import requests
from typing import List
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

import pyspark
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from delta import configure_spark_with_delta_pip

In [2]:
# Uses https://jsonplaceholder.typicode.com/ as a stand-in API while building the request logic

In [18]:
class BambooHRClient:
    """A client for the BambooHR API.

    This client handles authentication, session management, and API requests
    to the BambooHR API or a placeholder API for demonstration purposes.
    """

    # BASE_URL = "https://api.bamboohr.com/api/gateway.php/{company_domain}/v1"
    BASE_URL = "https://jsonplaceholder.{company_domain}.com"
    ENDPOINTS = {
        "posts": "/posts",
        "albums": "/albums",
        "users": "/users",
    }
    # Default number of seconds to wait on the server. Without a timeout,
    # requests waits indefinitely on an unresponsive host.
    DEFAULT_TIMEOUT = 30

    def __init__(self, api_key, company_domain, timeout=DEFAULT_TIMEOUT):
        """
        Initialize the BambooHRClient with an API key and company domain.

        Args:
            api_key (str): The API key for authenticating requests.
            company_domain (str): The domain of the company for API requests.
            timeout (float | tuple, optional): Default timeout, in seconds,
                applied to every request made through `get`. Passed straight
                to `requests`, so a `(connect, read)` tuple is also accepted.
        """
        self.api_key = api_key
        self.company_domain = company_domain
        self.timeout = timeout
        self.base_url = self.BASE_URL.format(company_domain=company_domain)
        self.session = self._create_session()
        # NOTE(review): the key is sent verbatim as the Basic credential. HTTP
        # Basic normally expects base64("user:password") -- confirm that
        # callers pass an already-encoded value before using the real API.
        self.headers = {
            "Authorization": f"Basic {self.api_key}",
            "Accept": "application/json",
        }

    @staticmethod
    def _create_session() -> "requests.Session":
        """
        Create a session with a retry strategy for handling transient errors.

        GET requests answered with HTTP 429 or 503 are retried up to 5 times
        with a backoff factor of 2.

        Returns:
            requests.Session: A configured session object with retry strategy.
        """
        retry_strategy = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 503],
            allowed_methods=["GET"],
        )

        # Create an adapter with the retry strategy
        adapter = HTTPAdapter(max_retries=retry_strategy)

        # Mount the adapter for both schemes so the retry policy still applies
        # if BASE_URL is ever pointed at a plain-http endpoint.
        session = requests.Session()
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        return session

    def get(self, endpoint_key: str, params: dict = None, timeout=None) -> "dict | list":
        """
        Fetch data from a specified endpoint using the endpoint key.

        Args:
            endpoint_key (str): The key for the desired endpoint (e.g., "posts").
            params (dict, optional): Query parameters to include in the request.
            timeout (float | tuple, optional): Per-call timeout in seconds.
                Falls back to the client's default when omitted.

        Returns:
            dict | list: The decoded JSON response from the API.

        Raises:
            ValueError: If an invalid endpoint key is provided.
            requests.exceptions.RequestException: If the request fails.

        Example:
            >>> client = BambooHRClient(api_key="my_api_key", company_domain="typicode")
            >>> posts = client.get("posts")
            >>> print(posts)
        """
        endpoint = self.ENDPOINTS.get(endpoint_key)
        if not endpoint:
            raise ValueError(f"Invalid endpoint key: {endpoint_key}")
        url = f"{self.base_url}{endpoint}"
        effective_timeout = self.timeout if timeout is None else timeout
        response = self.session.get(
            url,
            headers=self.headers,
            params=params,
            timeout=effective_timeout,
        )
        # Surface 4xx/5xx responses as exceptions instead of returning error bodies.
        response.raise_for_status()
        return response.json()


In [19]:
# Read the key from the environment so a real credential never has to be typed
# into (and saved with) the notebook; the placeholder keeps the demo runnable.
API_KEY = os.environ.get("BAMBOOHR_API_KEY", "<your_api_key>")
COMPANY_DOMAIN = "typicode"

# Initialize the client
client = BambooHRClient(
    api_key=API_KEY, 
    company_domain=COMPANY_DOMAIN
)

# Fetch each configured endpoint once; later cells reuse these results.
posts = client.get("posts")
albums = client.get("albums")
users = client.get("users")

# Show only the first record of each response to keep the output short.
print(posts[:1])
print("\n")
print(albums[:1])
print("\n")
print(users[:1])

[{'userId': 1, 'id': 1, 'title': 'sunt aut facere repellat provident occaecati excepturi optio reprehenderit', 'body': 'quia et suscipit\nsuscipit recusandae consequuntur expedita et cum\nreprehenderit molestiae ut ut quas totam\nnostrum rerum est autem sunt rem eveniet architecto'}]


[{'userId': 1, 'id': 1, 'title': 'quidem molestiae enim'}]


[{'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}}]


In [5]:
# Create a data directory and save the json outputs
# os.makedirs("data", exist_ok=True)

# def save_json_data(data: dict, filename: str) -> None:
#     with open(f"data/{filename}", "w") as f:
#         json.dump(data, f, indent=4)
#     print(f"{filename} file saved successfully")

# for data, data_name in zip([posts, albums, users], ["posts", "albums", "users"]):
#     save_json_data(data, f"{data_name}.json")

In [6]:
def convert_to_json_string(data: List[dict]) -> list:
    """Serialize every record in `data` to its own JSON string.

    Values that `json` cannot encode natively are converted with `str`.
    """
    serialized = []
    for item in data:
        serialized.append(json.dumps(item, default=str))
    return serialized

# Serialize the three API responses, keeping one list of JSON strings per endpoint.
posts_string, albums_string, users_string = (
    convert_to_json_string(payload) for payload in (posts, albums, users)
)

### Store the raw JSON as schemaless Delta tables

In [None]:
# Build the session configuration step by step: start from a named builder,
# then register the Delta SQL extension and the Delta catalog.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Pass the builder through configure_spark_with_delta_pip before creating
# (or reusing) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [8]:
# Wrap each list of JSON strings in a single string-typed column, so the
# payload is stored as-is rather than parsed into typed fields.
json_string_type = StringType()
posts_df = spark.createDataFrame(posts_string, schema=json_string_type)
albums_df = spark.createDataFrame(albums_string, schema=json_string_type)
users_df = spark.createDataFrame(users_string, schema=json_string_type)

In [9]:
# TODO: Add airflow logical date column and any other necessary columns

In [10]:
def generate_timestamp(df: DataFrame) -> DataFrame:
    """Return `df` with a `processed_at` column holding Spark's current timestamp."""
    processed_at = F.current_timestamp()
    return df.withColumn("processed_at", processed_at)

In [11]:
# Stamp all three frames with the processing time.
posts_df, albums_df, users_df = map(
    generate_timestamp, (posts_df, albums_df, users_df)
)


In [12]:
# Print the schema of each frame in turn: posts, albums, users.
for frame in (posts_df, albums_df, users_df):
    frame.printSchema()

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)



In [13]:
# Preview the first three untruncated rows of each frame.
for frame in (posts_df, albums_df, users_df):
    frame.show(n=3, truncate=False)

                                                                                

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|value                                                                                                                                                                                                                                                                                     |processed_at          |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
|{"userId": 1, "id": 1, "title": "sunt aut facere repellat provident occaeca

In [None]:
# Saving DataFrames: each frame is repartitioned to a single partition and
# written in Delta format, replacing whatever is already at the target path.
for frame, target_path in (
    (posts_df, "data/delta_tables/posts"),
    (albums_df, "data/delta_tables/albums"),
    (users_df, "data/delta_tables/users"),
):
    delta_writer = frame.repartition(1).write.format("delta").mode("overwrite")
    delta_writer.save(target_path)

25/01/10 18:00:38 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [15]:
# Reload the three tables from their Delta locations (a fresh reader per path).
posts_df, albums_df, users_df = (
    spark.read.format("delta").load(location)
    for location in (
        "data/delta_tables/posts",
        "data/delta_tables/albums",
        "data/delta_tables/users",
    )
)

In [16]:
# Preview the first three untruncated rows of each reloaded table.
for reloaded in (posts_df, albums_df, users_df):
    reloaded.show(n=3, truncate=False)

                                                                                

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|value                                                                                                                                                                                                                                                                                     |processed_at           |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|{"userId": 1, "id": 1, "title": "sunt aut facere repellat provident occa

                                                                                

+-------------------------------------------------------------------+-----------------------+
|value                                                              |processed_at           |
+-------------------------------------------------------------------+-----------------------+
|{"userId": 1, "id": 1, "title": "quidem molestiae enim"}           |2025-01-10 18:00:45.997|
|{"userId": 1, "id": 2, "title": "sunt qui excepturi placeat culpa"}|2025-01-10 18:00:45.997|
|{"userId": 1, "id": 3, "title": "omnis laborum odio"}              |2025-01-10 18:00:45.997|
+-------------------------------------------------------------------+-----------------------+
only showing top 3 rows

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------