### API Requests

In [10]:
import base64
import json
import os
from datetime import datetime
from typing import List

import boto3
import requests
from botocore.exceptions import BotoCoreError, ClientError
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Spark
from pyspark.sql.types import StringType
from delta import configure_spark_with_delta_pip
from pyspark.sql import DataFrame, SparkSession, functions as F

### BambooHR API Client

In [70]:
# Uses https://jsonplaceholder.typicode.com/ as a stand-in API while building the client logic.

In [6]:
class BambooHRClient:
    """A client for the BambooHR API.

    This client handles authentication, session management, and API requests
    to the BambooHR API or a placeholder API for demonstration purposes.

    Attributes:
        api_key: The API key supplied by the caller.
        company_domain: The company domain substituted into ``BASE_URL``.
        base_url: ``BASE_URL`` with the company domain filled in.
        timeout: Default per-request timeout in seconds.
        session: A ``requests.Session`` with a retry-enabled HTTPS adapter.
        headers: Headers sent with every request (auth + JSON accept).
    """

    # BASE_URL = "https://api.bamboohr.com/api/gateway.php/{company_domain}/v1"
    BASE_URL = "https://jsonplaceholder.{company_domain}.com"
    ENDPOINTS = {
        "posts": "/posts",
        "albums": "/albums",
        "users": "/users",
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    DEFAULT_TIMEOUT = 30

    def __init__(self, api_key, company_domain, timeout=DEFAULT_TIMEOUT):
        """
        Initialize the BambooHRClient with an API key and company domain.

        Args:
            api_key (str): The API key for authenticating requests.
            company_domain (str): The domain of the company for API requests.
            timeout (float, optional): Default timeout in seconds applied to
                every request made through :meth:`get`.
        """
        self.api_key = api_key
        self.company_domain = company_domain
        self.base_url = self.BASE_URL.format(company_domain=company_domain)
        self.timeout = timeout
        self.session = self._create_session()
        self.headers = {
            "Authorization": self._basic_auth_header(api_key),
            "Accept": "application/json",
        }

    @staticmethod
    def _basic_auth_header(api_key: str) -> str:
        """Build an HTTP Basic ``Authorization`` header value for an API key.

        HTTP Basic credentials are the base64 encoding of ``user:password``.
        The API key is used as the user name; the password is a throwaway
        value ("x"), so sending the raw key after ``Basic`` is not valid.
        """
        token = base64.b64encode(f"{api_key}:x".encode("utf-8")).decode("ascii")
        return f"Basic {token}"

    def _create_session(self) -> "requests.Session":
        """Create a session with a retry strategy for handling transient errors."""
        # Retry GETs up to 5 times with exponential backoff, but only for
        # rate limiting (429) and service-unavailable (503) responses.
        retry_strategy = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 503],
            allowed_methods=["GET"],
        )

        # Create an adapter with the retry strategy
        adapter = HTTPAdapter(max_retries=retry_strategy)

        # Create a session and mount the adapter (BASE_URL is HTTPS-only)
        session = requests.Session()
        session.mount("https://", adapter)

        return session

    def get(
        self,
        endpoint_key: str,
        params: "dict | None" = None,
        timeout: "float | None" = None,
    ) -> "list | dict":
        """
        Fetch data from a specified endpoint using the endpoint key.

        Args:
            endpoint_key (str): The key for the desired endpoint (e.g., "posts").
            params (dict, optional): Query parameters to include in the request.
            timeout (float, optional): Per-call timeout in seconds. Falls back
                to the client's default timeout when omitted.

        Returns:
            list | dict: The decoded JSON response from the API.

        Raises:
            ValueError: If an invalid endpoint key is provided.
            requests.exceptions.RequestException: If the request fails or the
                server answers with an error status.

        Example:
            >>> client = BambooHRClient(api_key="my_api_key", company_domain="typicode")
            >>> posts = client.get("posts")
            >>> print(posts)
        """
        endpoint = self.ENDPOINTS.get(endpoint_key)
        if not endpoint:
            raise ValueError(f"Invalid endpoint key: {endpoint_key}")

        url = f"{self.base_url}{endpoint}"
        effective_timeout = self.timeout if timeout is None else timeout
        response = self.session.get(
            url,
            headers=self.headers,
            params=params,
            timeout=effective_timeout,
        )
        # Surface 4xx/5xx responses as exceptions instead of parsing error bodies.
        response.raise_for_status()
        return response.json()


In [7]:
# Placeholder credentials for the sample API.
API_KEY = "<your_api_key>"
COMPANY_DOMAIN = "typicode"

# Build one client and reuse it for every endpoint.
client = BambooHRClient(api_key=API_KEY, company_domain=COMPANY_DOMAIN)

# Fetch the three resources in a fixed order: posts, albums, users.
posts, albums, users = (
    client.get(endpoint_key) for endpoint_key in ("posts", "albums", "users")
)

# Preview only the first record of each response.
for payload in (posts, albums, users):
    print(payload[:1])

[{'userId': 1, 'id': 1, 'title': 'sunt aut facere repellat provident occaecati excepturi optio reprehenderit', 'body': 'quia et suscipit\nsuscipit recusandae consequuntur expedita et cum\nreprehenderit molestiae ut ut quas totam\nnostrum rerum est autem sunt rem eveniet architecto'}]
[{'userId': 1, 'id': 1, 'title': 'quidem molestiae enim'}]
[{'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}}]


In [73]:
# Optional: create a data directory and save the JSON responses locally to inspect them.

# os.makedirs("data", exist_ok=True)

# def save_json_data(data: dict, filename: str) -> None:
#     with open(f"data/{filename}", "w") as f:
#         json.dump(data, f, indent=4)
#     print(f"{filename} file saved successfully")

# for data, data_name in zip([posts, albums, users], ["posts", "albums", "users"]):
#     save_json_data(data, f"{data_name}.json")

### Transform and save data in Delta format

In [74]:
class DeltaFileManager:
    """Owns a Delta-enabled Spark session and helpers for staging records as Delta tables."""

    def __init__(self, app_name: str = "MyApp"):
        """Build (or reuse) a Spark session configured with the Delta Lake extensions.

        Args:
            app_name (str): Name given to the Spark application.
        """
        delta_settings = {
            "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
            "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        }
        builder = SparkSession.builder.appName(app_name)
        for setting, value in delta_settings.items():
            builder = builder.config(setting, value)
        self.spark = configure_spark_with_delta_pip(builder).getOrCreate()

    def convert_to_json_string(self, data: List[dict]) -> list:
        """
        Serialize every dictionary in ``data`` to a JSON string.

        Values that ``json`` cannot serialize natively are passed through
        ``str`` (via ``default=str``) instead of raising.

        Args:
            data (List[dict]): Records to serialize.

        Returns:
            list: One JSON string per input record, in the same order.
        """
        serialized = []
        for record in data:
            serialized.append(json.dumps(record, default=str))
        return serialized

    def create_schemaless_df(self, json_strings: list) -> DataFrame:
        """Wrap a list of JSON strings in a DataFrame with a single string-typed column."""
        string_schema = StringType()
        return self.spark.createDataFrame(json_strings, string_schema)

    def add_timestamp_column(self, df: DataFrame) -> DataFrame:
        """Return ``df`` with an extra ``processed_at`` column holding the current timestamp."""
        processed_at = F.current_timestamp()
        return df.withColumn("processed_at", processed_at)

    def save_to_delta(
        self, df: DataFrame, path: str, repartition: int = 1, mode: str = "append"
    ):
        """Write ``df`` to ``path`` in Delta format.

        Args:
            df (DataFrame): Data to persist.
            path (str): Destination path of the Delta table.
            repartition (int): Number of partitions to shuffle into before writing.
            mode (str): Spark save mode (defaults to "append").
        """
        writer = df.repartition(repartition).write.format("delta").mode(mode)
        writer.save(path)

In [75]:
# Spin up the Delta-enabled manager
manager = DeltaFileManager()

# Serialize each API response into a list of JSON strings
posts_string, albums_string, users_string = (
    manager.convert_to_json_string(records) for records in (posts, albums, users)
)

# Wrap the JSON strings in schemaless DataFrames and stamp them with the processing time
posts_df, albums_df, users_df = (
    manager.add_timestamp_column(manager.create_schemaless_df(json_strings))
    for json_strings in (posts_string, albums_string, users_string)
)

# Persist every DataFrame to its own Delta table, replacing previous contents
delta_targets = (
    (posts_df, "data/delta_tables/posts"),
    (albums_df, "data/delta_tables/albums"),
    (users_df, "data/delta_tables/users"),
)
for staged_df, table_path in delta_targets:
    manager.save_to_delta(staged_df, table_path, mode="overwrite")

                                                                                

In [76]:
# Print the schema of each staged DataFrame (posts, albums, users)
for staged_df in (posts_df, albums_df, users_df):
    staged_df.printSchema()

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = false)



In [77]:
# Show the first row of each staged DataFrame without truncating the JSON column
for staged_df in (posts_df, albums_df, users_df):
    staged_df.show(1, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|value                                                                                                                                                                                                                                                                                     |processed_at           |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|{"userId": 1, "id": 1, "title": "sunt aut facere repellat provident occa

### Upload JSON outputs to S3

In [11]:
class S3Helper:
    """Utility class for handling S3 operations."""

    def __init__(
            self, 
            aws_access_key_id: str, 
            aws_secret_access_key: str, 
            region_name: str = "us-east-1"
        ):
        """
        Initializes the S3Helper with AWS credentials and region.

        Args:
            aws_access_key_id (str): Your AWS access key ID.
            aws_secret_access_key (str): Your AWS secret access key.
            region_name (str): The AWS region to connect to (default: "us-east-1").
        """
        # The previous default ("eu-east-1") is not an AWS region and
        # contradicted the documented default.
        self.s3_client = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
        )

    def upload_json(self, data: List[dict], s3_path: str, bucket: str):
        """
        Upload JSON data to an S3 bucket.

        The data is serialized with ``json.dumps``, encoded as UTF-8, and
        stored with the ``application/json`` content type.

        Args:
            data (List[dict]): The data to upload.
            s3_path (str): The S3 key/path where the file will be saved.
            bucket (str): The S3 bucket name.

        Raises:
            BotoCoreError, ClientError: Re-raised after logging when the
                upload fails.
            TypeError: If ``data`` contains values ``json.dumps`` cannot
                serialize.
        """
        try:
            json_bytes = json.dumps(data).encode("utf-8")
            self.s3_client.put_object(
                Bucket=bucket,
                Key=s3_path,
                Body=json_bytes,
                # Label the object as JSON instead of the generic binary default.
                ContentType="application/json",
            )
            print(f"Successfully uploaded data to s3://{bucket}/{s3_path}")
        except (BotoCoreError, ClientError) as e:
            # Log for the notebook reader, then let the caller see the failure.
            print("Failed to upload data to S3.", e)
            raise

In [12]:
# Pull settings from a local .env file into the process environment
load_dotenv()

# Read the AWS credentials and region from the environment
aws_access_key_id, aws_secret_access_key, aws_region = (
    os.getenv(variable)
    for variable in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION")
)

# Build the S3 helper from those settings
s3_helper = S3Helper(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region,
)

# Date stamp used as the object name under each dataset prefix
_date = datetime.now().strftime("%Y-%m-%d")

# Upload each dataset to its own prefix in the landing bucket
uploads = (
    (posts, f"posts/{_date}.json"),
    (albums, f"albums/{_date}.json"),
    (users, f"users/{_date}.json"),
)
for records, object_key in uploads:
    s3_helper.upload_json(
        records,
        bucket="landing-bucket-1cc1ed4e8908",
        s3_path=object_key,
    )

Successfully uploaded data to s3://landing-bucket-1cc1ed4e8908/posts/2025-01-14.json
Successfully uploaded data to s3://landing-bucket-1cc1ed4e8908/albums/2025-01-14.json
Successfully uploaded data to s3://landing-bucket-1cc1ed4e8908/users/2025-01-14.json
