### API Requests

In [16]:
import json
import os
from typing import Any, List, Optional

import pyspark
import requests
from delta import configure_spark_with_delta_pip
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
class Endpoints:
    """ Namespace holding the base URL template and the resource paths used by the client

    NOTE: the active BASE_URL points at the jsonplaceholder demo service;
    the BambooHR gateway URL is kept below, commented out, for reference.
    """
    # BASE_URL = "https://api.bamboohr.com/api/gateway.php/{company_domain}/v1"
    BASE_URL = "https://jsonplaceholder.{company_domain}.com"

    # Resource paths, appended verbatim to the formatted BASE_URL.
    _POSTS_PATH = "/posts"
    _ALBUMS_PATH = "/albums"
    _USERS_PATH = "/users"

    @staticmethod
    def posts():
        """ Path of the posts collection """
        return Endpoints._POSTS_PATH

    @staticmethod
    def albums():
        """ Path of the albums collection """
        return Endpoints._ALBUMS_PATH

    @staticmethod
    def users():
        """ Path of the users collection """
        return Endpoints._USERS_PATH

In [3]:
class BambooHRClient:
    """ A client for the BambooHR API

    All requests share one `requests.Session` whose HTTPS adapter retries
    GET requests that come back with HTTP 429 or 503.
    """
    # requests applies no timeout on its own, so every call gets an explicit
    # bound (in seconds) unless the caller overrides it.
    DEFAULT_TIMEOUT = 30

    def __init__(self, api_key, company_domain):
        """ Initialize the BambooHRClient with an API key and company domain 

        Args:
            api_key (str): The API key for the BambooHR API
            company_domain (str): The domain for the company
        """
        self.api_key = api_key
        self.company_domain = company_domain
        self.base_url = Endpoints.BASE_URL.format(company_domain=company_domain)
        self.session = self._create_session()
        # NOTE(review): the key is sent verbatim after "Basic". HTTP Basic
        # auth normally carries base64("user:password") -- confirm whether
        # callers are expected to pass an already-encoded value.
        self.headers = {
            "Authorization": f"Basic {self.api_key}",
            "Accept": "application/json",
        }
    
    @staticmethod
    def _create_session() -> requests.Session:
        """ Create a session with a retry strategy

        Returns:
            requests.Session: A session whose "https://" adapter retries GET
            requests up to 5 times on HTTP 429/503, with backoff_factor=2.
        """
        retry_strategy = Retry(
            total=5,
            backoff_factor=2,
            status_forcelist=[429, 503],
            allowed_methods=["GET"],
        )

        # Create an adapter with the retry strategy
        adapter = HTTPAdapter(max_retries=retry_strategy)

        # Create a session and mount the adapter
        session = requests.Session()
        session.mount("https://", adapter)

        return session

    def get(
        self,
        endpoint: str,
        params: Optional[dict] = None,
        timeout: Optional[float] = DEFAULT_TIMEOUT,
    ) -> Any:
        """ Make a GET request to the BambooHR API

        Args:
            endpoint (str): The endpoint path, appended to the base URL
            params (dict): The query parameters for the request
            timeout (float): Seconds to wait for the server before giving up;
                pass None to wait indefinitely

        Returns:
            The decoded JSON body of the response (a list or a dict,
            depending on the endpoint)

        Raises:
            requests.HTTPError: If the response has a 4xx/5xx status
            requests.RequestException: For other request failures, such as
                timeouts or connection errors
        """
        url = f"{self.base_url}{endpoint}"
        response = self.session.get(
            url,
            headers=self.headers,
            params=params,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()

    def get_posts(self) -> List[dict]:
        """ Fetch the posts collection """
        return self.get(Endpoints.posts())

    def get_albums(self) -> List[dict]:
        """ Fetch the albums collection """
        return self.get(Endpoints.albums())

    def get_users(self) -> List[dict]:
        """ Fetch the users collection """
        return self.get(Endpoints.users())

In [31]:
# Read the key from the environment so a real secret never has to be pasted
# into the notebook; the placeholder is kept as the fallback value.
API_KEY = os.environ.get("BAMBOOHR_API_KEY", "<your_api_key>")
COMPANY_DOMAIN = "typicode"

# Initialize the client
client = BambooHRClient(
    api_key=API_KEY, 
    company_domain=COMPANY_DOMAIN
)

# Fetch each collection once; the results are reused by the cells below.
posts = client.get_posts()
albums = client.get_albums()
users = client.get_users()

# Preview only the first record of each collection to keep the output short.
print(posts[:1])
print("\n")
print(albums[:1])
print("\n")
print(users[:1])

[{'userId': 1, 'id': 1, 'title': 'sunt aut facere repellat provident occaecati excepturi optio reprehenderit', 'body': 'quia et suscipit\nsuscipit recusandae consequuntur expedita et cum\nreprehenderit molestiae ut ut quas totam\nnostrum rerum est autem sunt rem eveniet architecto'}]


[{'userId': 1, 'id': 1, 'title': 'quidem molestiae enim'}]


[{'id': 1, 'name': 'Leanne Graham', 'username': 'Bret', 'email': 'Sincere@april.biz', 'address': {'street': 'Kulas Light', 'suite': 'Apt. 556', 'city': 'Gwenborough', 'zipcode': '92998-3874', 'geo': {'lat': '-37.3159', 'lng': '81.1496'}}, 'phone': '1-770-736-8031 x56442', 'website': 'hildegard.org', 'company': {'name': 'Romaguera-Crona', 'catchPhrase': 'Multi-layered client-server neural-net', 'bs': 'harness real-time e-markets'}}]


In [None]:
# Optional (currently disabled): create a data directory and save the raw JSON outputs
# os.makedirs("data", exist_ok=True)

# def save_json_data(data: dict, filename: str) -> None:
#     with open(f"data/{filename}", "w") as f:
#         json.dump(data, f, indent=4)
#     print(f"{filename} file saved successfully")

# for data, data_name in zip([posts, albums, users], ["posts", "albums", "users"]):
#     save_json_data(data, f"{data_name}.json")

In [6]:
def convert_to_json_string(data: List[dict]) -> list:
    """ Serialize every record in `data` to its JSON text form

    Values that `json` cannot encode natively are rendered with `str()`.

    Args:
        data (List[dict]): The records to serialize

    Returns:
        list: One JSON string per input record, in the same order
    """
    serialized = []
    for item in data:
        serialized.append(json.dumps(item, default=str))
    return serialized

# Serialize each API payload into a list of JSON strings.
posts_string, albums_string, users_string = (
    convert_to_json_string(payload) for payload in (posts, albums, users)
)

### Store the data as schemaless Delta tables

In [7]:
# Configure the session builder step by step: application name first, then
# the Delta SQL extension and the Delta catalog.
builder = pyspark.sql.SparkSession.builder
builder = builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Let the delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


25/01/10 15:28:11 WARN Utils: Your hostname, emif-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.136 instead (on interface en0)
25/01/10 15:28:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/emif/Documents/python-tests/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/emif/.ivy2/cache
The jars for the packages stored in: /Users/emif/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-72a1dbf5-dd29-45f3-bb27-2087a88d7084;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.3.0 in central
	found io.delta#delta-storage;3.3.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 321ms :: artifacts dl 17ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.3.0 from central in [default]
	io.delta#delta-storage;3.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0  

In [22]:
# Build one single-column string DataFrame per collection; each row holds
# one record as raw JSON text.
posts_df, albums_df, users_df = (
    spark.createDataFrame(json_rows, schema=StringType())
    for json_rows in (posts_string, albums_string, users_string)
)

In [26]:
# TODO: Add the Airflow logical date column and any other required metadata columns

In [24]:
def generate_timestamp(df: DataFrame) -> DataFrame:
    """ Return `df` with a `processed_at` column holding the current timestamp

    Args:
        df (DataFrame): The DataFrame to stamp

    Returns:
        DataFrame: A DataFrame with `processed_at` added (or replaced, if a
        column of that name already exists)
    """
    processed_at = F.current_timestamp()
    return df.withColumn(colName="processed_at", col=processed_at)

In [25]:
# Stamp every DataFrame with its processing time.
posts_df, albums_df, users_df = (
    generate_timestamp(frame) for frame in (posts_df, albums_df, users_df)
)


In [32]:
# Print the schema of each DataFrame in turn.
for frame in (posts_df, albums_df, users_df):
    frame.printSchema()

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = true)

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = true)

root
 |-- value: string (nullable = true)
 |-- processed_at: timestamp (nullable = true)



In [27]:
# Preview the first three untruncated rows of each DataFrame.
for frame in (posts_df, albums_df, users_df):
    frame.show(n=3, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|value                                                                                                                                                                                                                                                                                     |processed_at           |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|{"userId": 1, "id": 1, "title": "sunt aut facere repellat provident occa

In [28]:
# Persist each DataFrame as a Delta table, collapsed to a single partition
# and overwriting whatever is already stored at the target path.
for frame, target_path in (
    (posts_df, "data/delta_tables/posts"),
    (albums_df, "data/delta_tables/albums"),
    (users_df, "data/delta_tables/users"),
):
    frame.repartition(1).write.save(target_path, format="delta", mode="overwrite")

                                                                                

In [29]:
# Load the stored Delta tables back into DataFrames.
posts_df, albums_df, users_df = (
    spark.read.load(table_path, format="delta")
    for table_path in (
        "data/delta_tables/posts",
        "data/delta_tables/albums",
        "data/delta_tables/users",
    )
)

In [30]:
# Preview the first three untruncated rows of each reloaded DataFrame.
for frame in (posts_df, albums_df, users_df):
    frame.show(n=3, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|value                                                                                                                                                                                                                                                                                     |processed_at           |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|{"userId": 1, "id": 1, "title": "sunt aut facere repellat provident occa