In [0]:
# Install dependencies (uncomment and run if they are not already installed)
# %pip install requests
# %pip install python-dotenv

In [0]:
import requests
import os
from dotenv import load_dotenv

# read data from api
def read_data(base_url, endpoint, api_key, per_page=500, timeout=30):
    """Fetch every page of one API endpoint and return the pages as a list.

    Pages are requested starting at page 1. Paging stops when a page comes
    back with no records or when a response has a non-200 status code.

    Args:
        base_url: URL prefix; concatenated directly with ``endpoint``.
        endpoint: Path appended to ``base_url`` to form the request URL.
        api_key: Sent as the ``serviceKey`` query parameter.
        per_page: Sent as the ``perPage`` query parameter (default 500).
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block forever (default 30).

    Returns:
        A list with one element per fetched page; each element is the value
        of the ``"data"`` key of that page's JSON body. If a request returns
        a non-200 status, the pages collected so far are returned.

    Raises:
        Exceptions raised by ``requests.get`` (for example on timeout) or by
        ``response.json()`` are not caught here and propagate to the caller.
    """
    url = f"{base_url}{endpoint}"
    all_data = []
    page = 1

    while True:
        params = {
            "page": page,
            "perPage": per_page,
            "serviceKey": api_key,
        }
        response = requests.get(url, params=params, timeout=timeout)

        # error on request -> report it and escape the loop
        if response.status_code != 200:
            print(f"error code: {response.status_code}")
            print(f"error message: {response.text}")
            # side note: if this was a ELT pipeline, unless the error is 4xx,
            # I would have implemented retry mechanism with incremental gaps
            # between the tries before failing the pipeline, maybe three times.
            break

        # A missing "data" key and an explicit null are both treated as "no records".
        new_data = response.json().get("data") or []
        if len(new_data) == 0:
            break

        # Keep each page as its own list; callers flatten the pages later.
        all_data.append(new_data)
        page += 1

    return all_data

# Load variables from a local .env file into the process environment.
load_dotenv()

# os.getenv returns None when the variable is unset; fail here with a clear
# message rather than sending requests without a key.
api_key = os.getenv("API_KEY")
if not api_key:
    raise RuntimeError(
        "API_KEY is not set; add it to .env or the environment before running."
    )

base_url = "https://api.odcloud.kr/api"



In [0]:
# Read the endpoint list from a text file: one endpoint per line, with
# surrounding whitespace removed. Blank lines are skipped so they are not
# requested as empty endpoints.
with open("resident_endpoints.txt", "r") as f:
    endpoints_to_read = [line.strip() for line in f if line.strip()]

total_results = {}
# Fetch each endpoint in turn and store its pages keyed by the endpoint.
for i, ep in enumerate(endpoints_to_read, start=1):
    print(f"processing.. {i}/{len(endpoints_to_read)}")
    result_data = read_data(base_url, ep, api_key)
    total_results[ep] = result_data


print("--- All processes finished ---")


In [0]:
# prepare data for dataframe
# I will pair endpoint and values
# currently, total_results is a dictionary with:
# key : endpoint
# value : nested list of json object lists from each page
import json

# Build one (endpoint, json_string) tuple per endpoint.
bronze_data = []
for endpoint, pages in total_results.items():
    # Merge the per-page lists into one flat list of records.
    records = []
    for page_records in pages:
        records.extend(page_records)

    # json.dumps serializes the list to a string; ensure_ascii=False keeps
    # non-ASCII characters (e.g. Korean text) as-is instead of \u escapes.
    serialized = json.dumps(records, ensure_ascii=False)
    bronze_data.append((endpoint, serialized))




In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType

# Get the active Spark session, or create one with this application name.
spark = (
    SparkSession.builder
    .appName("resident_ingest")
    .getOrCreate()
)


In [0]:
# Two non-nullable string columns: "endpoint" and "value".
apidata_schema = StructType(
    [StructField(column, StringType(), False) for column in ("endpoint", "value")]
)
# Each tuple in bronze_data becomes one row.
df = spark.createDataFrame(bronze_data, schema=apidata_schema)


In [0]:
# Save the DataFrame as a Delta table; "overwrite" mode replaces whatever
# the table held before.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.growth_poc.bronze_residents")
)

<b>Flattening the json object here vs in silver layer</b> <br>
I prefer to flatten in the Silver layer because: <br>

<ol>
<li>Preserves the "Single Source of Truth"</li> 
The Bronze layer acts as a historical archive of the source data, exactly as it was received. This is crucial for debugging and tracking history. If I flattened the data here and later realized the business needs a field I had discarded, I could not get it back without re-ingesting from the API. I would also have no history for that field, because it was never ingested in the first place. With the raw JSON, I can simply adjust the Silver layer logic and replay from Bronze.

<li>Resilience to Schema Changes</li> 
APIs often change. If a new nested field is added, the Bronze ingestion pipeline won't break, because it just stores the payload as a string. I can then decide how to handle the new field in the Silver layer without any ingestion downtime.
</ol>