In [1]:
import requests
import json
import boto3

from awsglue.context import GlueContext
from pyspark.context import SparkContext

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.8 
Trying to create a Glue session for the kernel.
Session Type: glueetl
Session ID: 3e5226b5-f563-416e-9275-ca58525c18f2
Applying the following default arguments:
--glue_kernel_version 1.0.8
--enable-glue-datacatalog true
Waiting for session 3e5226b5-f563-416e-9275-ca58525c18f2 to get into ready status...
Session 3e5226b5-f563-416e-9275-ca58525c18f2 has been created.



In [2]:
# Initialize Glue and Spark.
# Obtain the SparkContext (getOrCreate reuses one if it already exists),
# wrap it in a GlueContext, and take the Spark session from that context.
# Later cells rely on the names `sc` and `spark`.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session




In [3]:
# --- Call the API ---
# datastore_search is paginated: a single call returns at most `limit` rows
# (the default is small), so one request would silently truncate the dataset.
# Page through with limit/offset until every row reported in `total` is
# collected, or the server returns an empty page.
dataset_id = "d_23f946fa557947f93a8043bbef41dd09"
url = f"https://data.gov.sg/api/action/datastore_search?resource_id={dataset_id}"
page_size = 1000       # rows requested per call
request_timeout = 30   # seconds; avoid hanging the Glue session on a stalled connection

records = []
while True:
    # `params` is merged into the query string already present in `url`.
    response = requests.get(
        url,
        params={"limit": page_size, "offset": len(records)},
        timeout=request_timeout,
    )
    # Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of hitting
    # a confusing KeyError/JSON error further down.
    response.raise_for_status()

    result = response.json()["result"]
    page = result["records"]
    records.extend(page)

    total = result.get("total")
    # Stop on an empty page (nothing left) or once all reported rows are in hand.
    if not page or (total is not None and len(records) >= total):
        break




In [4]:
# --- Convert JSON to Spark DataFrame ---
# Serialise the whole record list as one JSON document, place that single
# string in an RDD, and let Spark's JSON reader parse it into a DataFrame.
records_json = json.dumps(records)
json_rdd = sc.parallelize([records_json])
df = spark.read.json(json_rdd)




In [5]:
# --- Save to temporary S3 folder ---
# Destination settings; `bucket`, `temp_prefix` and `final_key` are reused by
# the rename step that follows.
bucket = "hdb-carpark-info"
temp_prefix = "temp-output/"
final_key = "HDBCarparkInformation.csv"
temp_s3_path = f"s3://{bucket}/{temp_prefix}"

# Collapse to a single partition before writing, then write CSV with a header
# row, overwriting whatever is already under the temporary prefix.
single_partition_df = df.coalesce(1)
csv_writer = single_partition_df.write.mode("overwrite").option("header", True)
csv_writer.csv(temp_s3_path)




In [6]:
# --- Rename part file to desired CSV name ---
# The CSV was written under `temp_prefix` with a generated part-* name; copy
# it to `final_key` and then remove the temporary objects.
s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket=bucket, Prefix=temp_prefix)

# "Contents" is absent from the listing when nothing matches the prefix,
# so default to an empty list instead of raising KeyError.
temp_objects = response.get("Contents", [])

# Find part-*.csv file
part_file_key = next(
    (obj["Key"] for obj in temp_objects if obj["Key"].endswith(".csv")),
    None,
)
if part_file_key is None:
    # Explicit, descriptive failure instead of a bare StopIteration.
    raise FileNotFoundError(
        f"No .csv part file found under s3://{bucket}/{temp_prefix}"
    )

# Copy it to final destination
s3.copy_object(
    Bucket=bucket,
    CopySource={"Bucket": bucket, "Key": part_file_key},
    Key=final_key
)

# (Optional) Clean up temp files -- only reached after the copy succeeded
for obj in temp_objects:
    s3.delete_object(Bucket=bucket, Key=obj["Key"])

{'ResponseMetadata': {'RequestId': '0TQ06MB6X41C6S0F', 'HostId': '12mgTSzt1wQb0Uwkj/kIzUzIrNbbKZkrza+/UqdkQ8vvSE26fgdQEGIzNc+u8XN0DXblLyR+amQ=', 'HTTPStatusCode': 204, 'HTTPHeaders': {'x-amz-id-2': '12mgTSzt1wQb0Uwkj/kIzUzIrNbbKZkrza+/UqdkQ8vvSE26fgdQEGIzNc+u8XN0DXblLyR+amQ=', 'x-amz-request-id': '0TQ06MB6X41C6S0F', 'date': 'Wed, 09 Apr 2025 15:41:24 GMT', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [7]:
# Report the final S3 location of the exported CSV (built from `bucket` and `final_key`).
print(f"✅ CSV saved as s3://{bucket}/{final_key}")

✅ CSV saved as s3://hdb-carpark-info/HDBCarparkInformation.csv
