In [0]:
%run ./_udf_utils_residents

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, explode_outer, current_timestamp, sum
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType
import pandas as pd

In [0]:
def process_data(bronze_df):
    """Flatten the bronze JSON payload into one row per array element.

    The `data` column of `bronze_df` is parsed as a JSON string encoding an
    array of objects with the keys `국적지역` (string), `년` (integer) and
    `등록외국인 수` (integer).

    Args:
        bronze_df: Spark DataFrame that has a `data` column containing the
            JSON string.

    Returns:
        Spark DataFrame with exactly three columns: `Nationality`, `Year`
        and `Amount`, one row per element of the parsed array.
    """
    # Shape of a single object inside the JSON array.
    element_fields = [
        StructField("국적지역", StringType(), False),
        StructField("년", IntegerType(), False),
        StructField("등록외국인 수", IntegerType(), False),
    ]
    payload_type = ArrayType(StructType(element_fields))

    # from_json turns the JSON string into an array of structs whose fields
    # mirror the key/value pairs of the original JSON objects.
    parsed_payload = from_json(col("data"), payload_type)

    # explode_outer emits one row per array element under the default column
    # name `col`; unlike explode, a null or empty array still yields one row
    # (with a null element) instead of being dropped.
    return (
        bronze_df
        .withColumn("data_parsed", parsed_payload)
        .select(explode_outer(col("data_parsed")))
        .select(
            # Promote each struct field to a top-level, English-named column.
            col("col.국적지역").alias("Nationality"),
            col("col.년").alias("Year"),
            # Backticks are needed because the field name contains a space.
            col("col.`등록외국인 수`").alias("Amount"),
        )
    )



In [0]:
# 1. Load: get (or reuse) the Spark session and reference the bronze table.
spark = (
    SparkSession.builder
    .appName("silver_resident_longterm")
    .getOrCreate()
)
bronze_df = spark.table("workspace.growth_poc.bronze_residents_longterm")

In [0]:
# Build the silver table from `bronze_df`. The Spark session and `bronze_df`
# are created by the preceding cell, so they are not re-created here.

# 2. Flatten data: one row per (Nationality, Year, Amount) record.
processed_df = process_data(bronze_df)

# 3. Clean up nationality, then re-aggregate per (Nationality, Year).
# NOTE(review): presumably clean_nationality can collapse several raw labels
# onto one value, which is why amounts are summed again — confirm against
# _udf_utils_residents.
normalized_df = clean_nationality(processed_df)
cleaned_df = normalized_df.groupBy("Nationality", "Year").agg(
    # `sum` here is pyspark.sql.functions.sum (imported at the top of the
    # notebook), not the Python builtin.
    sum("Amount").alias("Amount")
)

# 4. Apply mapping and stamp each row with the load time.
mapped_df = map_nationality(cleaned_df)
final_df = mapped_df.withColumn("TimeStamp", current_timestamp())

# 5. Replace the contents of the silver Delta table with this run's result.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", True)
    .saveAsTable("workspace.growth_poc.silver_residents_longterm")
)