In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, explode_outer, when, lit, sum
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType
import pandas as pd

In [0]:
# Bronze -> silver pipeline for the short-term resident data.
# NOTE: clean_data() and map_nationality() are defined in cells further down
# this notebook, so those cells must have been executed before this one.

# Step 1: obtain (or reuse) the Spark session and load the bronze table.
spark = (
    SparkSession.builder
    .appName("silver_resident_shortterm")
    .getOrCreate()
)
bronze_df = spark.table("workspace.growth_poc.bronze_residents_shortterm")

# Step 2: parse the raw JSON, normalise nationality labels and aggregate.
cleaned_df = clean_data(bronze_df)

# Step 3: attach the columns of the nationality mapping file.
mapped_df = map_nationality(cleaned_df)

# Step 4: persist the result as a Delta table, overwriting any existing data.
(
    mapped_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.growth_poc.silver_residents_shortterm")
)

In [0]:
def clean_data(bronze_df):
    """Parse, normalise and aggregate the bronze short-term resident records.

    The ``data`` column of ``bronze_df`` is read as a JSON string holding an
    array of objects. Each object becomes one row with the columns
    ``Nationality``, ``Year`` and ``Amount``. Nationality labels considered
    too vague are replaced with null, known spelling variants are rewritten
    to a single canonical label, and ``Amount`` is summed per
    (``Nationality``, ``Year``) pair.

    Args:
        bronze_df: Spark DataFrame with a ``data`` column containing the JSON
            string described above.

    Returns:
        Spark DataFrame with the columns ``Nationality``, ``Year`` and
        ``Amount`` (the summed amount), one row per distinct
        (Nationality, Year) pair.
    """
    # Shape of a single JSON object inside the `data` array.
    record_type = StructType([
        StructField("국적지역", StringType(), False),
        StructField("년", IntegerType(), False),
        StructField("단기체류외국인 수", IntegerType(), False),
    ])

    # Decode the JSON string into an array of structs, then emit one row per
    # array element. The exploded struct lands in a column named "col".
    # explode_outer (unlike explode) still emits a row, holding a null struct,
    # when the parsed array is null or empty.
    records_df = (
        bronze_df
        .withColumn("data_parsed", from_json(col("data"), ArrayType(record_type)))
        .select(explode_outer(col("data_parsed")))
    )

    # Promote each struct field to a top-level column with an English name.
    flat_df = records_df.select(
        col("col.국적지역").alias("Nationality"),
        col("col.년").alias("Year"),
        col("col.`단기체류외국인 수`").alias("Amount"),
    )

    # Labels too vague to attribute to a country; they are replaced with null.
    vague_labels = [
        "기타", "국제연합", "미등록국가", "교황청", "무국적",
        "영국속국민", "영국속령지시민", "영국외지민", "영국외지시민", "영국해외영토시민",
    ]

    # Canonical label -> every variant spelling that should be rewritten to it.
    variants_by_canonical = {
        "러시아": ["러시아(연방)", "러시아연방", "한국계러시아인"],    # Russia
        "남수단": ["남수단공화국"],                                  # South Sudan
        "조지아": ["그루지야"],                                      # Georgia
        "프랑스": ["불령가이아나", "프랑스령 가이아나", "마르티니크"],  # France and its territories
        "아르메니아": ["아르메"],                                    # Armenia
        "세르비아": ["세르비아몬테네그로"],                          # Serbia
        "홍콩": ["홍콩거주난민"],                                    # Hong Kong
        "중국": ["한국계중국인", "마카오"],                          # China
        "남아프리카": ["남아프리카공화국"],                          # South Africa
        # NOTE(review): the Dominican Republic and the Commonwealth of
        # Dominica are different countries but share one label here;
        # confirm this merge is intended.
        "도미니카": ["도미니카공화국", "도미니카연방"],
        "티모르": ["티모르민주공화국"],                              # Timor
        "미국": ["미국인근섬"],                                      # United States
        "세인트빈센트": ["세인트빈센트그레나딘"],                    # Saint Vincent
        "세인트키츠네비스": ["세인트크리스토퍼네비스"],              # Saint Kitts and Nevis
        "영국": ["영령인도양섬"],                                    # United Kingdom
    }
    # Invert into the {variant: canonical} form expected by DataFrame.replace.
    canonical_by_variant = {
        variant: canonical
        for canonical, variants in variants_by_canonical.items()
        for variant in variants
    }

    nationality = col("Nationality")
    normalised_df = (
        flat_df
        .select(
            # Null out the vague labels first ...
            when(nationality.isin(vague_labels), lit(None))
            .otherwise(nationality)
            .alias("Nationality"),
            "Year",
            "Amount",
        )
        # ... then collapse the variant spellings onto their canonical label.
        .replace(canonical_by_variant, subset=["Nationality"])
    )

    # Rows that now share a (Nationality, Year) pair are merged by summing.
    # `sum` is pyspark.sql.functions.sum from the file's imports, not the builtin.
    return normalised_df.groupBy("Nationality", "Year").agg(
        sum("Amount").alias("Amount")
    )


In [0]:
# Default location of the nationality mapping CSV inside the workspace repo.
NATIONALITY_MAPPING_CSV = "/Workspace/Repos/o3oynyn@gmail.com/data-analysis_business-growth-opportunity/silver/Nationality_mapping.csv"


def map_nationality(cleaned_df, mapping_path=NATIONALITY_MAPPING_CSV):
    """Left-join the cleaned records with the nationality mapping file.

    The mapping CSV is loaded with pandas, converted to a Spark DataFrame and
    joined on ``cleaned_df.Nationality == mapping.Korean_Nationality``. Every
    row of ``cleaned_df`` is kept; rows without a match get nulls in the
    mapping columns.

    Relies on the module-level ``spark`` session created in the pipeline cell.

    Args:
        cleaned_df: Spark DataFrame with at least a ``Nationality`` column.
        mapping_path: Path of the mapping CSV. It must contain a
            ``Korean_Nationality`` column. Defaults to the file in the
            workspace repo, so existing callers are unaffected.

    Returns:
        Spark DataFrame with all columns of ``cleaned_df`` followed by all
        mapping columns except ``Korean_Nationality``.
    """
    # NOTE(review): pandas' default NA parsing turns literal cells such as
    # "NA" into NaN. If the mapping holds ISO alpha-2 codes (Namibia is "NA"),
    # confirm they survive loading.
    mapping_pdf = pd.read_csv(mapping_path)

    # Convert the pandas frame to a Spark DataFrame so it can be joined.
    mapping_df = spark.createDataFrame(mapping_pdf)

    joined_df = cleaned_df.join(
        mapping_df,
        cleaned_df["Nationality"] == mapping_df["Korean_Nationality"],
        how="left",
    )

    # Bind every selected column to its source DataFrame so a name that exists
    # on both sides of the join is not an ambiguous reference.
    left_columns = [cleaned_df[name] for name in cleaned_df.columns]
    mapping_columns = [
        mapping_df[name]
        for name in mapping_df.columns
        if name != "Korean_Nationality"  # join key duplicates Nationality
    ]
    return joined_df.select(left_columns + mapping_columns)
