In [0]:
%run ../residents/_udf_utils_residents

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract, when, lit, sum, current_timestamp, make_date

In [0]:
# Get the active Spark session (or create one) under this job's app name.
spark = (
    SparkSession.builder
    .appName("silver_visitors")
    .getOrCreate()
)

In [0]:
# Single source of truth for the bronze table name, used both for the schema
# lookup and inside the dynamic SQL so the two can never drift apart.
BRONZE_TABLE = "workspace.growth_poc.bronze_visitors"

# Read the bronze table; its column list drives the dynamic UNPIVOT below.
bronze_df = spark.table(BRONZE_TABLE)

# Identifier columns that are kept as-is on every unpivoted row.
key_columns = ["국적", "목적"]

# Every non-key column is turned into a (year_month, amount) pair.
# Names are backtick-quoted, and any backtick inside a name is doubled so an
# unusual column header cannot terminate the quoted identifier early.
columns_to_be_unpivoted = [
    "`" + c.replace("`", "``") + "`"
    for c in bronze_df.columns
    if c not in key_columns
]

# Fail early with a clear message instead of an opaque SQL parse error when
# the table has nothing to unpivot.
if not columns_to_be_unpivoted:
    raise ValueError(
        f"{BRONZE_TABLE} has no columns to unpivot besides {key_columns}"
    )

# Comma-separated column list spliced into the UNPIVOT ... IN (...) clause.
unpivot_columns = ", ".join(columns_to_be_unpivoted)

# String literals use single quotes: they are always parsed as strings,
# whereas double-quoted text is treated as an identifier when
# spark.sql.ansi.doubleQuotedIdentifiers is enabled.
dynamic_sql = f"""
SELECT REPLACE(`국적`, ' ', '') AS nationality,
    REPLACE(`목적`, ' ', '') AS purpose,
    year_month,
    amount
FROM {BRONZE_TABLE} v
UNPIVOT (
    amount for year_month in ({unpivot_columns})
)
"""

# Execute the dynamic SQL and exclude total values: rows coming from the
# "계" column and rows whose (space-stripped) purpose is "소계".
unpivoted_df = spark.sql(dynamic_sql).filter(
    (col("year_month") != "계") & (col("purpose") != "소계")
)


In [0]:
# year_month currently looks like yyyy년mm월; split it into integer year and
# month parts so that a proper date column can be built from them.
extract_datepart_df = (
    unpivoted_df
    .withColumn(
        "year",
        # 4 digits (preferred) or 2 digits at the very start of the string
        regexp_extract(col("year_month"), r"^(\d{4}|\d{2})", 1).cast("int"),
    )
    .withColumn(
        "month",
        # 1-2 digits that follow the first run of non-digit characters
        regexp_extract(col("year_month"), r"\D+(\d{1,2})", 1).cast("int"),
    )
    # amount is cast to long so it can be summed downstream
    .withColumn("amount", col("amount").cast("long"))
)

# If the year was written with only two digits, shift it into the 2000s.
# (The regex above yields either 2 or 4 digits, so "<= 999" only matches
# the two-digit case.)
extract_datepart_df = extract_datepart_df.withColumn(
    "year",
    when(col("year") <= 999, col("year") + 2000).otherwise(col("year")),
)

# Build the first day of the month as a date. The day is passed as an integer
# literal so make_date receives the integer input it documents, rather than a
# string that depends on implicit type coercion.
clean_date_df = extract_datepart_df.withColumn(
    "date", make_date(col("year"), col("month"), lit(1))
)

In [0]:
# Clean nationality and map it to country code and language code.
# NOTE(review): clean_nationality and map_nationality are not defined in this
# notebook; presumably they come from the %run of _udf_utils_residents at the
# top — confirm. Their exact output schema is not visible from here.
clean_nationality_df = clean_nationality(clean_date_df)
mapping_df = map_nationality(clean_nationality_df)

In [0]:
# Nationality values that are grouping rows rather than individual countries;
# they are dropped before aggregating.
nationality_to_remove = [
    "미주",
    "아프리카주",
    "아시아주",
    "구주",
    "중동",
    "전체",
    "대양주",
]

is_grouping_row = col("Nationality").isin(nationality_to_remove)
remove_grouping_df = mapping_df.filter(~is_grouping_row)

# Sum Amount per nationality and year, carrying the mapped country/language
# attributes along as grouping keys. Columns not listed here do not appear in
# the result.
group_keys = [
    "Nationality",
    "Year",
    "English_Nationality",
    "ISO_Code",
    "Primary_Language",
    "Language_Code",
]
grouped_df = (
    remove_grouping_df
    .groupBy(*group_keys)
    .agg(sum("Amount").alias("Amount"))
)

In [0]:
# Stamp every row with the load time before persisting.
final_df = grouped_df.withColumn("TimeStamp", current_timestamp())

# Overwrite the silver Delta table with this run's result, with the
# mergeSchema write option enabled.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", True)
    .saveAsTable("workspace.growth_poc.silver_visitors")
)