In [22]:
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F

# Create (or reuse) the notebook's Spark session on the cluster master,
# with 1 GB of memory and one core per executor.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", 1)
    .getOrCreate()
)

# Dynamically overwrite partitions: with "dynamic" mode an overwrite replaces
# only the partitions present in the data being written.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Base URI of the HDFS namenode; bronze/silver paths are built on top of it.
hdfs = "hdfs://namenode:8020"

def bronze_read(bpath: str) -> DataFrame:
    """Load the bronze dataset stored at ``bpath``.

    No format is specified, so the session's default data source is used.

    Args:
        bpath: Location of the data to load.

    Returns:
        The DataFrame returned by ``spark.read.load``.
    """
    reader = spark.read
    bronze_df = reader.load(bpath)
    return bronze_df

def s_transformation(df: DataFrame) -> DataFrame:
    """Flatten the ``Body`` column of ``df`` into the silver column layout.

    Every key found in ``Body`` becomes a column, then only the silver
    columns are kept and a ``created`` date column is derived from
    ``trip_start_timestamp``.

    Args:
        df: DataFrame with a ``Body`` column that can be exploded into
            ``key``/``value`` pairs (i.e. a map column).

    Returns:
        A DataFrame holding the silver columns followed by ``created``.
    """
    body = F.col("Body")

    # Collect the distinct keys that occur in any message body.
    exploded = df.select(F.explode(body))
    distinct_keys = exploded.select("key").distinct()
    field_names = distinct_keys.rdd.flatMap(lambda row: row).collect()

    # Promote each discovered key to its own column.
    field_columns = [body.getItem(name).alias(name) for name in field_names]
    flattened = df.select(*field_columns)

    silver_columns = [
        "trip_id",
        "taxi_id",
        "trip_start_timestamp",
        "trip_seconds",
        "trip_miles",
        "fare",
        "tips",
        "tolls",
        "extras",
        "trip_total",
    ]

    # Build the silver DataFrame: keep the selected columns and add the
    # date part of the trip start as "created".
    trip_date = F.to_date("trip_start_timestamp")
    return flattened.select(*silver_columns).withColumn("created", trip_date)

def s_writer(sdf: DataFrame, spath, partitionBy = None, mode = "overwrite") -> None:
    """Write ``sdf`` to ``spath``, optionally partitioned on disk.

    Earlier versions ignored every argument except ``sdf`` and always
    overwrote a fixed path partitioned by ``created``; the arguments are now
    honored.

    Args:
        sdf: DataFrame to persist.
        spath: Destination path handed to ``DataFrameWriter.save``.
        partitionBy: Column name, or iterable of column names, to partition
            the output by. ``None`` (the default) writes without partitioning.
        mode: Save mode passed to ``DataFrameWriter.mode``.
    """
    # Normalize partitionBy to a list so a single name and several names
    # are handled the same way.
    if partitionBy is None:
        partition_cols = []
    elif isinstance(partitionBy, str):
        partition_cols = [partitionBy]
    else:
        partition_cols = list(partitionBy)

    if partition_cols:
        # Co-locate rows of the same partition value before writing, as the
        # original did for "created".
        sdf = sdf.repartition(*partition_cols)

    writer = sdf.write.mode(mode)
    if partition_cols:
        writer = writer.partitionBy(*partition_cols)
    writer.save(spath)
    
# Bronze -> silver run for the Chicago taxi data: load, transform, persist.
df = bronze_read(f"{hdfs}/bronze/chicago/taxi")

sdf = s_transformation(df)

# Store the silver table partitioned by the "created" date column.
s_writer(sdf, f"{hdfs}/silver/chicago/taxi_trips", partitionBy="created")

                                                                                