## Handling the Gold transformation



In [0]:
%sql

-- Make `demo-taxi` the current catalog, then ensure the schema that the
-- gold tables are written to exists (no-op when it is already there).
USE CATALOG `demo-taxi`;

CREATE SCHEMA IF NOT EXISTS
    `demo-taxi`.`gold-layer`;


In [0]:
%sql

-- Quick look at ten rows of the silver trips table that feeds the
-- aggregations in this notebook.
SELECT *
FROM `demo-taxi`.`silver-layer`.`all_trips`
LIMIT 10;


In [0]:
%sql

-- Ad-hoc aggregate over the silver trips: number of trips plus total and
-- average fare_amount for each pickup_hour value.
SELECT
    pickup_hour,
    COUNT(*)         AS total_trips,
    SUM(fare_amount) AS total_fare,
    AVG(fare_amount) AS avg_fare
FROM `demo-taxi`.`silver-layer`.`all_trips`
GROUP BY pickup_hour;

In [0]:
from pyspark.sql.functions import col, lit, sum, count, avg,round

# Build the hourly fare statistics gold table: one row per
# (taxi_type, pickup_hour) with the trip count and the total / average
# fare_amount, both rounded to 4 decimal places.
hourly_source = spark.read.table("`demo-taxi`.`silver-layer`.`all_trips`")

hourly_metrics = [
    count("*").alias("total_trips"),
    round(sum("fare_amount"), 4).alias("total_fare"),
    round(avg("fare_amount"), 4).alias("avg_fare"),
]
hourly_stats = hourly_source.groupBy("taxi_type", "pickup_hour").agg(*hourly_metrics)

# Every run replaces the table contents ("overwrite").
# NOTE(review): assuming this is a Delta table, mergeSchema only lets the
# write add new columns; dropping or retyping a column would need
# overwriteSchema instead -- confirm which behaviour is intended.
hourly_writer = hourly_stats.write.mode("overwrite").option("mergeSchema", "true")
hourly_writer.saveAsTable("`demo-taxi`.`gold-layer`.`hourly_fare_statistics`")


In [0]:
%sql

-- Inspect the hourly gold table written by the previous cell.
SELECT *
FROM `demo-taxi`.`gold-layer`.`hourly_fare_statistics`;

In [0]:
# Build the daily fare statistics gold table: one row per
# (taxi_type, pickup_date) with the trip count, fare and tip totals, and
# the average tip, distance, duration and fare. Every sum/average is
# rounded to 4 decimal places.
daily_source = spark.table("`demo-taxi`.`silver-layer`.`all_trips`")

daily_metrics = [
    count("*").alias("total_trips"),
    round(sum("fare_amount"), 4).alias("total_fare"),
    round(sum("tip_amount"), 4).alias("total_tip"),
    round(avg("tip_amount"), 4).alias("avg_tip"),
    round(avg("trip_distance"), 4).alias("avg_distance"),
    round(avg("trip_duration_minutes"), 4).alias("avg_duration"),
    round(avg("fare_amount"), 4).alias("avg_fare"),
]
daily_stats = daily_source.groupBy("taxi_type", "pickup_date").agg(*daily_metrics)

# Every run replaces the table contents ("overwrite").
# NOTE(review): assuming this is a Delta table, mergeSchema only lets the
# write add new columns; dropping or retyping a column would need
# overwriteSchema instead -- confirm which behaviour is intended.
daily_writer = daily_stats.write.mode("overwrite").option("mergeSchema", "true")
daily_writer.saveAsTable("`demo-taxi`.`gold-layer`.`daily_fare_statistics`")


In [0]:
%sql

-- Inspect the daily gold table written by the previous cell.
SELECT *
FROM `demo-taxi`.`gold-layer`.`daily_fare_statistics`;