<a href="https://colab.research.google.com/github/ducline/edit-data_processing/blob/main/spark/challenges/challenge_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 4
##  Analyze data

- Query table "vehicles_enriched" in gold layer
- Aggregate data by municipality_name (array)
- Calculate:
  - count of vehicles (id) that pass through that municipality
  - sum of the vehicles' speeds

Questions:
  - What are the top 3 municipalities by number of vehicle routes?
  - What are the top 3 municipalities with the highest average vehicle speed?


Tips:
- explode array into rows -> https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.explode.html


# Setting up PySpark

In [None]:
%pip install pyspark



In [None]:
!mkdir -p /content/lake/silver/vehicles

In [None]:
!mkdir -p /content/lake/silver/lines

In [None]:
!mkdir -p /content/lake/silver/municipalities

In [3]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.types import *
import requests
import os

class ETLFlow:
    """Generic extract/load helpers built on top of a ``SparkSession``.

    Subclasses combine these primitives into concrete ingestion,
    cleansing and enrichment tasks.
    """

    def __init__(self, spark: SparkSession) -> None:
        """Keep a reference to the Spark session used for every read/write."""
        self.spark = spark

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        """Read ``path`` with the Spark data source named by ``format``.

        Args:
            format: Spark data source name (e.g. ``"parquet"``).
            path: Location to load.
            **kwargs: Forwarded to the reader as data source options.

        Returns:
            The loaded DataFrame.
        """
        return self.spark.read.format(format).options(**kwargs).load(path)

    def extract_from_api(self, url: str, schema: StructType = None, timeout: float = 30) -> DataFrame:
        """Fetch a JSON document from ``url`` and turn it into a DataFrame.

        Args:
            url: HTTP endpoint returning JSON (a list of records, or a
                single record object).
            schema: Optional schema applied while parsing; when omitted,
                Spark infers it from the data.
            timeout: Seconds to wait for the HTTP response.

        Returns:
            A DataFrame with one row per record in the response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # Local import so this class does not rely on a file-level `import json`.
        import json

        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error body.
        response.raise_for_status()

        payload = response.json()
        if isinstance(payload, dict):
            # A single object would otherwise be iterated key by key.
            payload = [payload]

        # spark.read.json expects JSON *text*. Handing it Python dicts makes
        # PySpark call str() on them, producing None/True/False, which are
        # not valid JSON and lead to corrupt (all-null) rows.
        records = [json.dumps(record) for record in payload]
        rdd = self.spark.sparkContext.parallelize(records)

        reader = self.spark.read
        if schema:
            reader = reader.schema(schema)
        return reader.json(rdd)

    def load(self, df: DataFrame, format: str, path: str, partition_column: str = None, **kwargs) -> None:
        """Write ``df`` to ``path``, replacing whatever is already there.

        The DataFrame is coalesced to one partition before writing and the
        save mode is always ``"overwrite"``.

        Args:
            df: Data to persist.
            format: Spark data source name (e.g. ``"parquet"``).
            path: Target location.
            partition_column: Optional column to partition the output by.
            **kwargs: Forwarded to the writer as data source options.
        """
        writer = df.coalesce(1).write.mode("overwrite").format(format).options(**kwargs)
        if partition_column:
            writer = writer.partitionBy(partition_column)
        writer.save(path)

class ETLTask(ETLFlow):
    """Concrete pipeline steps for the Carris Metropolitana data lake.

    Data moves through the layers under ``/content/lake``:
    API -> bronze (raw) -> silver (cleansed) -> gold (enriched), and the
    gold table is finally aggregated per municipality.
    """

    def __init__(self, spark: SparkSession) -> None:
        """Initialise the task with the Spark session used by every step."""
        super().__init__(spark)

    def ingestion_lines(self):
        """Download the lines endpoint and store it in the bronze layer."""
        lines_schema = StructType([
            StructField('color', StringType(), True),
            StructField('facilities', ArrayType(StringType(), True), True),
            StructField('id', StringType(), True),
            StructField('localities', ArrayType(StringType(), True), True),
            StructField('long_name', StringType(), True),
            StructField('municipalities', ArrayType(StringType(), True), True),
            StructField('patterns', ArrayType(StringType(), True), True),
            StructField('routes', ArrayType(StringType(), True), True),
            StructField('short_name', StringType(), True),
            StructField('text_color', StringType(), True),
        ])

        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/lines", schema=lines_schema)
        self.load(df=df, format="parquet", path="/content/lake/bronze/lines")

    def ingestion_vehicles(self):
        """Download the vehicles endpoint and store it in bronze, partitioned by date."""
        vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True),
        ])

        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/vehicles", schema=vehicle_schema)
        # Derive the partition column from the record timestamp.
        df = df.withColumn("date", F.expr("date(timestamp)"))
        self.load(df=df, format="parquet", path="/content/lake/bronze/vehicles", partition_column="date")

    def ingestion_municipalities(self):
        """Download the municipalities endpoint and store it in the bronze layer."""
        municipalities_schema = StructType([
            StructField('name', StringType(), True),
            StructField('district_name', StringType(), True),
            StructField('id', StringType(), True),
        ])

        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/municipalities", schema=municipalities_schema)
        self.load(df=df, format="parquet", path="/content/lake/bronze/municipalities")

    def cleansing_vehicles(self):
        """Clean bronze vehicles and write them to silver, partitioned by date.

        Renames ``lat``/``lon`` to ``latitude``/``longitude``, removes
        duplicate rows and drops rows that lack a status or a position.
        """
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/vehicles")

        df = df.withColumnRenamed("lat", "latitude").withColumnRenamed("lon", "longitude")
        df = df.drop_duplicates()
        df = df.filter(df.current_status.isNotNull())
        # Rows without coordinates are treated as corrupted records.
        df = df.filter(df.latitude.isNotNull() & df.longitude.isNotNull())

        self.load(df=df, format="parquet", path="/content/lake/silver/vehicles", partition_column="date")

    def cleansing_lines(self):
        """Clean bronze lines and write them to silver.

        Removes duplicate rows and rows missing ``long_name`` or ``id``.
        """
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/lines")

        df = df.drop_duplicates()
        df = df.filter(df.long_name.isNotNull())
        # Rows without an id are treated as corrupted records.
        df = df.filter(df.id.isNotNull())

        self.load(df=df, format="parquet", path="/content/lake/silver/lines")

    def cleansing_municipalities(self):
        """Clean bronze municipalities and write them to silver.

        Removes duplicate rows and rows missing ``name``,
        ``district_name`` or ``id``.
        """
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/municipalities")

        df = df.drop_duplicates()
        df = df.filter(df.name.isNotNull() & df.district_name.isNotNull())
        # Rows without an id are treated as corrupted records.
        df = df.filter(df.id.isNotNull())

        self.load(df=df, format="parquet", path="/content/lake/silver/municipalities")

    def enrich(self):
        """Join silver vehicles with silver lines and write the gold table.

        Every vehicle row is kept (left join on ``line_id``) and gains two
        columns: ``line_name`` (the line's ``long_name``) and
        ``municipality_name`` (the line's ``municipalities`` array).
        """
        vehicles_df = self.extract_from_file(format="parquet", path="/content/lake/silver/vehicles")
        lines_df = self.extract_from_file(format="parquet", path="/content/lake/silver/lines")

        enriched_df = vehicles_df \
            .join(lines_df, vehicles_df.line_id == lines_df.id, "left") \
            .select(
                vehicles_df["*"],
                lines_df["long_name"].alias("line_name"),
                # NOTE(review): the captured output suggests these are
                # municipality ids rather than names - confirm.
                lines_df["municipalities"].alias("municipality_name"),
            )

        self.load(df=enriched_df, format="parquet", path="/content/lake/gold/vehicles_enriched", partition_column="date")

    def analyze_data(self):
        """Aggregate the gold table per municipality and print two top-3 rankings.

        Each vehicle row is expanded into one row per municipality of its
        line; the rows are then grouped by municipality to compute the
        number of distinct vehicles, the summed speed and the average speed.
        """
        vehicles_enriched_df = self.extract_from_file(format="parquet", path="/content/lake/gold/vehicles_enriched")

        # One output row per (vehicle, municipality) pair. explode() drops
        # rows whose array is null or empty, i.e. vehicles without a
        # matching line.
        exploded_df = vehicles_enriched_df.withColumn(
            "municipality", F.explode(vehicles_enriched_df.municipality_name)
        )

        aggregated_df = exploded_df.groupBy("municipality") \
            .agg(
                # Distinct ids, so a vehicle appearing in several rows is
                # still counted as one vehicle.
                F.countDistinct("id").alias("vehicle_count"),
                F.sum("speed").alias("total_speed"),
                F.avg("speed").alias("average_speed"),
            )

        top_municipalities_by_vehicles = aggregated_df.orderBy(F.desc("vehicle_count")).limit(3)
        top_municipalities_by_speed = aggregated_df.orderBy(F.desc("average_speed")).limit(3)

        print("Top 3 Municipalities by Vehicle Routes:")
        top_municipalities_by_vehicles.show()

        print("Top 3 Municipalities with Higher Vehicle Speed on Average:")
        top_municipalities_by_speed.show()

if __name__ == '__main__':
    # Make sure the gold output directory exists before the pipeline runs.
    os.makedirs("/content/lake/gold/vehicles_enriched", exist_ok=True)

    # Create (or reuse) a local Spark session for the whole run.
    from pyspark.sql import SparkSession
    session_builder = SparkSession.builder.master('local').appName('ETL Program')
    spark = session_builder.getOrCreate()

    print("Starting ETL program")
    etl = ETLTask(spark)

    # Pipeline steps in execution order: (label shown in the log, step).
    pipeline = (
        ("Ingestion Vehicles", etl.ingestion_vehicles),
        ("Ingestion Lines", etl.ingestion_lines),
        ("Ingestion Municipalities", etl.ingestion_municipalities),
        ("Cleansing Vehicles", etl.cleansing_vehicles),
        ("Cleansing Lines", etl.cleansing_lines),
        ("Cleansing Municipalities", etl.cleansing_municipalities),
        ("Enrich Vehicles", etl.enrich),
        ("Analyze Data", etl.analyze_data),
    )
    for label, run_step in pipeline:
        print(f"Running Task - {label}")
        run_step()

    print("ETL program completed")


Starting ETL program
Running Task - Ingestion Vehicles
Running Task - Ingestion Lines
Running Task - Ingestion Municipalities
Running Task - Cleansing Vehicles
Running Task - Cleansing Lines
Running Task - Cleansing Municipalities
Running Task - Enrich Vehicles
Running Task - Analyze Data
Top 3 Municipalities by Vehicle Routes:
+------------+-------------+------------------+-----------------+
|municipality|vehicle_count|       total_speed|    average_speed|
+------------+-------------+------------------+-----------------+
|        1106|          376|2303.3333300352097|6.125886516051089|
|        1111|          242|1394.7222220301628|5.763314967066789|
|        1107|          195| 958.8888865113258|4.917378905186286|
+------------+-------------+------------------+-----------------+

Top 3 Municipalities with Higher Vehicle Speed on Average:
+------------+-------------+------------------+------------------+
|municipality|vehicle_count|       total_speed|     average_speed|
+------------+