<a href="https://colab.research.google.com/github/ducline/edit-data_processing/blob/main/spark/challenges/challenge_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 3
##  Implement ENRICH process
- Set up path in the "lake"
  - !mkdir -p /content/lake/gold

- Read data from SILVER layer
  - Paths:
    - vehicles - path: /content/lake/silver/vehicles
    - lines - path: /content/lake/silver/lines
    - municipalities - path: /content/lake/silver/municipalities
  - Use StructFields to enforce schema

- Enrichment
  - Enrich vehicles dataset with information from the line and municipalities
    - join vehicles with lines and municipalities
      - select all columns from vehicles + lines.long_name (name: line_name, format:string) + municipalities.name (name: municipality_name, format: array)
      - Note that "municipalities.name" is an array

- Write data as PARQUET into the GOLD layer (/content/lake/gold)
  - Dataset name: vehicles_enriched
  - Partition "vehicles_enriched" by "date" column
  - Paths:
    - vehicles_enriched - path: /content/lake/gold/vehicles_enriched
  - Make sure only a single parquet file is created
  - Use overwrite as write mode

# Setting up PySpark

In [1]:
%pip install pyspark



In [6]:
!mkdir -p /content/lake/silver/municipalities

In [5]:
!mkdir -p /content/lake/silver/lines

In [3]:
!mkdir -p /content/lake/silver/vehicles

In [7]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, expr
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

class ETLFlow:
    """Generic extract/load helpers built around a Spark session."""

    def __init__(self, spark: SparkSession) -> None:
        """Keep a reference to the Spark session used for reading data."""
        self.spark = spark

    def extract(self, format: str, path: str, schema: StructType = None) -> DataFrame:
        """Read the dataset stored at ``path`` into a DataFrame.

        Args:
            format: Data source format passed to the Spark reader
                (for example ``"parquet"``).
            path: Location of the dataset to load.
            schema: Optional schema to enforce on read. When omitted, the
                reader determines the schema itself.

        Returns:
            The loaded DataFrame.
        """
        reader = self.spark.read.format(format)
        # Compare with None explicitly: StructType implements __len__, so a
        # plain truthiness test would silently discard an empty schema.
        if schema is not None:
            reader = reader.schema(schema)
        return reader.load(path)

    def load(self, df: DataFrame, format: str, path: str, partition_column: str = None, **kwargs) -> None:
        """Write ``df`` to ``path``, replacing any data already there.

        The DataFrame is coalesced to a single partition before writing so
        that each output directory receives at most one part file.

        Args:
            df: DataFrame to persist.
            format: Output format passed to the Spark writer.
            path: Destination directory.
            partition_column: Optional column used to partition the output
                directory layout.
            **kwargs: Extra writer options, forwarded to
                ``DataFrameWriter.options`` (for example
                ``compression="snappy"``).
        """
        writer = df.coalesce(1).write.mode("overwrite")
        # Forward caller-supplied options instead of silently dropping them.
        if kwargs:
            writer = writer.options(**kwargs)
        if partition_column:
            writer = writer.partitionBy(partition_column)
        writer.format(format).save(path)

class EnrichETLTask(ETLFlow):
    """ENRICH step: join SILVER vehicles with lines and municipalities into GOLD."""

    # Column names of the SILVER vehicles dataset, in schema order.
    # Every column is read as a nullable string.
    _VEHICLE_COLUMNS = (
        'bearing',
        'block_id',
        'current_status',
        'id',
        'lat',
        'line_id',
        'lon',
        'pattern_id',
        'route_id',
        'schedule_relationship',
        'shift_id',
        'speed',
        'stop_id',
        'timestamp',
        'trip_id',
        'date',
    )

    VEHICLES_SCHEMA = StructType(
        [StructField(name, StringType(), True) for name in _VEHICLE_COLUMNS]
    )

    LINES_SCHEMA = StructType([
        StructField('id', StringType(), True),
        StructField('long_name', StringType(), True),
    ])

    MUNICIPALITIES_SCHEMA = StructType([
        StructField('id', StringType(), True),
        StructField('name', ArrayType(StringType()), True),
    ])

    def __init__(self, spark: SparkSession) -> None:
        """Initialise the task with the Spark session used for all reads."""
        super().__init__(spark)

    def enrich_vehicles(
        self,
        silver_path: str = "/content/lake/silver",
        gold_path: str = "/content/lake/gold",
    ) -> None:
        """Build the ``vehicles_enriched`` dataset and write it to the GOLD layer.

        Reads ``vehicles``, ``lines`` and ``municipalities`` as parquet from
        ``silver_path`` with enforced schemas, left-joins lines and
        municipalities onto vehicles, and writes all vehicle columns plus
        ``line_name`` and ``municipality_name`` as parquet partitioned by
        ``date`` under ``<gold_path>/vehicles_enriched``.

        Args:
            silver_path: Root directory of the SILVER layer. The default
                matches the original hard-coded location.
            gold_path: Root directory of the GOLD layer. The default matches
                the original hard-coded location.
        """
        # Tolerate a trailing slash so the joined paths stay well-formed.
        silver_root = silver_path.rstrip("/")
        gold_root = gold_path.rstrip("/")

        # Read datasets from the SILVER layer with enforced schemas.
        vehicles_df = self.extract(
            format="parquet",
            path=f"{silver_root}/vehicles",
            schema=self.VEHICLES_SCHEMA,
        )
        lines_df = self.extract(
            format="parquet",
            path=f"{silver_root}/lines",
            schema=self.LINES_SCHEMA,
        )
        municipalities_df = self.extract(
            format="parquet",
            path=f"{silver_root}/municipalities",
            schema=self.MUNICIPALITIES_SCHEMA,
        )

        # Left joins keep every vehicle row even when no match is found.
        # NOTE(review): municipalities are matched on the vehicle's line_id,
        # the same key used for lines - confirm that municipalities.id really
        # shares the line-id key space in the SILVER data.
        enriched_df = (
            vehicles_df
            .join(lines_df, vehicles_df.line_id == lines_df.id, "left")
            .join(municipalities_df, vehicles_df.line_id == municipalities_df.id, "left")
            .select(
                vehicles_df["*"],
                lines_df["long_name"].alias("line_name"),
                municipalities_df["name"].alias("municipality_name"),
            )
        )

        # Write the enriched dataset to the GOLD layer, partitioned by date.
        self.load(
            df=enriched_df,
            format="parquet",
            path=f"{gold_root}/vehicles_enriched",
            partition_column="date",
        )

if __name__ == "__main__":
    # Entry point: obtain a local-mode Spark session and run the ENRICH task.
    spark = (
        SparkSession.builder
        .master("local")
        .appName("Enrich ETL Program")
        .getOrCreate()
    )

    print("Starting ENRICH ETL process")
    etl = EnrichETLTask(spark)

    # Single task in this flow: enrich vehicles and write the result.
    print("Running Task - Enrich Vehicles")
    etl.enrich_vehicles()

    print("ENRICH ETL process completed")


Starting ENRICH ETL process
Running Task - Enrich Vehicles
ENRICH ETL process completed
