<a href="https://colab.research.google.com/github/drmartins2/EDIT_DE/blob/main/spark/challenges/challenge_2_drm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 2
##  Implement CLEANSING process
- Set up path in the "lake"
  - !mkdir -p /content/lake/silver

- Read data from BRONZE layer as PARQUET:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities

- Transformations
  - vehicles
    - rename "lat" and "lon" to "latitude" and "longitude" respectively
    - remove possible duplicates
    - remove rows where the column CURRENT_STATUS is null
    - remove any corrupted record
  - lines
    - remove duplicates
    - remove rows where the column LONG_NAME is null
    - remove any corrupted record
  - municipalities
    - remove duplicates
    - remove rows where the columns NAME or DISTRICT_NAME are null
    - remove any corrupted record

- Write data as PARQUET into the SILVER layer (/content/lake/silver)
  - Partition "vehicles" by the "date" column (created during ingestion)
  - Paths:
    - vehicles - path: /content/lake/silver/vehicles
    - lines - path: /content/lake/silver/lines
    - municipalities - path: /content/lake/silver/municipalities

# Setting up PySpark

In [None]:
%pip install pyspark



In [None]:
from pyspark.sql import SparkSession

# Notebook-wide Spark session: local master, app name 'Challenge 2',
# Spark UI port set to 4050. getOrCreate() reuses an already running session.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Challenge 2')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [None]:
# Create folder for bronze layer.
# IPython shell escape; -p creates missing parent directories and does not
# fail when the folder already exists, so the cell can be re-run safely.
!mkdir -p /content/lake/bronze
print('Bronze layer created')

In [None]:
# Create folder for silver layer.
# IPython shell escape; -p creates missing parent directories and does not
# fail when the folder already exists, so the cell can be re-run safely.
!mkdir -p /content/lake/silver
print('Silver layer created')

# Define ETLFlow Class

In [None]:
import json

import requests
from pyspark.sql import DataFrame
from pyspark.sql.types import *

class ETLFlow:
    # Base class shared by the ETL tasks: holds the Spark session and offers
    # one helper to extract from an HTTP API and one to write a DataFrame.
    def __init__(self, spark: SparkSession) -> None:
        self.spark = spark

    # Extract data from API endpoint
    # Parameters:
    #   url: The API endpoint URL.
    #   schema: Optional schema to enforce on the JSON data
    #   timeout: Seconds to wait for the HTTP request before giving up.
    # Returns:
    #   DataFrame containing the extracted data.
    # Raises:
    #   requests.HTTPError when the endpoint answers with an error status.
    def ReadAPI(self, url: str, schema: StructType = None, timeout: float = 30.0) -> DataFrame:
        response = requests.get(url, timeout=timeout)  # Fetch data from the API
        response.raise_for_status()  # Do not ingest an error payload as data

        payload = response.json()
        if isinstance(payload, dict):
            # A single JSON object is one record; parallelizing the dict
            # itself would distribute its keys instead.
            payload = [payload]

        # spark.read.json expects an RDD of JSON *strings*. Python dicts would
        # be stringified as Python reprs (None/True/False), which is not valid
        # JSON, so every record is serialised explicitly first.
        records = [json.dumps(record) for record in payload]
        rdd = self.spark.sparkContext.parallelize(records)

        reader = self.spark.read
        if schema:
            reader = reader.schema(schema)  # Enforce the caller's schema
        return reader.json(rdd)


    # Load a DataFrame into storage in the specified format.
    # Existing data at the target path is overwritten.
    # Parameters:
    #   df: The DataFrame to be saved.
    #   format: The format to save the DataFrame in (e.g., parquet).
    #   path: The path where the DataFrame should be saved.
    #   partition_column: Optional column to partition the data by.
    def load(self, df: DataFrame, format: str, path: str, partition_column: str = None) -> None:
        # coalesce(1) collapses the data into one Spark partition so the
        # output is not scattered over many small files.
        writer = df.coalesce(1).write.mode("overwrite")
        if partition_column:
            # Partitioned output: one sub-directory per distinct column value
            writer = writer.partitionBy(partition_column)
        writer.format(format).save(path)

# ETL Task Ingestion for Bronze layer

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

class ETLTask_Bronze(ETLFlow):
    # Ingestion tasks for the bronze layer: each method reads one endpoint of
    # api.carrismetropolitana.pt and stores the result as parquet under
    # /content/lake/bronze.
    def __init__(self, spark: SparkSession) -> None:
        # The base class keeps the Spark session
        super().__init__(spark)


    # Vehicles ingestion.
    #   Reads the 'vehicles' endpoint with an explicit schema.
    #   Derives a 'date' column (yyyy-MM-dd) from 'timestamp'.
    #   Writes parquet partitioned by 'date'.
    def ingestion_vehicles(self):
        # (column name, Spark type) pairs; every column is nullable
        vehicle_fields = [
            ('bearing', IntegerType()),
            ('block_id', StringType()),
            ('current_status', StringType()),
            ('id', StringType()),
            ('lat', FloatType()),
            ('line_id', StringType()),
            ('lon', FloatType()),
            ('pattern_id', StringType()),
            ('route_id', StringType()),
            ('schedule_relationship', StringType()),
            ('shift_id', StringType()),
            ('speed', FloatType()),
            ('stop_id', StringType()),
            ('timestamp', TimestampType()),
            ('trip_id', StringType()),
        ]
        vehicle_schema = StructType(
            [StructField(name, dtype, True) for name, dtype in vehicle_fields]
        )

        # Pull the endpoint with the schema enforced
        raw_vehicles = self.ReadAPI(
            url="https://api.carrismetropolitana.pt/vehicles",
            schema=vehicle_schema,
        )

        # 'date' is the partition key used by the write below
        day_column = date_format(col("timestamp"), "yyyy-MM-dd")
        vehicles = raw_vehicles.withColumn("date", day_column)

        self.load(
            df=vehicles,
            format="parquet",
            path="/content/lake/bronze/vehicles",
            partition_column="date",
        )


    # Lines ingestion.
    #   Reads the 'lines' endpoint with an explicit schema that also carries
    #   a '_corrupt_record' column.
    #   Writes parquet without partitioning.
    def ingestion_lines(self):
        # (column name, Spark type) pairs; every column is nullable
        line_fields = [
            ('_corrupt_record', StringType()),
            ('color', StringType()),
            ('facilities', ArrayType(StringType())),
            ('id', StringType()),
            ('localities', ArrayType(StringType())),
            ('long_name', StringType()),
            ('municipalities', ArrayType(StringType())),
            ('patterns', ArrayType(StringType())),
            ('routes', ArrayType(StringType())),
            ('short_name', StringType()),
            ('text_color', StringType()),
        ]
        lines_schema = StructType(
            [StructField(name, dtype, True) for name, dtype in line_fields]
        )

        # Pull the endpoint with the schema enforced
        lines = self.ReadAPI(
            url="https://api.carrismetropolitana.pt/lines",
            schema=lines_schema,
        )

        self.load(df=lines, format="parquet", path="/content/lake/bronze/lines")


    # Municipalities ingestion.
    #   Reads the 'municipalities' endpoint with an explicit all-string schema.
    #   Writes parquet without partitioning.
    def ingestion_municipalities(self):
        # Every column is a nullable string, so only the names are listed
        municipality_columns = (
            'district_name',
            'id',
            'name',
            'prefix',
            'region_id',
            'region_name',
        )
        municipalities_schema = StructType(
            [StructField(name, StringType(), True) for name in municipality_columns]
        )

        # Pull the endpoint with the schema enforced
        municipalities = self.ReadAPI(
            url="https://api.carrismetropolitana.pt/municipalities",
            schema=municipalities_schema,
        )

        self.load(
            df=municipalities,
            format="parquet",
            path="/content/lake/bronze/municipalities",
        )

print('ETLTask class defined for specific API endpoint ingestion')

# ETL Task Cleansing for Silver layer

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

class ETLTask_Silver(ETLFlow):
    # Cleansing tasks for the silver layer: each method reads one bronze
    # dataset, cleans it, prints a preview and writes it as parquet under
    # /content/lake/silver.
    #
    # The transformation chains are wrapped in parentheses on purpose: a
    # backslash line continuation cannot be followed by a comment, so the
    # per-step comments require implicit line joining.
    def __init__(self, spark: SparkSession) -> None:
        super().__init__(spark)

    # Cleanse vehicles.
    #   Renames 'lat'/'lon', removes duplicates, rows without
    #   'current_status' and rows containing nulls.
    #   Writes parquet partitioned by 'date'.
    def cleanse_vehicles(self):
        # Read data from bronze layer
        df = self.spark.read.parquet("/content/lake/bronze/vehicles")

        # Apply vehicle transformations
        df = (
            df.withColumnRenamed("lat", "latitude")  # Rename "lat" to "latitude"
            .withColumnRenamed("lon", "longitude")  # Rename "lon" to "longitude"
            .dropDuplicates()  # Remove duplicate records
            .filter(col("current_status").isNotNull())  # Remove records where "current_status" is null
            # Remove corrupted records: drops every row with a null in any column.
            # NOTE(review): stricter than "corrupted" - confirm that rows with
            # legitimately optional fields should also be discarded.
            .na.drop()
        )

        df.show()

        # Write to silver layer
        self.load(df=df, format="parquet", path="/content/lake/silver/vehicles", partition_column="date")

    # Cleanse lines.
    #   Removes duplicates, rows flagged in '_corrupt_record', rows without
    #   'long_name' and rows containing nulls.
    #   Writes parquet without partitioning.
    def cleanse_lines(self):
        # Read data from bronze layer
        df = self.spark.read.parquet("/content/lake/bronze/lines")

        # Apply lines transformations
        df = (
            df.dropDuplicates()  # Remove duplicate records
            .filter(col("_corrupt_record").isNull())  # Remove records that have a value in "_corrupt_record"
            # The helper column must go before na.drop(): it is null for every
            # healthy row, so keeping it would discard all of them.
            .drop("_corrupt_record")
            .filter(col("long_name").isNotNull())  # Remove records where "long_name" is null
            .na.drop()  # Remove any remaining row with a null in any column
        )

        df.show()

        # Write to silver layer
        self.load(df=df, format="parquet", path="/content/lake/silver/lines")

    # Cleanse municipalities.
    #   Removes duplicates, rows without 'name' or 'district_name' and rows
    #   containing nulls.
    #   Writes parquet without partitioning.
    def cleanse_municipalities(self):
        # Read data from bronze layer
        df = self.spark.read.parquet("/content/lake/bronze/municipalities")

        # Apply municipalities transformations
        df = (
            df.dropDuplicates()  # Remove duplicate records
            # Keep only rows where both "name" and "district_name" are present
            .filter(col("name").isNotNull() & col("district_name").isNotNull())
            .na.drop()  # Remove any remaining row with a null in any column
        )

        df.show()

        # Write to silver layer
        self.load(df=df, format="parquet", path="/content/lake/silver/municipalities")

print('ETLTask class defined for bronze layer cleansing')

In [None]:
# Run the bronze (ingestion) stage: announce the program start, build the
# task object and execute one ingestion per API endpoint, in order.
print("Starting ETL program")
etl_bronze = ETLTask_Bronze(spark)

# (log line, task) pairs executed sequentially
bronze_tasks = (
    ("Running Task - Ingestion Vehicles", etl_bronze.ingestion_vehicles),
    ("Running Task - Ingestion Lines", etl_bronze.ingestion_lines),
    ("Running Task - Ingestion Municipalities", etl_bronze.ingestion_municipalities),
)
for banner, run_task in bronze_tasks:
    print(banner)
    run_task()

In [None]:
# The ingestion tasks are launched by the cell above; report their completion
# first so the log reads chronologically (previously this message was printed
# after the cleansing stage had already been announced).
print("Ingestion process completed")

# Initialize ETL Silver Process
print("Starting Cleansing process")
etl_silver = ETLTask_Silver(spark)

# Cleanse vehicle data
print("Running Task - Cleansing Vehicles")
etl_silver.cleanse_vehicles()

# Cleanse lines data
print("Running Task - Cleansing Lines")
etl_silver.cleanse_lines()

# Cleanse municipalities data
print("Running Task - Cleansing Municipalities")
etl_silver.cleanse_municipalities()

print("Cleansing process completed")

print("ETL program completed")