<a href="https://colab.research.google.com/github/drmartins2/EDIT_DE/blob/main/spark/challenges/challenge_1_drm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 1
##  Implement INGESTION process
- Set up path in the "lake"
  - !mkdir -p /content/lake/bronze

- Read data from API https://api.carrismetropolitana.pt/
  - Endpoints:
    - vehicles
    - lines
    - municipalities
  - Use StructFields to enforce schema

- Transformations
  - vehicles
    - create "date" extracted from "timestamp" column (format: date - yyyy-mm-dd or yyyymmdd)

- Write data as PARQUET into the BRONZE layer (/content/lake/bronze)
  - Partition "vehicles" by "date" column
  - Paths:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities
  - Make sure there is only 1 single parquet created
  - Use overwrite as write mode

# Setting up PySpark

In [None]:
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Challenge 1').config('spark.ui.port', '4050').getOrCreate()

In [2]:
!mkdir -p /content/lake/bronze
print('Bronze layer created')

Bronze layer created


# Define ETLFlow Class

In [3]:
from pyspark.sql import DataFrame
from pyspark.sql.types import *
import requests

class ETLFlow:
    def __init__(self, spark: SparkSession) -> None:
        self.spark = spark

    # Extract data from API endpoint
    # Parameters:
    #   url: The API endpoint URL.
    #   schema: Optional schema to enforce on the JSON data
    # Returns:
    #   DataFrame containing the extracted data.
    def ReadAPI(self, url: str, schema: StructType = None) -> DataFrame:
        response = requests.get(url)  # Fetch data from the API
        rdd = self.spark.sparkContext.parallelize(response.json())  # Convert JSON response to RDD
        if schema:
            df = self.spark.read.schema(schema).json(rdd)  # Read RDD into DataFrame with schema
        else:
            df = self.spark.read.json(rdd)  # Read RDD into DataFrame without schema
        return df


    # Load a DataFrame into storage in the specified format.
    # Parameters:
    #   df: The DataFrame to be saved.
    #   format: The format to save the DataFrame in (e.g., parquet).
    #   path: The path where the DataFrame should be saved.
    #   partition_column: Optional column to partition the data by.
    def load(self, df: DataFrame, format: str, path: str, partition_column: str = None) -> None:
        if partition_column:
            # Save as a single file, partitioned by the specified column
            df.coalesce(1).write.mode("overwrite").partitionBy(partition_column).format(format).save(path)
        else:
            # Save as a single file without partitioning
            df.coalesce(1).write.mode("overwrite").format(format).save(path)



Aux get endpoint schema

In [None]:
def ReadAPI(url: str, schema: StructType = None) -> DataFrame:
    response = requests.get(url)  # Fetch data from the API
    rdd = spark.sparkContext.parallelize(response.json())  # Convert JSON response to RDD
    if schema:
        df = spark.read.schema(schema).json(rdd)  # Read RDD into DataFrame with schema
    else:
        df = spark.read.json(rdd)  # Read RDD into DataFrame without schema
    return df

# Print the schema for a given endpoint
# Parameters:
#   df: The DataFrame whose schema is to be printed
#   endpoint: The name of the API endpoint
def print_schema(df: DataFrame, endpoint: str) -> None: # Removed self from the arguments
    print(f"\nSchema for {endpoint} endpoint:")
    df.printSchema()

# Vehicles
df = ReadAPI(url="https://api.carrismetropolitana.pt/vehicles", schema=None)
print_schema(df, "vehicles")

# Municipalities
df = ReadAPI(url="https://api.carrismetropolitana.pt/municipalities", schema=None)
print_schema(df, "municipalities")

# Lines
df = ReadAPI(url="https://api.carrismetropolitana.pt/lines", schema=None)
print_schema(df, "lines")


Schema for vehicles endpoint:
root
 |-- bearing: long (nullable = true)
 |-- block_id: string (nullable = true)
 |-- current_status: string (nullable = true)
 |-- id: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- line_id: string (nullable = true)
 |-- lon: double (nullable = true)
 |-- pattern_id: string (nullable = true)
 |-- route_id: string (nullable = true)
 |-- schedule_relationship: string (nullable = true)
 |-- shift_id: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- stop_id: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- trip_id: string (nullable = true)


Schema for municipalities endpoint:
root
 |-- district_id: string (nullable = true)
 |-- district_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- prefix: string (nullable = true)
 |-- region_id: string (nullable = true)
 |-- region_name: string (nullable = true)


Schema for lines endpoint:
root
 |-- _corrup

# Define ETLTask Class

In [6]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

class ETLTask(ETLFlow):
    def __init__(self, spark: SparkSession) -> None:
        # Initialize ETLTask by inheriting from ETLFlow
        super().__init__(spark)


    # Ingest vehicle data from the API and load it into the bronze layer.
    #   Extract data from 'vehicles' endpoint.
    #   Add 'date' column derived from 'timestamp'.
    #   Saves data as parquet, partitioned by 'date'.
    def ingestion_vehicles(self):
        # Define schema for vehicle
        vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True)
        ])

        # Extract data using defined schema
        df = self.ReadAPI(url="https://api.carrismetropolitana.pt/vehicles", schema=vehicle_schema)

        # Create "date" column from "timestamp"
        df = df.withColumn("date", date_format(col("timestamp"), "yyyy-MM-dd"))
        df.show()

        # Load data into the bronze layer, partitioned by "date"
        self.load(df=df, format="parquet", path="/content/lake/bronze/vehicles", partition_column="date")


    # Ingest line data from the API and load it into the bronze layer.
    #   Extracts data from 'lines' endpoint.
    #   Saves data as parquet without partitioning.
    def ingestion_lines(self):
        # Define schema for lines
        lines_schema = StructType([
            StructField('color', StringType(), True),
            StructField('facilities', ArrayType(StringType()), True),
            StructField('id', StringType(), True),
            StructField('localities', ArrayType(StringType()), True),
            StructField('long_name', StringType(), True),
            StructField('municipalities', ArrayType(StringType()), True),
            StructField('patterns', ArrayType(StringType()), True),
            StructField('routes', ArrayType(StringType()), True),
            StructField('short_name', StringType(), True),
            StructField('text_color', StringType(), True)
        ])

        # Extract data using defined schema
        df = self.ReadAPI(url="https://api.carrismetropolitana.pt/lines", schema=lines_schema)
        df.show()

        # Load data into the bronze layer
        self.load(df=df, format="parquet", path="/content/lake/bronze/lines")


    # Ingest municipality data from the API and load it into the bronze layer.
    #   Extracts data from 'municipalities' endpoint.
    #   Saves data as parquet without partitioning.
    def ingestion_municipalities(self):
       # Define schema for municipalities
       municipalities_schema = StructType([
           StructField('district_name', StringType(), True),
           StructField('id', StringType(), True),
           StructField('name', StringType(), True),
           StructField('prefix', StringType(), True),
           StructField('region_id', StringType(), True),
           StructField('region_name', StringType(), True)
       ])


       # Extract data using defined schema
       df = self.ReadAPI(url="https://api.carrismetropolitana.pt/municipalities", schema=municipalities_schema)
       df.show()

       # Load data into the bronze layer
       self.load(df=df, format="parquet", path="/content/lake/bronze/municipalities")

print('ETLTask class defined for specific API endpoint ingestion')

ETLTask class defined for specific API endpoint ingestion


# Execute ETL Process

In [7]:
# Initialize Spark session
spark = SparkSession.builder.master('local').appName('ETL Program').getOrCreate()

print("Starting ETL program")
etl = ETLTask(spark)

# Run tasks for each API endpoint

# Ingest vehicle data
print("Running Task - Ingestion Vehicles")
etl.ingestion_vehicles()

# Ingest line data
print("Running Task - Ingestion Lines")
etl.ingestion_lines()

# Ingest municipality data
print("Running Task - Ingestion Municipalities")
etl.ingestion_municipalities()

print("ETL program completed")

Starting ETL program
Running Task - Ingestion Vehicles
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|      date|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+
|    229|20241126-64010212...| IN_TRANSIT_TO|44|12607| 38.53006|   4441|-8.883835|  4441_0_1|  4441_0|            SCHEDULED|112380234560|      7.5| 160403|2024-11-26 15:59:21|4441_0_1|2600|153...|2024-11-26|
|    179|20241126-64010065...| IN_TRANSIT_TO|44|12551| 38.64447|   4602|-9.036845|  4602_0_1|  4602_0|           