### Step 1 - Load shared configuration and common functions, then start the Spark session

In [11]:
%run "../includes/configurations"

In [12]:
%run "../includes/common_functions"

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-active session if there is one, otherwise it builds a new session
# named "CircuitsIngestion".
spark = (
    SparkSession.builder
    .appName("CircuitsIngestion")
    .getOrCreate()
)

### Step 2 - Define the data schema and read the CSV file using the Spark DataFrame reader

In [14]:
# Explicit column names and types for the circuits CSV, listed in file order.
# Each .add(name, type, nullable) appends one StructField; only circuitId is
# declared non-nullable.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [15]:
# Load circuits.csv with the explicit circuits_schema; the first row of the
# file is treated as a header rather than as data.
circuits_df = (
    spark.read
    .schema(circuits_schema)
    .option("header", True)
    .csv(f"{data_folder_path}/circuits.csv")
)

### Step 3 - Rename and drop columns, and add new columns

In [16]:
# Source column name -> target column name. Columns not listed here
# (name, location, country, url) are not part of the mapping.
circuits_column_mapping = dict(
    circuitId="circuit_id",
    circuitRef="circuit_ref",
    lat="latitude",
    lng="longitude",
    alt="altitude",
)

# Apply the old-name -> new-name mapping to the DataFrame.
# NOTE(review): rename_columns is not defined in this notebook; presumably it
# is provided by ../includes/common_functions - confirm its exact behavior there.
circuits_df = rename_columns(circuits_df, circuits_column_mapping)

In [17]:
# Drop the url column by name. DataFrame.drop with a column-name string is a
# no-op when the column is absent, so this cell can be re-run safely; the
# previous form, circuits_df["url"], fails to resolve once the column has
# already been removed from circuits_df.
circuits_df = circuits_df.drop("url")

In [18]:
# NOTE(review): add_ingestion_date is not defined in this notebook; presumably
# it comes from ../includes/common_functions and adds an ingestion-date column
# to the DataFrame - TODO confirm the column name and type there.
circuits_df = add_ingestion_date(circuits_df)

In [19]:

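# NOTE: leftover alternative to the drop() cell above - select the wanted columns explicitly instead of dropping "url".
# As written it is stale: it uses the pre-rename column names, and `col` is not imported in the visible cells.
#circuits_df = circuits_df.select(col("circuitId"), col("circuitRef"), col("name"), col("location"), col("country"), col("lat"), col("lng"), col("alt"))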

### Step 4 - Write the data to the data lake as Parquet

In [20]:
# Persist the processed circuits data as Parquet under the processed-data
# folder; "overwrite" mode replaces whatever already exists at that path.
(
    circuits_df.write
    .mode("overwrite")
    .parquet(f"{processed_data_folder_path}/circuits")
)

                                                                                