### Step 1 - Load shared configurations and common functions, then start the Spark session

In [2]:
%run "../includes/configurations"

In [3]:
%run "../includes/common_functions"

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Get the active Spark session, or create one if none exists yet,
# using "CircuitsIngestion" as the application name.
spark = (
    SparkSession.builder
    .appName("CircuitsIngestion")
    .getOrCreate()
)

23/12/29 10:57:02 WARN Utils: Your hostname, falcao-sys resolves to a loopback address: 127.0.1.1; using 192.168.11.185 instead (on interface wlx7898e8c12476)
23/12/29 10:57:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/29 10:57:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Step 2 - Define the data schema and read the CSV file with the Spark DataFrame reader

In [5]:
# Explicit schema for circuits.csv, so column types are declared up front
# instead of being inferred. Each entry is (column name, Spark type, nullable).
circuits_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("circuitId", IntegerType(), False),
        ("circuitRef", StringType(), True),
        ("name", StringType(), True),
        ("location", StringType(), True),
        ("country", StringType(), True),
        ("lat", DoubleType(), True),
        ("lng", DoubleType(), True),
        ("alt", IntegerType(), True),
        ("url", StringType(), True),
    )
])

In [6]:
# Read circuits.csv using the explicit schema and treating the first line of
# the file as a header row.
# NOTE(review): data_folder_path is not defined in this notebook; presumably it
# comes from the "../includes/configurations" include - confirm.
circuits_df = spark.read.csv(
    f"{data_folder_path}/circuits.csv",
    schema=circuits_schema,
    header=True,
)

### Step 3 - Rename columns, drop unneeded columns, and add the ingestion date column

In [9]:
# Switch the camelCase identifiers to snake_case and spell out the
# abbreviated coordinate columns (lat/lng/alt).
circuits_df = (
    circuits_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)

In [8]:
# Drop the url column by name. Passing the name as a string (instead of a
# Column object looked up with circuits_df["url"]) keeps this cell safe to
# re-run: drop() ignores names that are not in the schema, whereas resolving
# circuits_df["url"] raises once the column has already been removed.
circuits_df = circuits_df.drop("url")

In [10]:
# Pipe the DataFrame through the add_ingestion_date helper.
# NOTE(review): add_ingestion_date is not defined in this notebook; presumably
# it comes from the "../includes/common_functions" include - confirm. The
# output captured for this step shows an ingestion_date timestamp column.
circuits_df = circuits_df.transform(add_ingestion_date)

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|      ingestion_date|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|2023-12-29 10:57:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|2023-12-29 10:57:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|2023-12-29 10:57:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|2023-12-29 10:57:...|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|2023-12-29 10:57:...|
|         6|        monaco|   Ci

In [7]:

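# Alternative to dropping `url` in Step 3, kept for reference only (not executed): select just the needed columns.
# It refers to the original CSV column names, so it would have to run before the rename, and it needs `col` in scope.
#circuits_df = circuits_df.select(col("circuitId"), col("circuitRef"), col("name"), col("location"), col("country"), col("lat"), col("lng"), col("alt"))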

### Step 4 - Write the data to the data lake as Parquet

In [11]:
# Persist the processed circuits data as Parquet, replacing any output
# already present at the target path.
# NOTE(review): processed_data_folder_path is not defined in this notebook;
# presumably it comes from the "../includes/configurations" include - confirm.
circuits_df.write.parquet(
    f"{processed_data_folder_path}/circuits",
    mode="overwrite",
)

                                                                                