### Step 1: Start the Spark Session

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook under the application name
# "CircuitsIngestion" (getOrCreate reuses an active session if one exists).
spark = (
    SparkSession.builder
    .appName("CircuitsIngestion")
    .getOrCreate()
)

23/12/27 18:37:09 WARN Utils: Your hostname, falcao-sys resolves to a loopback address: 127.0.1.1; using 192.168.11.185 instead (on interface wlx7898e8c12476)
23/12/27 18:37:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/27 18:37:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Step 2: Define a Data Schema and Read the CSV file using the Spark Dataframe Reader

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Explicit schema for circuits.csv, built one field at a time.
# Only circuitId is declared non-nullable; every other column may be null.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [3]:
# Load the circuits CSV. The first line of the file is treated as a header,
# and column types come from circuits_schema instead of schema inference.
circuits_df = (
    spark.read
    .option("header", True)
    .schema(circuits_schema)
    .csv("../data/circuits.csv")
)

In [4]:
# Preview the first 20 rows (the default) without truncating long cell values.
circuits_df.show(n=20, truncate=False)

+---------+--------------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+
|circuitId|circuitRef    |name                          |location    |country  |lat     |lng      |alt|url                                                              |
+---------+--------------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+
|1        |albert_park   |Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit        |
|2        |sepang        |Sepang International Circuit  |Kuala Lumpur|Malaysia |2.76083 |101.738  |18 |http://en.wikipedia.org/wiki/Sepang_International_Circuit        |
|3        |bahrain       |Bahrain International Circuit |Sakhir      |Bahrain  |26.0325 |50.5106  |7  |http://en.wikipedia.org/wiki/Bahrain_Internatio

### Step 3: Selecting only the required columns

In [5]:
from pyspark.sql.functions import col

In [6]:
# Keep only the listed columns (the schema's `url` column is not among them),
# referencing each one through col().
circuits_df = circuits_df.select(
    *[
        col(column_name)
        for column_name in (
            "circuitId",
            "circuitRef",
            "name",
            "location",
            "country",
            "lat",
            "lng",
            "alt",
        )
    ]
)

In [7]:
# Same column selection as the previous cell, this time referencing each
# column by indexing the dataframe itself (circuits_df["name"]).
circuits_df = circuits_df.select(
    *[
        circuits_df[column_name]
        for column_name in (
            "circuitId",
            "circuitRef",
            "name",
            "location",
            "country",
            "lat",
            "lng",
            "alt",
        )
    ]
)

### Step 4: Renaming the required columns

In [8]:
# Switch the camelCase / abbreviated source column names to descriptive
# snake_case names; columns not listed here keep their current names.
circuits_df = (
    circuits_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)

In [9]:
# Preview the first 20 rows (the default) without truncating long cell values.
circuits_df.show(n=20, truncate=False)

+----------+--------------+------------------------------+------------+---------+--------+---------+--------+
|circuit_id|circuit_ref   |name                          |location    |country  |latitude|longitude|altitude|
+----------+--------------+------------------------------+------------+---------+--------+---------+--------+
|1         |albert_park   |Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10      |
|2         |sepang        |Sepang International Circuit  |Kuala Lumpur|Malaysia |2.76083 |101.738  |18      |
|3         |bahrain       |Bahrain International Circuit |Sakhir      |Bahrain  |26.0325 |50.5106  |7       |
|4         |catalunya     |Circuit de Barcelona-Catalunya|Montmeló    |Spain    |41.57   |2.26111  |109     |
|5         |istanbul      |Istanbul Park                 |Istanbul    |Turkey   |40.9517 |29.405   |130     |
|6         |monaco        |Circuit de Monaco             |Monte-Carlo |Monaco   |43.7347 |7.42056  |7       |
|7        

### Step 5: Add an ingestion date to the dataframe

In [10]:
from pyspark.sql.functions import current_timestamp

# Append an "ingestion_date" column populated by current_timestamp().
circuits_df = circuits_df.withColumn(
    "ingestion_date",
    current_timestamp(),
)

### Step 6: Write the data to the data lake as Parquet

In [12]:
# Persist the dataframe as Parquet under parquet/circuits, replacing any
# output that already exists at that path.
circuits_df.write.parquet("parquet/circuits", mode="overwrite")

                                                                                