### Step 1 - Start Spark Session and Include additional configurations and common functions

In [1]:
%run "../includes/configurations"

In [2]:
%run "../includes/common_functions"

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import concat, lit, current_timestamp

# Get the active Spark session, or create one with the application name
# "DriversIngestion" if none exists yet.
spark = (
    SparkSession.builder
    .appName("DriversIngestion")
    .getOrCreate()
)

23/12/29 11:22:58 WARN Utils: Your hostname, falcao-sys resolves to a loopback address: 127.0.1.1; using 192.168.11.185 instead (on interface wlx7898e8c12476)
23/12/29 11:22:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/29 11:22:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Step 2 - Read the JSON file using the Spark DataFrame Reader

In [4]:
# Nested struct describing the "name" object: two nullable string fields.
name_schema = (
    StructType()
    .add("forename", StringType(), True)
    .add("surname", StringType(), True)
)

# Top-level schema applied when reading the drivers JSON file.
# "driverId" is the only field declared non-nullable; "name" is the nested
# struct defined above and keeps the default nullability (True).
drives_schema = (
    StructType()
    .add("driverId", IntegerType(), False)
    .add("driverRef", StringType(), True)
    .add("number", IntegerType(), True)
    .add("code", StringType(), True)
    .add("name", name_schema)
    .add("nationality", StringType(), True)
    .add("url", StringType(), True)
)

In [5]:
# Load the drivers JSON file with the explicit schema instead of letting
# Spark infer one.
drives_df = spark.read.json("../data/drivers.json", schema=drives_schema)

### Step 3 - Rename and drop columns, and add new columns

In [6]:
# Source camelCase column names paired with their snake_case replacements.
drives_column_mapping = dict(
    driverId="driver_id",
    driverRef="driver_ref",
)

# rename_columns is not defined in this notebook; presumably it comes from the
# %run includes and renames each key column to its mapped value — TODO confirm.
drives_df = rename_columns(drives_df, drives_column_mapping)


In [7]:
# Columns to remove from the DataFrame before it is written out.
columns_to_drop = ["url"]
# drop_columns is not defined in this notebook; presumably provided by the
# %run includes — TODO confirm its exact behavior.
drives_df = drop_columns(drives_df, columns_to_drop)

In [8]:
# add_ingestion_date is not defined in this notebook; presumably it comes from
# the %run includes and appends an ingestion timestamp column — TODO confirm.
drives_df = add_ingestion_date(drives_df)

# Replace the nested "name" struct with one string: forename, a space, surname.
name_parts = [
    drives_df["name.forename"],
    lit(" "),
    drives_df["name.surname"],
]
drives_df = drives_df.withColumn("name", concat(*name_parts))

### Step 4 - Write the output to the processed container in Parquet format

In [9]:
# Write the result as Parquet, overwriting any existing output at this path.
drives_df.write.parquet("../processed_data/drives", mode="overwrite")

                                                                                