### Step 1 - Start Spark Session

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import concat, lit, current_timestamp

# Obtain the Spark session for this notebook. getOrCreate() hands back an
# already-running session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName("DriversIngestion")
    .getOrCreate()
)

23/12/28 12:38:09 WARN Utils: Your hostname, falcao-sys resolves to a loopback address: 127.0.1.1; using 192.168.11.185 instead (on interface wlx7898e8c12476)
23/12/28 12:38:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/28 12:38:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/28 12:38:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Step 2 - Read the JSON file using the Spark DataFrame Reader

In [2]:
# Layout of the nested "name" object: two optional string fields.
name_schema = StructType([
    StructField("forename", StringType(), nullable=True),
    StructField("surname", StringType(), nullable=True),
])

# Explicit schema applied when reading the drivers file, so Spark does not
# have to infer column types. Fields not listed here are not loaded.
# NOTE(review): DateType is imported at the top of the notebook but no date
# field is declared here, and driverId is typed as a string -- confirm both
# are intentional.
drives_schema = StructType([
    StructField("driverId", StringType(), nullable=False),
    StructField("driverRef", StringType(), nullable=True),
    StructField("number", IntegerType(), nullable=True),
    StructField("code", StringType(), nullable=True),
    StructField("name", name_schema, nullable=True),
    StructField("nationality", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
])

In [3]:
# Load the drivers JSON file with the explicit schema defined above.
drives_df = spark.read.json("../data/drivers.json", schema=drives_schema)

### Step 3 - Rename columns and add new columns

In [4]:
# Build "<forename> <surname>" from the nested name struct. The expression is
# created from the frame as it is *before* the reassignment below.
full_name = concat(
    drives_df["name.forename"],
    lit(" "),
    drives_df["name.surname"],
)

# Rename the id/ref columns to snake_case, stamp each row with the ingestion
# time, and replace the struct-typed "name" column with the flattened string.
# Gotcha: this overwrites drives_df, so re-running the cell without re-running
# the read cell fails -- "name" is then a plain string with no forename field.
drives_df = (
    drives_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("name", full_name)
)

In [5]:
# Preview the transformed frame: first 20 rows, long cell values truncated
# (these are the defaults of show(), spelled out here).
drives_df.show(n=20, truncate=True)

+---------+----------+------+----+------------------+-----------+--------------------+--------------------+
|driver_id|driver_ref|number|code|              name|nationality|                 url|      ingestion_date|
+---------+----------+------+----+------------------+-----------+--------------------+--------------------+
|        1|  hamilton|    44| HAM|    Lewis Hamilton|    British|http://en.wikiped...|2023-12-28 12:38:...|
|        2|  heidfeld|  NULL| HEI|     Nick Heidfeld|     German|http://en.wikiped...|2023-12-28 12:38:...|
|        3|   rosberg|     6| ROS|      Nico Rosberg|     German|http://en.wikiped...|2023-12-28 12:38:...|
|        4|    alonso|    14| ALO|   Fernando Alonso|    Spanish|http://en.wikiped...|2023-12-28 12:38:...|
|        5|kovalainen|  NULL| KOV| Heikki Kovalainen|    Finnish|http://en.wikiped...|2023-12-28 12:38:...|
|        6|  nakajima|  NULL| NAK|   Kazuki Nakajima|   Japanese|http://en.wikiped...|2023-12-28 12:38:...|
|        7|  bourdais|  NULL

### Step 4 - Drop unwanted columns

In [6]:
# Remove the "url" column, which is not needed downstream.
# Drop by column *name* rather than by a drives_df["url"] Column object:
# dropping by name is a no-op when the column is already gone, so this cell
# can be re-run safely even though it overwrites drives_df. Looking up
# drives_df["url"] on the already-trimmed frame would raise instead.
drives_df = drives_df.drop("url")

### Step 5 - Write the output to the processed container in Parquet format

In [8]:
# Persist the final frame as Parquet, replacing any output left by a
# previous run at the same location.
drives_df.write.parquet("../processed_data/drives", mode="overwrite")