### Ingest Drivers data from bronze to silver

0. Import configuration notebook and widgets

In [0]:
%run ../Includes/Configuration

In [0]:
# Declare a free-text notebook widget named 'p_data_source' (empty default,
# labelled 'Data Source'), then capture its current value so later cells can
# stamp it onto the output rows.
dbutils.widgets.text(
    'p_data_source',
    '',
    'Data Source',
)
v_data_source = dbutils.widgets.get('p_data_source')

1. Imports

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col, current_timestamp, concat, lit

2. Create schema

In [0]:
# Explicit schema applied when reading drivers.json.
# Each .add() call is (field name, data type, nullable flag).
# "name" is a nested struct holding the forename and surname strings.
# NOTE(review): whether Spark enforces nullable=False when reading JSON files
# is not visible from this notebook -- confirm before relying on it.
drivers_schema = (
    StructType()
    .add("driverId", IntegerType(), False)
    .add("driverRef", StringType(), False)
    .add("number", IntegerType(), True)
    .add("code", StringType(), True)
    .add(
        "name",
        StructType()
        .add('forename', StringType(), False)
        .add('surname', StringType(), False),
        False,
    )
    .add('dob', DateType(), True)
    .add('nationality', StringType(), True)
    .add('url', StringType(), False)
)

3. Read drivers.json file

In [0]:
# Load drivers.json using the explicit schema declared above instead of
# relying on schema inference.
# `bronze_container_path` is not defined in this notebook; presumably it is
# provided by the Configuration notebook pulled in via %run -- confirm.
drivers_raw_df = (
    spark.read
    .schema(drivers_schema)
    .json(f"{bronze_container_path}/drivers.json")
)

4. Transform the drivers dataframe

In [0]:
# Shape the raw drivers rows into the silver-layer layout in one projection:
#   * camelCase identifiers are renamed to snake_case,
#   * the nested name struct is flattened into a single "forename surname"
#     string column called `name`,
#   * `url` is intentionally not carried forward,
#   * an ingestion timestamp and the widget-supplied data source are appended.
# Output column order: driver_id, driver_ref, number, code, name, dob,
# nationality, ingestion_timestamp, data_source.
drivers_df = drivers_raw_df.select(
    col("driverId").alias("driver_id"),
    col("driverRef").alias("driver_ref"),
    col("number"),
    col("code"),
    # concat() yields NULL when either part of the name is NULL.
    concat(col("name.forename"), lit(" "), col("name.surname")).alias('name'),
    col("dob"),
    col("nationality"),
    current_timestamp().alias('ingestion_timestamp'),
    lit(v_data_source).alias("data_source"),
)

5. Write the dataframe to silver layer

In [0]:
# Persist the transformed drivers as Parquet under the silver path, replacing
# whatever a previous run wrote there ('overwrite' mode).
# `silver_container_path` is not defined in this notebook; presumably it comes
# from the Configuration notebook pulled in via %run -- confirm.
(
    drivers_df.write
    .format("parquet")
    .mode('overwrite')
    .save(f"{silver_container_path}/drivers")
)