### Ingest races data from bronze to silver

Import configuration notebook and widgets

In [0]:
%run ../Includes/Configuration

In [0]:
# Declare the `p_data_source` text widget (empty default, labelled "Data Source")
# and capture its current value for use further down in the notebook.
dbutils.widgets.text("p_data_source", "", "Data Source")
v_data_source = dbutils.widgets.get("p_data_source")

1. Imports

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col, current_timestamp, to_timestamp, lit, concat, when

2. Create schema

In [0]:
# Explicit schema for races.csv, built field by field.
# Only `time` and `url` are declared nullable; every other column is declared required.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), nullable=False)
    .add("year", IntegerType(), nullable=False)
    .add("round", IntegerType(), nullable=False)
    .add("circuitId", IntegerType(), nullable=False)
    .add("name", StringType(), nullable=False)
    .add("date", DateType(), nullable=False)
    .add("time", StringType(), nullable=True)
    .add("url", StringType(), nullable=True)
)

3. Read races.csv file

In [0]:
# Load races.csv from the bronze container, treating the first line as a header
# and applying the explicit schema instead of inferring one.
# NOTE(review): `bronze_container_path` is presumably provided by the
# Configuration notebook run at the top — confirm.
races_raw_df = spark.read.csv(
    f"{bronze_container_path}/races.csv",
    schema=races_schema,
    header=True,
)

4. Transform the races dataframe

In [0]:
# Build the silver-layer races dataframe:
#   * keep the race columns, renaming the camelCase ones to snake_case
#   * combine `date` and `time` into a single `race_timestamp`
#   * stamp every row with the ingestion time and the `p_data_source` widget value
#   * drop the now-redundant `date` and `time` columns
#
# A race without a usable time is treated as starting at 00:00:00. "Unusable"
# means the literal '\N' marker, an empty string, or NULL. NULL has to be tested
# explicitly: an `==` comparison against NULL never evaluates to true, so the row
# would fall through to `otherwise(col("time"))`, and concat() returns NULL when
# any input is NULL, leaving `race_timestamp` NULL instead of midnight.
races_df = (
    races_raw_df
    .select(
        col("raceId").alias("race_id"),
        col("year").alias("race_year"),
        col("round"),
        col("circuitId").alias("circuit_id"),
        col("name"),
        col("date"),
        col("time"),
    )
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(
                col("date"),
                lit(" "),
                when(
                    col("time").isNull() | (col("time") == "\\N") | (col("time") == ""),
                    lit("00:00:00"),
                ).otherwise(col("time")),
            ),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit(v_data_source))
    .drop("date", "time")
)

5. Write the dataframe to silver layer

In [0]:
# Persist the transformed races as Parquet in the silver container,
# one partition directory per `race_year`, replacing any previous output.
(
    races_df.write
    .partitionBy("race_year")
    .mode("overwrite")
    .parquet(f"{silver_container_path}/races")
)