### Step 1 - Start Spark Session

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import current_timestamp

# Build the Spark session used by every later cell; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("QualifyingIngestion")
    .getOrCreate()
)

### Step 2 - Read the JSON files using the Spark DataFrame Reader

In [8]:
# Explicit schema for the qualifying JSON records, so Spark does not have to
# infer column types from the data.
qualifying_schema = StructType(fields=[
    # Identifier and position columns are declared non-nullable.
    StructField("qualifyId", IntegerType(), False),
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), False),
    StructField("constructorId", IntegerType(), False),
    StructField("number", IntegerType(), False),
    StructField("position", IntegerType(), False),
    # The q1/q2/q3 values are read as strings. All three are declared nullable
    # so the declaration is consistent: q2 was previously the only one marked
    # non-nullable. Presumably a driver can be missing a time for any of the
    # sessions -- TODO confirm against the source data.
    StructField("q1", StringType(), True),
    StructField("q2", StringType(), True),
    StructField("q3", StringType(), True)
])

In [9]:
# Read every JSON file under the qualifying folder with the explicit schema.
# multiLine is enabled so a single JSON record may span several lines.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiLine", True)
    .json("../data/qualifying/")
)

### Step 3 - Rename columns and add new columns

In [10]:
# Rename the camelCase id columns to snake_case and add an ingestion_date
# column populated from current_timestamp().
# constructorId is renamed as well so that all four id columns follow the same
# naming convention in the written output (it was previously left in camelCase).
qualifying_df = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("ingestion_date", current_timestamp())
)

### Step 4 - Write the output to the processed container in Parquet format

In [11]:
# Save the DataFrame as Parquet; "overwrite" mode replaces any data already
# present at the target path.
(
    qualifying_df.write
    .mode("overwrite")
    .parquet("../processed_data/qualifying")
)