### Step 1 - Start Spark Session

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.functions import current_timestamp

# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("ResultsIngestion")
    .getOrCreate()
)

### Step 2 - Read the JSON file using the Spark DataFrame Reader

In [8]:
# Explicit schema for results.json, so Spark does not have to infer types.
# Each StructField is (source field name, Spark type, nullable flag).
#
# NOTE: positionText is declared as a string and positionOrder as an integer.
# positionText is the textual form of the finishing position (presumably it can
# hold non-numeric markers - verify against the data), which an IntegerType
# cannot parse; positionOrder is the numeric ordering and must be an integer
# so that it sorts numerically rather than lexicographically.
results_schema = StructType(fields=[
    StructField("resultId", IntegerType(), False),
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), False),
    StructField("constructorId", IntegerType(), False),
    StructField("number", IntegerType(), True),
    StructField("grid", IntegerType(), False),
    StructField("position", IntegerType(), True),
    StructField("positionText", StringType(), False),
    StructField("positionOrder", IntegerType(), False),
    StructField("points", FloatType(), False),
    StructField("laps", IntegerType(), False),
    StructField("time", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
    StructField("fastestLap", IntegerType(), True),
    StructField("rank", IntegerType(), True),
    StructField("fastestLapTime", StringType(), True),
    StructField("fastestLapSpeed", StringType(), True),
    StructField("statusId", IntegerType(), False),
])

In [9]:
# Load the results file with the explicit schema instead of letting Spark infer one.
results_df = spark.read.json("../data/results.json", schema=results_schema)

### Step 3 - Add new columns, Rename and Drop

In [10]:
# Stamp each row with the timestamp Spark evaluates for this ingestion run.
results_df = results_df.withColumn("ingestion_date", current_timestamp())

In [11]:
# statusId is not carried into the processed output; drop it by column name.
results_df = results_df.drop("statusId")

In [12]:
# Convert the camelCase source column names to snake_case.
# Columns that are not listed keep their current names.
results_df = (
    results_df
    .withColumnRenamed("resultId", "result_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("positionText", "position_text")
    .withColumnRenamed("positionOrder", "position_order")
    .withColumnRenamed("fastestLap", "fastest_lap")
    .withColumnRenamed("fastestLapTime", "fastest_lap_time")
    .withColumnRenamed("fastestLapSpeed", "fastest_lap_speed")
)

### Step 4 - Write the output to the processed container in Parquet format

In [13]:
# Persist the result as Parquet, partitioned by race_id; "overwrite" replaces
# whatever already exists at the target path.
results_df.write.parquet(
    "../processed_data/results",
    mode="overwrite",
    partitionBy="race_id",
)

                                                                                