### Step 1 - Start Spark Session

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.functions import current_timestamp

# Initialize a Spark session
spark = SparkSession.builder.appName("LapTimes").getOrCreate()

### Step 2 - Read the foler with lap times files using the Spark Dataframe Reader

In [9]:
lap_times_schema = StructType(fields=[
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), False),
    StructField("position", IntegerType(), False),
    StructField("lap", IntegerType(), False),
    StructField("time", StringType(), True),
    StructField("milliseconds", IntegerType(), True)
])

In [10]:
lap_times_df = spark.read\
    .schema(lap_times_schema)\
    .csv("../data/lap_times/lap_times_split*.csv")

In [11]:
lap_times_df.show()

+------+--------+--------+---+--------+------------+
|raceId|driverId|position|lap|    time|milliseconds|
+------+--------+--------+---+--------+------------+
|   841|      20|       1|  1|1:38.109|       98109|
|   841|      20|       2|  1|1:33.006|       93006|
|   841|      20|       3|  1|1:32.713|       92713|
|   841|      20|       4|  1|1:32.803|       92803|
|   841|      20|       5|  1|1:32.342|       92342|
|   841|      20|       6|  1|1:32.605|       92605|
|   841|      20|       7|  1|1:32.502|       92502|
|   841|      20|       8|  1|1:32.537|       92537|
|   841|      20|       9|  1|1:33.240|       93240|
|   841|      20|      10|  1|1:32.572|       92572|
|   841|      20|      11|  1|1:32.669|       92669|
|   841|      20|      12|  1|1:32.902|       92902|
|   841|      20|      13|  1|1:33.698|       93698|
|   841|      20|      14|  3|1:52.075|      112075|
|   841|      20|      15|  4|1:38.385|       98385|
|   841|      20|      16|  2|1:31.548|       

In [12]:
lap_times_df.count()

490904

### Step 3 - Rename columns and add new columns

In [13]:
lap_times_df = lap_times_df\
    .withColumnRenamed("raceId", "race_id")\
    .withColumnRenamed("driverId", "driver_id")\
    .withColumn("ingestion_date", current_timestamp())

### Step 4 - Write to output to process container in Parquet Format

In [14]:
lap_times_df.write.mode("overwrite").parquet("../processed_data/lap_times")

                                                                                