### Step 1: Start Spark Session

In [70]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.functions import concat, lit, to_timestamp, current_timestamp


# Obtain the SparkSession for this notebook: getOrCreate() reuses an already
# running session if one exists, otherwise it starts a new one under the
# application name "RacesIngestion".
spark = (
    SparkSession.builder
    .appName("RacesIngestion")
    .getOrCreate()
)

### Step 2: Define a data schema and read the CSV file using the Spark DataFrame reader

In [71]:
# Explicit schema for the races CSV, so column types are declared here rather
# than inferred. "date" and "time" are read as plain strings.
# NOTE(review): Spark's file-based readers may relax nullable=False to nullable
# when loading — confirm before relying on these flags to enforce non-null data.
races_schema = StructType([
    StructField(name="raceId", dataType=IntegerType(), nullable=False),
    StructField(name="year", dataType=IntegerType(), nullable=True),
    StructField(name="round", dataType=IntegerType(), nullable=True),
    StructField(name="circuitId", dataType=IntegerType(), nullable=False),
    StructField(name="name", dataType=StringType(), nullable=False),
    StructField(name="date", dataType=StringType(), nullable=False),
    StructField(name="time", dataType=StringType(), nullable=False),
    StructField(name="url", dataType=StringType(), nullable=True),
])

In [72]:
# Read the races CSV with the explicit schema defined above; the first line of
# the file is treated as a header row rather than data.
races_df = (
    spark.read
    .schema(races_schema)
    .option("header", True)
    .csv("../data/races.csv")
)

### Step 2: Add ingestion date and race_timestamp to the DataFrame

In [73]:
# Build a single "<date> <time>" string column from the two source columns so
# it can be parsed as one timestamp.
race_datetime_text = concat(races_df["date"], lit(" "), races_df["time"])

# Add two derived columns:
#   - ingestion_date: the timestamp at which this load runs
#   - race_timestemp: the combined date/time parsed with "yyyy-MM-dd HH:mm:ss"
#     (the column name is spelled this way on purpose here, because the
#     following select cell refers to it by this exact name)
# NOTE(review): rows whose date/time text does not match the pattern presumably
# become null under default (non-ANSI) settings — confirm against the data.
races_df = (
    races_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "race_timestemp",
        to_timestamp(race_datetime_text, "yyyy-MM-dd HH:mm:ss"),
    )
)

### Step 3: Selecting only required columns

In [74]:
# Keep only the columns required in the processed output. The raw "date",
# "time" and "url" columns are dropped ("date"/"time" are already folded into
# the parsed timestamp column).
# "ingestion_date" is selected explicitly: it is added in the previous step,
# and omitting it here would silently discard the audit timestamp before the
# data is written.
races_df = races_df.select(
    "raceId",
    "year",
    "round",
    "circuitId",
    "name",
    "race_timestemp",
    "ingestion_date",
)

### Step 4: Renaming columns

In [75]:
# Rename columns to snake_case for the processed layer.
# The final rename corrects the misspelled "race_timestemp" column produced
# earlier in the notebook, so the persisted column is named "race_timestamp" as
# the notebook text describes. withColumnRenamed does nothing when the source
# column is absent, so this stays safe if the upstream spelling is later fixed.
races_df = (
    races_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("race_timestemp", "race_timestamp")
)

### Step 5: Write the data to the data lake as Parquet

In [76]:
# Persist the result as Parquet under "processed/races", one sub-directory per
# race_year value. "overwrite" mode replaces any existing output at that path,
# so re-running the notebook does not fail on existing data.
(
    races_df.write
    .partitionBy("race_year")
    .mode("overwrite")
    .parquet("processed/races")
)

                                                                                