### Step 1 - Include the shared configurations and common functions, then start the Spark session

In [1]:
%run "../includes/configurations"

In [2]:
%run "../includes/common_functions"

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import concat, lit, to_timestamp


# Obtain the Spark session for this notebook: getOrCreate() hands back an
# already-running session when there is one, otherwise it starts a new one
# registered under the application name "RacesIngestion".
spark = (
    SparkSession.builder
    .appName("RacesIngestion")
    .getOrCreate()
)

23/12/29 11:12:03 WARN Utils: Your hostname, falcao-sys resolves to a loopback address: 127.0.1.1; using 192.168.11.185 instead (on interface wlx7898e8c12476)
23/12/29 11:12:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/29 11:12:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Step 2 - Define a data schema and read the CSV file using the Spark DataFrame reader

In [4]:
# Explicit schema for races.csv, passed to the reader so column types are
# declared up front instead of being inferred.
# "date" and "time" are kept as separate columns here (a date and a plain
# string); they are combined into a single timestamp in a later cell.
races_schema = StructType([
    StructField("raceId", IntegerType(), nullable=False),
    StructField("year", IntegerType(), nullable=True),
    StructField("round", IntegerType(), nullable=True),
    StructField("circuitId", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=False),
    StructField("date", DateType(), nullable=False),
    StructField("time", StringType(), nullable=False),
    StructField("url", StringType(), nullable=True),
])

In [5]:
# Load the races CSV with the explicit schema defined above; the first line
# of the file is treated as a header row rather than as data.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv("../data/races.csv")
)

### Step 3 - Rename and drop columns, and add new columns

In [6]:
# Switch the camelCase source column names to snake_case, and make "year"
# explicit as "race_year".
races_df = (
    races_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)

In [7]:
# Remove the "url" column, which is not carried into the processed data.
# The column is dropped by name rather than via races_df["url"]: a Column
# lookup is resolved immediately and raises once "url" is already gone,
# whereas drop() with a name is a no-op for a missing column, so this cell
# can safely be re-run.
races_df = races_df.drop("url")

In [8]:
# add_ingestion_date is not defined in this notebook.
# NOTE(review): presumably it comes from ../includes/common_functions and
# appends an ingestion timestamp column — confirm against that file.
races_df = add_ingestion_date(races_df)

# Build "<date> <time>" as text, then parse it into a single timestamp.
race_datetime_text = concat(races_df["date"], lit(" "), races_df["time"])

# NOTE(review): the column name "race_timestemp" looks like a typo for
# "race_timestamp". It is left unchanged here because the name becomes part
# of the written dataset's schema; rename it only together with its consumers.
races_df = races_df.withColumn(
    "race_timestemp",
    to_timestamp(race_datetime_text, "yyyy-MM-dd HH:mm:ss"),
)

### Step 4 - Write the data to the data lake as Parquet

In [9]:
# Persist the transformed races data as Parquet under the processed-data
# folder, replacing whatever a previous run wrote there.
# NOTE(review): processed_data_folder_path is not defined in this notebook;
# presumably it is set by ../includes/configurations — confirm.
(
    races_df.write
    .mode("overwrite")
    .parquet(f"{processed_data_folder_path}/races")
)

                                                                                