# Produce driver standings

## Pre-requisites
- Add file date parameter widget to the notebook and get it as a variable
- Run configuration notebook with folder paths
- Run common_functions notebook

In [0]:
# Declare a text widget named "param_file_date" (empty default) and read its
# current value; later cells use var_file_date to select the rows to process.
dbutils.widgets.text("param_file_date", "")
var_file_date = dbutils.widgets.get("param_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Identify the race years affected by the incoming file date

- Find race years for which the data is to be reprocessed

In [0]:
from pyspark.sql.functions import col

# Load only the race results that arrived with the file date being processed.
# The predicate is a Column comparison rather than an interpolated SQL string,
# so a quote or other SQL syntax in the widget value cannot break or alter it.
race_results_df = (
    spark.read.format("delta")
    .load(f"{presentation_folder_path}/race_results")
    .filter(col("file_date") == var_file_date)
)

### Collect the affected race years and reload all of their race results

In [0]:
# Pull the 'race_year' values out of the file-date-filtered results.
# NOTE(review): df_column_to_list comes from the common_functions notebook;
# presumably it returns a Python list of the distinct values - confirm there.
race_year_list = df_column_to_list(race_results_df, 'race_year')

In [0]:
from pyspark.sql.functions import col

# Re-read race results, this time keeping every row whose race_year is in
# race_year_list (not just the rows for the current file date), so the
# aggregation below sees all stored results for each affected year.
race_results_df = (
    spark.read
    .format("delta")
    .load(f"{presentation_folder_path}/race_results")
    .filter(col("race_year").isin(race_year_list))
)

## Step 1 - Group data by year and driver

In [0]:
# Import the functions module under a namespace instead of pulling `sum` and
# `count` into the notebook scope, which would shadow Python's builtin `sum`
# for every later cell.
from pyspark.sql import functions as F

# One row per (race_year, driver_name, driver_nationality) with the summed
# points and the number of first-place finishes.
driver_standings_df = race_results_df.groupBy(
    "race_year", "driver_name", "driver_nationality"
).agg(
    F.sum("points").alias("total_points"),
    # when() without otherwise() yields NULL for non-winning rows and count()
    # ignores NULLs, so only rows with position == 1 are counted.
    F.count(F.when(F.col("position") == 1, True)).alias("wins"),
)

## Step 2 - Use window function to rank drivers by total points and wins per year

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

# Rank drivers within each race_year: highest total_points first, with the
# number of wins as the tie-breaker. rank() gives tied drivers the same rank.
driver_rank_spec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"), desc("wins"))
)
final_df = driver_standings_df.withColumn(
    "rank",
    rank().over(driver_rank_spec),
)

## Step 3 - Write data to Data Lake as a managed table in Delta Lake format

In [0]:
# A target row matches a source row when both driver name and race year agree.
merge_condition = (
    "tgt.driver_name = src.driver_name "
    "AND tgt.race_year = src.race_year"
)

# NOTE(review): merge_delta_data is defined in the common_functions notebook;
# presumably it merges final_df into f1_presentation.driver_standings using
# merge_condition, with 'race_year' as the partition column - confirm there.
merge_delta_data(
    final_df,
    'f1_presentation',
    'driver_standings',
    presentation_folder_path,
    merge_condition,
    'race_year',
)