### Ingest lap_times_split_X.csv files

In [0]:
# Declare the "p_data_source" text widget (default: empty string) and read its
# current value. The value is later stamped onto every row as a literal
# "data_source" column.
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare the "p_file_date" text widget (default: "2021-03-21") and read its
# current value. The value selects the dated sub-folder of the raw layer to read
# from and is also stamped onto every row as a literal "file_date" column.
dbutils.widgets.text("p_file_date","2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Step 1 - Read the multiple CSV files using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the lap-times CSV files, listed in file column order as
# (column name, Spark type, nullable).
_lap_times_columns = [
    ("raceId", IntegerType(), False),
    ("driverId", IntegerType(), True),
    ("lap", IntegerType(), True),
    ("position", IntegerType(), True),
    ("time", StringType(), True),
    ("milliseconds", IntegerType(), True),
]

fields = [
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in _lap_times_columns
]
lap_times_schema = StructType(fields=fields)

In [0]:
# Glob matching every lap_times_split_*.csv file under this file date's
# lap_times folder. raw_folder_path is not defined in this notebook; presumably
# it comes from the "../includes/configuration" notebook run above.
lap_times_path = f"{raw_folder_path}/{v_file_date}/lap_times/lap_times_split_*.csv"

# Read all matching files into one dataframe using the explicit schema.
lap_times_df = (
    spark.read
    .schema(lap_times_schema)
    .csv(lap_times_path)
)

##### Step 2 - Create new ingestion_date column and rename required columns

In [0]:
from pyspark.sql.functions import lit

# Rename the camelCase id columns to snake_case and tag each row with the
# widget-supplied data source and file date as literal columns.
lap_times_renamed_df = (
    lap_times_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

# add_ingestion_date is not defined in this notebook; presumably it comes from
# "../includes/common_functions" and appends an ingestion timestamp column.
lap_times_final_df = add_ingestion_date(lap_times_renamed_df)

##### Step 3 - Write dataframe to the f1_processed.lap_times table using a Delta merge

In [0]:
# A lap-time row is matched between target and source on (race_id, driver_id, lap).
# The adjacent literals concatenate into a single condition string.
merge_condition = (
    "target.race_id = source.race_id "
    "and target.driver_id = source.driver_id "
    "and target.lap = source.lap"
)

# merge_delta_data is not defined in this notebook (presumably from
# "../includes/common_functions"). Arguments are passed positionally because
# its parameter names are not visible here.
merge_delta_data(
    lap_times_final_df,
    "f1_processed",
    "lap_times",
    processed_folder_path,
    merge_condition,
    "race_id",
)

In [0]:
# End the notebook run here and return "Success" as its exit value, so a caller
# (e.g. one using dbutils.notebook.run) can check the outcome.
dbutils.notebook.exit("Success")

Success

In [0]:
%sql
/*select race_id,count(1) 
  from f1_processed.lap_times
 group by race_id
 order by race_id desc*/

race_id,count(1)
1053,1124
1052,1026
1047,1043
1046,1531
1045,1016
1044,1076
1043,1128
1042,1288
1041,1017
1040,946
