### Ingest circuits.csv file

In [0]:
# Notebook parameter "p_data_source" (text widget, default ""). Its value is
# written unchanged into the data_source column of the output further down.
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter "p_file_date" (text widget, default "2021-03-21"). It names
# the dated sub-folder of raw_folder_path that circuits.csv is read from, and is
# also stored in the file_date column of the output.
dbutils.widgets.text("p_file_date","2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Step 1 - Read the CSV file using the spark dataframe reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# Explicit schema handed to the CSV reader for circuits.csv.
# Each .add(name, type, nullable) appends one column definition, in order.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load circuits.csv from the dated raw folder. The first line of the file is a
# header row, and column types come from circuits_schema rather than inference.
circuits_df = spark.read.csv(
    f"{raw_folder_path}/{v_file_date}/circuits.csv",
    schema=circuits_schema,
    header=True,
)

##### Step 2 - Select only the required columns

In [0]:
from pyspark.sql.functions import col

# Keep every column from the schema except "url", preserving the original order.
circuits_selected_df = circuits_df.select(
    *[
        col(column_name)
        for column_name in (
            "circuitId",
            "circuitRef",
            "name",
            "location",
            "country",
            "lat",
            "lng",
            "alt",
        )
    ]
)

##### Step 3 - Rename the columns as required

In [0]:
from pyspark.sql.functions import lit

circuits_renamed_df = (
    circuits_selected_df
    # camelCase source names -> snake_case
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    # abbreviated coordinate names -> full words
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    # constant columns carrying the two notebook parameters on every row
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

##### Step 4 - Add the ingestion date column

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it is provided by
# the ../includes/common_functions include and appends the ingestion_date column
# seen in the sample output — confirm against that notebook.
circuits_final_df = add_ingestion_date(circuits_renamed_df)

##### Step 5 - Write data to the data lake in Delta format

In [0]:
# Persist the final DataFrame; the arguments name the database ("f1_processed")
# and table ("circuits"). full_write_delta_data is defined outside this notebook
# (presumably in ../includes/common_functions); its name suggests a full
# overwrite in Delta format — TODO confirm in the helper.
full_write_delta_data(circuits_final_df,"f1_processed","circuits")

In [0]:
# End the notebook run and hand the string "Success" back as its exit value,
# so a calling notebook or job can check the outcome.
dbutils.notebook.exit("Success")

Success

In [0]:
%sql
-- Ad-hoc check of the written table. The query is left commented out, so this
-- cell executes nothing; uncomment it to inspect f1_processed.circuits manually.
--SELECT * FROM f1_processed.circuits

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date,ingestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,teste,2021-03-21,2022-04-21T15:58:44.514+0000
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,teste,2021-03-21,2022-04-21T15:58:44.514+0000
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,teste,2021-03-21,2022-04-21T15:58:44.514+0000
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,teste,2021-03-21,2022-04-21T15:58:44.514+0000
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,teste,2021-03-21,2022-04-21T15:58:44.514+0000
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,teste,2021-03-21,2022-04-21T15:58:44.514+0000
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,teste,2021-03-21,2022-04-21T15:58:44.514+0000
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,teste,2021-03-21,2022-04-21T15:58:44.514+0000
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,teste,2021-03-21,2022-04-21T15:58:44.514+0000
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,teste,2021-03-21,2022-04-21T15:58:44.514+0000
