#### Ingest constructors.json file

#### Step 1 - Prep runtime + common parameters

In [0]:
# Runtime parameter: declare a text widget named "p_data_source" (default: empty string)
# and read its current value into v_data_source. The value is later stamped onto every
# row as the "data_source" audit column.

dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

# Debug aid - uncomment to inspect the value that was passed in:
#print(v_data_source)

In [0]:
# Runtime parameter: declare a text widget named "p_file_extract_date" (default: empty
# string) and read its current value into v_file_extract_date. The value is used as a
# folder name when building the input path, and is also stamped onto every row as the
# "file_extract_date" audit column.
# NOTE(review): with the empty default the input path contains an empty folder segment -
# confirm that callers always supply this parameter.

dbutils.widgets.text("p_file_extract_date", "")
v_file_extract_date = dbutils.widgets.get("p_file_extract_date")

# Debug aid - uncomment to inspect the value that was passed in:
#print(v_file_extract_date)

In [0]:
# Run the configuration notebook to bring the parameterised folder paths into scope, e.g.
# print(raw_folder_path)
# print(processed_folder_path)

# To use these variables inside a path, make the string an f-string and wrap the variable
# name in curly braces, e.g. f"{raw_folder_path}/some_file.json".
# NOTE: each %run command must be placed in a cell of its own.

In [0]:
%run "../includes/configuration"

In [0]:
# Run common_functions notebook to append ingestion_date

In [0]:
%run "../includes/common_functions"

In [0]:
# Sanity check: show the raw folder path, which is expected to come from the configuration notebook.
print(raw_folder_path)

#### Step 2 - Read the JSON file using the spark dataframe reader

In [0]:
# The schema is declared as a DDL-formatted string ("<column> <TYPE>, ...") instead of an
# explicit StructType, even though StructType is generally the more recommended form.
# One entry per column keeps the definition easy to scan and to diff.

constructors_schema = ", ".join(
    [
        "constructorId INT",
        "constructorRef STRING",
        "name STRING",
        "nationality STRING",
        "url STRING",
    ]
)

In [0]:
# Read constructors.json from the folder for the requested extract date, applying the
# explicit schema defined above rather than letting Spark infer one.
constructor_df = (
    spark.read
    .schema(constructors_schema)
    .json(f"dbfs:{raw_folder_path}/{v_file_extract_date}/constructors.json")
)

In [0]:
# Verify dataframe

#display(constructor_df)
#constructor_df.printSchema()

#### Step 3 - Drop unwanted columns from the dataframe

In [0]:
from pyspark.sql.functions import col

In [0]:
# Remove the "url" column, which is not required, using the DataFrame.drop(col) API.
# Referencing the column through its dataframe (constructor_df['url']) is recommended over
# the bare col('url') form, especially when working with multiple dataframes.

# Equivalent alternative using col():
# constructor_dropped_df = constructor_df.drop(col('url'))
constructor_dropped_df = constructor_df.drop( constructor_df['url'] )

#### Step 4 - Rename columns and add audit columns (ingestion date, data source, file extract date)

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Build the final dataframe:
#   - rename the camelCase id/ref columns to snake_case
#   - add the "data_source" and "file_extract_date" audit columns from the runtime parameters
#   - pass the result through add_ingestion_date (presumably defined in the common_functions
#     notebook) to append the ingestion date audit column
from pyspark.sql.functions import lit

constructor_final_df = add_ingestion_date(
    constructor_dropped_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_extract_date", lit(v_file_extract_date))
)

# Debug aid - uncomment to inspect the result:
#display(constructor_final_df)

#### Step 5 - Write output as a managed table in Delta format (originally a parquet file; converted to Delta as the final step)

In [0]:
# Persist the final dataframe in Delta format as table f1_processed.constructors.
# mode("overwrite") replaces the table's existing contents on every run.
# NOTE(review): an earlier note here called this an "external" table, but saveAsTable is
# called without an explicit path, which normally produces a managed table - confirm.
#
# Superseded earlier variants, kept for reference (plain parquet files, then a
# parquet-format table):
# <start>
#constructor_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/constructors")
#constructor_final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.constructors")
constructor_final_df.write.mode("overwrite").format("delta").saveAsTable("f1_processed.constructors")
# <end>

# Debug aid - uncomment to inspect what was written:
#display(constructor_final_df)

In [0]:
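# Legacy check from when the output was written as plain parquet files: read the files back
# directly. The output is now saved as a Delta table, so use the SQL query below instead.
#display(spark.read.parquet(f"{processed_folder_path}/constructors") )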

In [0]:
%sql
-- Verify the contents of the f1_processed.constructors table via SQL
SELECT * FROM f1_processed.constructors;

constructor_id,constructor_ref,name,nationality,data_source,file_extract_date,ingestion_date
1,mclaren,McLaren,British,JON,2021-04-18,2022-07-30T17:02:23.134+0000
2,bmw_sauber,BMW Sauber,German,JON,2021-04-18,2022-07-30T17:02:23.134+0000
3,williams,Williams,British,JON,2021-04-18,2022-07-30T17:02:23.134+0000
4,renault,Renault,French,JON,2021-04-18,2022-07-30T17:02:23.134+0000
5,toro_rosso,Toro Rosso,Italian,JON,2021-04-18,2022-07-30T17:02:23.134+0000
6,ferrari,Ferrari,Italian,JON,2021-04-18,2022-07-30T17:02:23.134+0000
7,toyota,Toyota,Japanese,JON,2021-04-18,2022-07-30T17:02:23.134+0000
8,super_aguri,Super Aguri,Japanese,JON,2021-04-18,2022-07-30T17:02:23.134+0000
9,red_bull,Red Bull,Austrian,JON,2021-04-18,2022-07-30T17:02:23.134+0000
10,force_india,Force India,Indian,JON,2021-04-18,2022-07-30T17:02:23.134+0000


#### Step 6 - Return exit code upon completion -> Success

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")

Success