In [0]:
# Declare a text widget named "p_data_source" with an empty-string default.
dbutils.widgets.text("p_data_source", "")
# Read the widget's current value; it is stamped onto every row later as the
# "data_source" column.
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/common_functions"

In [0]:
%run "../includes/configuration"

In [0]:
%fs
ls /mnt/bluetab1002/raw

path,name,size,modificationTime
dbfs:/mnt/bluetab1002/raw/circuits.csv,circuits.csv,10044,1738253538000
dbfs:/mnt/bluetab1002/raw/constructors.json,constructors.json,30415,1738253538000
dbfs:/mnt/bluetab1002/raw/drivers.json,drivers.json,180812,1738253538000
dbfs:/mnt/bluetab1002/raw/lap_times/,lap_times/,0,0
dbfs:/mnt/bluetab1002/raw/pit_stops.json,pit_stops.json,1369387,1738253539000
dbfs:/mnt/bluetab1002/raw/qualifying/,qualifying/,0,0
dbfs:/mnt/bluetab1002/raw/races.csv,races.csv,116847,1738253538000
dbfs:/mnt/bluetab1002/raw/results.json,results.json,7165641,1738253542000


In [0]:
# Exploratory preview: read drivers.json with an inferred schema to see the raw
# field names and the nested "name" object before defining an explicit schema.
# Uses raw_folder_path (presumably provided by ../includes/configuration) so the
# preview reads the same location as the ingestion read below instead of a
# hard-coded mount path.
display(spark.read.json(f"{raw_folder_path}/drivers.json"))

code,dob,driverId,driverRef,name,nationality,number,url
HAM,1985-01-07,1,hamilton,"List(Lewis, Hamilton)",British,44,http://en.wikipedia.org/wiki/Lewis_Hamilton
HEI,1977-05-10,2,heidfeld,"List(Nick, Heidfeld)",German,\N,http://en.wikipedia.org/wiki/Nick_Heidfeld
ROS,1985-06-27,3,rosberg,"List(Nico, Rosberg)",German,6,http://en.wikipedia.org/wiki/Nico_Rosberg
ALO,1981-07-29,4,alonso,"List(Fernando, Alonso)",Spanish,14,http://en.wikipedia.org/wiki/Fernando_Alonso
KOV,1981-10-19,5,kovalainen,"List(Heikki, Kovalainen)",Finnish,\N,http://en.wikipedia.org/wiki/Heikki_Kovalainen
NAK,1985-01-11,6,nakajima,"List(Kazuki, Nakajima)",Japanese,\N,http://en.wikipedia.org/wiki/Kazuki_Nakajima
BOU,1979-02-28,7,bourdais,"List(Sébastien, Bourdais)",French,\N,http://en.wikipedia.org/wiki/S%C3%A9bastien_Bourdais
RAI,1979-10-17,8,raikkonen,"List(Kimi, Räikkönen)",Finnish,7,http://en.wikipedia.org/wiki/Kimi_R%C3%A4ikk%C3%B6nen
KUB,1984-12-07,9,kubica,"List(Robert, Kubica)",Polish,88,http://en.wikipedia.org/wiki/Robert_Kubica
GLO,1982-03-18,10,glock,"List(Timo, Glock)",German,\N,http://en.wikipedia.org/wiki/Timo_Glock


In [0]:
from pyspark.sql.types import *

In [0]:
# "name" is a nested object in drivers.json, so it gets its own struct schema
# with the two string parts.
name_schema = (
    StructType()
    .add("forename", StringType(), True)
    .add("surname", StringType(), True)
)

# Explicit schema for drivers.json. All fields are nullable; "name" reuses the
# nested struct defined above.
drivers_schema = (
    StructType()
    .add("driverId", IntegerType(), True)
    .add("driverRef", StringType(), True)
    .add("code", StringType(), True)
    .add("dob", DateType(), True)
    .add("name", name_schema, True)
    .add("nationality", StringType(), True)
    .add("number", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load the raw drivers file, applying the explicit schema rather than letting
# Spark infer one.
df = spark.read.json(
    f"{raw_folder_path}/drivers.json",
    schema=drivers_schema,
)

# Disabled alternative: persist the raw read as a table.
#df.write.mode("overwrite").saveAsTable("f1_raw.drivers")

In [0]:
# Preview the rows parsed with the explicit schema, then print the schema tree
# to confirm the column types and the nested "name" struct.
display(df)
df.printSchema()

driverId,driverRef,code,dob,name,nationality,number,url
1,hamilton,HAM,1985-01-07,"List(Lewis, Hamilton)",British,44.0,http://en.wikipedia.org/wiki/Lewis_Hamilton
2,heidfeld,HEI,1977-05-10,"List(Nick, Heidfeld)",German,,http://en.wikipedia.org/wiki/Nick_Heidfeld
3,rosberg,ROS,1985-06-27,"List(Nico, Rosberg)",German,6.0,http://en.wikipedia.org/wiki/Nico_Rosberg
4,alonso,ALO,1981-07-29,"List(Fernando, Alonso)",Spanish,14.0,http://en.wikipedia.org/wiki/Fernando_Alonso
5,kovalainen,KOV,1981-10-19,"List(Heikki, Kovalainen)",Finnish,,http://en.wikipedia.org/wiki/Heikki_Kovalainen
6,nakajima,NAK,1985-01-11,"List(Kazuki, Nakajima)",Japanese,,http://en.wikipedia.org/wiki/Kazuki_Nakajima
7,bourdais,BOU,1979-02-28,"List(Sébastien, Bourdais)",French,,http://en.wikipedia.org/wiki/S%C3%A9bastien_Bourdais
8,raikkonen,RAI,1979-10-17,"List(Kimi, Räikkönen)",Finnish,7.0,http://en.wikipedia.org/wiki/Kimi_R%C3%A4ikk%C3%B6nen
9,kubica,KUB,1984-12-07,"List(Robert, Kubica)",Polish,88.0,http://en.wikipedia.org/wiki/Robert_Kubica
10,glock,GLO,1982-03-18,"List(Timo, Glock)",German,,http://en.wikipedia.org/wiki/Timo_Glock


root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- code: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- url: string (nullable = true)



In [0]:
from pyspark.sql.functions import col, concat, current_timestamp, lit

# Shape the raw drivers frame for the processed layer:
#   1. rename the camelCase id/ref columns to snake_case,
#   2. flatten the nested name struct into a single "forename surname" string,
#   3. drop the url column,
#   4. append the audit columns (ingestion timestamp, then the data source
#      taken from the notebook widget).
# NOTE(review): this reassigns `df`, so re-running the cell on its own output
# fails because "name" is no longer a struct; re-run from the read cell instead.
df = (
    df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("name", concat(col("name.forename"), lit(' '), col("name.surname")))
    .drop("url")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit(v_data_source))
)

In [0]:
# Preview the transformed DataFrame before it is written out.
display(df)

driver_id,driver_ref,code,dob,name,nationality,number,ingestion_date
1,hamilton,HAM,1985-01-07,Lewis Hamilton,British,44.0,2025-02-01T17:47:05.539Z
2,heidfeld,HEI,1977-05-10,Nick Heidfeld,German,,2025-02-01T17:47:05.539Z
3,rosberg,ROS,1985-06-27,Nico Rosberg,German,6.0,2025-02-01T17:47:05.539Z
4,alonso,ALO,1981-07-29,Fernando Alonso,Spanish,14.0,2025-02-01T17:47:05.539Z
5,kovalainen,KOV,1981-10-19,Heikki Kovalainen,Finnish,,2025-02-01T17:47:05.539Z
6,nakajima,NAK,1985-01-11,Kazuki Nakajima,Japanese,,2025-02-01T17:47:05.539Z
7,bourdais,BOU,1979-02-28,Sébastien Bourdais,French,,2025-02-01T17:47:05.539Z
8,raikkonen,RAI,1979-10-17,Kimi Räikkönen,Finnish,7.0,2025-02-01T17:47:05.539Z
9,kubica,KUB,1984-12-07,Robert Kubica,Polish,88.0,2025-02-01T17:47:05.539Z
10,glock,GLO,1982-03-18,Timo Glock,German,,2025-02-01T17:47:05.539Z


In [0]:
# Write the transformed drivers as parquet under the processed folder,
# replacing whatever a previous run left there.
df.write.parquet(f"{process1_folder_path}/drivers", mode="overwrite")
# Disabled alternative: save as a table instead of plain parquet files.
#df.write.mode("overwrite").saveAsTable("f1_process.drivers")

In [0]:
%fs
ls dbfs:/mnt/bluetab1002/process1/drivers

path,name,size,modificationTime
dbfs:/mnt/bluetab1002/process1/drivers/_SUCCESS,_SUCCESS,0,1738432026000
dbfs:/mnt/bluetab1002/process1/drivers/_committed_5941438985986966631,_committed_5941438985986966631,123,1738324614000
dbfs:/mnt/bluetab1002/process1/drivers/_committed_9199977622264329636,_committed_9199977622264329636,232,1738432026000
dbfs:/mnt/bluetab1002/process1/drivers/_committed_vacuum5342410771354009032,_committed_vacuum5342410771354009032,96,1738432026000
dbfs:/mnt/bluetab1002/process1/drivers/_started_9199977622264329636,_started_9199977622264329636,0,1738432026000
dbfs:/mnt/bluetab1002/process1/drivers/part-00000-tid-9199977622264329636-68ed4097-3a7b-4e2e-b3cd-539a065a96df-52-1-c000.snappy.parquet,part-00000-tid-9199977622264329636-68ed4097-3a7b-4e2e-b3cd-539a065a96df-52-1-c000.snappy.parquet,29231,1738432026000


In [0]:
# End the notebook run and hand "Success" back as its exit value.
dbutils.notebook.exit("Success")