# Ingest circuits.csv file

##### Step 1 - Read the CSV file using the spark dataframe reader

In [0]:
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/formula1dl612/presentation,abfss://presentation@formula1dl612.dfs.core.windows.net/,
/mnt/formula1dl612/processed,abfss://processed@formula1dl612.dfs.core.windows.net/,
/Volumes,UnityCatalogVolumes,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/mnt/formula1dl612/raw,abfss://raw@formula1dl612.dfs.core.windows.net/,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,
/mnt/formula1dl612/demo,abfss://demo@formula1dl612.dfs.core.windows.net/,


In [0]:
%fs
ls mnt/formula1dl612/raw

path,name,size,modificationTime
dbfs:/mnt/formula1dl612/raw/.DS_Store,.DS_Store,10244,1708622927000
dbfs:/mnt/formula1dl612/raw/circuits.csv,circuits.csv,10044,1708622973000
dbfs:/mnt/formula1dl612/raw/constructors.json,constructors.json,30415,1708622927000
dbfs:/mnt/formula1dl612/raw/drivers.json,drivers.json,180812,1708622927000
dbfs:/mnt/formula1dl612/raw/lap_times/,lap_times/,0,1708623008000
dbfs:/mnt/formula1dl612/raw/pit_stops.json,pit_stops.json,1369387,1708622928000
dbfs:/mnt/formula1dl612/raw/qualifying/,qualifying/,0,1708623019000
dbfs:/mnt/formula1dl612/raw/races.csv,races.csv,116847,1708622973000
dbfs:/mnt/formula1dl612/raw/results.json,results.json,7165641,1708622929000


In [0]:
circuits_df = spark.read.csv("dbfs:/mnt/formula1dl612/raw/circuits.csv")

In [0]:
type(circuits_df)

Out[3]: pyspark.sql.dataframe.DataFrame

In [0]:
# Header should be fixed
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|      _c0|           _c1|                 _c2|         _c3|      _c4|     _c5|      _c6|_c7|                 _c8|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517| 

In [0]:
display(circuits_df.head(5))

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8
circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya


In [0]:
# Fix header
# https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.csv.html#pyspark.sql.DataFrameReader.csv
# https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.DataFrameReader.csv.html
# Add .option("header", True) before .csv("dbfs:/...")
circuits_df = spark.read.option("header", True).csv("dbfs:/mnt/formula1dl612/raw/circuits.csv")

In [0]:
display(circuits_df.head(5))

circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park


In [0]:
circuits_df.printSchema()

root
 |-- circuitId: string (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- alt: string (nullable = true)
 |-- url: string (nullable = true)



In [0]:
circuits_df.describe().show()

+-------+------------------+----------+-------+---------+---------+------------------+-----------------+-----------------+--------------------+
|summary|         circuitId|circuitRef|   name| location|  country|               lat|              lng|              alt|                 url|
+-------+------------------+----------+-------+---------+---------+------------------+-----------------+-----------------+--------------------+
|  count|                77|        77|     77|       77|       77|                77|               77|               77|                  77|
|   mean|              39.0|      null|   null|     null|     null| 33.72035103896102|3.551302597402597|247.4935064935065|                null|
| stddev|22.371857321197094|      null|   null|     null|     null|22.885969000074535| 64.8766790440326|363.2672505910991|                null|
|    min|                 1|       BAK|A1-Ring|Abu Dhabi|Argentina|          -22.9756|        -0.331667|               -7|http://en.wiki

###### A schema is the description of the structure of your data (which together create a Dataset in Spark SQL). It can be implicit (and inferred at runtime) or explicit (and known at compile time).

In [0]:
# inferSchema : str or bool, optional
# infers the input schema automatically from data. It requires one extra pass over the data. If None is set, it uses the default value, false.

circuits_df = spark.read \
.option("header", True) \
.option("inferSchema", True) \
.csv("dbfs:/mnt/formula1dl612/raw/circuits.csv")

In [0]:
display(circuits_df.head(5))

circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park


In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# CircuitId is a primary key
circuits_schema = StructType(fields=[StructField("circuitId", IntegerType(), False),
                                     StructField("circuitRef", StringType(), True),
                                     StructField("name", StringType(), True),
                                     StructField("location", StringType(), True),
                                     StructField("country", StringType(), True),
                                     StructField("lat", DoubleType(), True),
                                     StructField("lng", DoubleType(), True),
                                     StructField("alt", IntegerType(), True),
                                     StructField("url", StringType(), True)
])

In [0]:
circuits_df = spark.read \
.option("header", True) \
.schema(circuits_schema) \
.csv("dbfs:/mnt/formula1dl612/raw/circuits.csv")

In [0]:
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



#### Select only the required columns

In [0]:
circuits_selected_df = circuits_df.select("circuitId", "circuitRef", "name", "location", "country", "lat", "lng", "alt")
display(circuits_selected_df.head(5))

circuitId,circuitRef,name,location,country,lat,lng,alt
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130


In [0]:
circuits_selected_df = circuits_df.select(circuits_df.circuitId, circuits_df.circuitRef, circuits_df.name, circuits_df.location, circuits_df.country, circuits_df.lat, circuits_df.lng, circuits_df.alt)


In [0]:
circuits_selected_df = circuits_df.select(circuits_df["circuitId"],
                                          circuits_df["circuitRef"],
                                          circuits_df["name"],
                                          circuits_df["location"],
                                          circuits_df["country"],
                                          circuits_df["lat"],
                                          circuits_df["lng"],
                                          circuits_df["alt"])

In [0]:
from pyspark.sql.functions import col

In [0]:
# Change country column to race_country
circuits_selected_df = circuits_df.select(col("circuitId"),
                                          col("circuitRef"),
                                          col("name"),
                                          col("location"),
                                          col("country") \
                                          #.alias("race_country")
                                          ,
                                          col("lat"),
                                          col("lng"),
                                          col("alt"))
display(circuits_selected_df.head(5))

circuitId,circuitRef,name,location,country,lat,lng,alt
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130


#####Step 3 - Rename the columns as required

In [0]:
circuits_renamed_df = circuits_selected_df.withColumnRenamed("circuitId", "circuit_id") \
.withColumnRenamed("circuitRef", "circuit_ref") \
.withColumnRenamed("lat", "latitude") \
.withColumnRenamed("lng", "longtitue") \
.withColumnRenamed("alt", "altitude")
display(circuits_renamed_df.head(5))      

circuit_id,circuit_ref,name,location,country,latitude,longtitue,altitude
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130


#####Step 4 - Add ingestion date to the dataframe

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
circuits_final_df = circuits_renamed_df.withColumn("ingestion_date", current_timestamp()) \
.withColumn("env",lit( "Production"))

In [0]:
display(circuits_final_df.head(5))

circuit_id,circuit_ref,name,location,country,latitude,longtitue,altitude,ingestion_date,env
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,2024-02-23T14:50:30.346+0000,Production
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,2024-02-23T14:50:30.346+0000,Production
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,2024-02-23T14:50:30.346+0000,Production
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,2024-02-23T14:50:30.346+0000,Production
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,2024-02-23T14:50:30.346+0000,Production


In [0]:
circuits_final_df = circuits_renamed_df.withColumn("ingestion_date", current_timestamp())

#####Step 5 - Write datalake as parquet

In [0]:
circuits_final_df.write.mode("overwrite").parquet("/mnt/formula1dl612/processed/circuits")

In [0]:
%fs
ls /mnt/formula1dl612/processed/circuits

path,name,size,modificationTime
dbfs:/mnt/formula1dl612/processed/circuits/_committed_1154076571966783487,_committed_1154076571966783487,222,1708699831000
dbfs:/mnt/formula1dl612/processed/circuits/_committed_8267251785614308885,_committed_8267251785614308885,232,1708642357000
dbfs:/mnt/formula1dl612/processed/circuits/_committed_9073018565150424399,_committed_9073018565150424399,123,1708642221000
dbfs:/mnt/formula1dl612/processed/circuits/_committed_vacuum5579625262302098317,_committed_vacuum5579625262302098317,96,1708699832000
dbfs:/mnt/formula1dl612/processed/circuits/_started_1154076571966783487,_started_1154076571966783487,0,1708699831000
dbfs:/mnt/formula1dl612/processed/circuits/part-00000-tid-1154076571966783487-6f7896a5-0cdd-4ad8-8485-cf1c490078ef-242-1-c000.snappy.parquet,part-00000-tid-1154076571966783487-6f7896a5-0cdd-4ad8-8485-cf1c490078ef-242-1-c000.snappy.parquet,7840,1708699831000


In [0]:
display(spark.read.parquet("/mnt/formula1dl612/processed/circuits").head(5))

circuit_id,circuit_ref,name,location,country,latitude,longtitue,altitude,ingestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,2024-02-23T14:50:31.118+0000
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,2024-02-23T14:50:31.118+0000
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,2024-02-23T14:50:31.118+0000
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,2024-02-23T14:50:31.118+0000
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,2024-02-23T14:50:31.118+0000
