In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (creating one if none exists yet),
# registered under the application name "F1".
spark = (
    SparkSession.builder
    .appName("F1")
    .getOrCreate()
)

# Download Dataset

1. Create the required directories:

In [3]:
# Start from a clean HDFS landing directory: remove any previous /f1/raw
# (-f: no error if it is absent) and recreate it.
!hdfs dfs -rm -r -f /f1/raw
!hdfs dfs -mkdir -p /f1/raw
# Local staging directory where the downloaded ZIP is expected to be placed.
!mkdir -p /data/dataset_cluster/f1

Deleted /f1/raw


2. Download the CSVs as ZIP: https://www.kaggle.com/datasets/rohanrao/formula-1-world-championship-1950-2020?resource=download
3. Move the ZIP to `/data/dataset_cluster/f1/f1.zip`
4. Unzip the file and copy the extracted CSVs to HDFS

In [4]:
# Extract the archive inside the staging directory, overwriting files from
# earlier extractions (-o). `&&` makes unzip run only if the cd succeeded, so
# a missing directory cannot cause extraction into the current directory.
!(cd /data/dataset_cluster/f1 && unzip -o f1.zip)

Archive:  f1.zip
  inflating: circuits.csv            
  inflating: constructor_results.csv  
  inflating: constructor_standings.csv  
  inflating: constructors.csv        
  inflating: driver_standings.csv    
  inflating: drivers.csv             
  inflating: lap_times.csv           
  inflating: pit_stops.csv           
  inflating: qualifying.csv          
  inflating: races.csv               
  inflating: results.csv             
  inflating: seasons.csv             
  inflating: sprint_results.csv      
  inflating: status.csv              


In [5]:
# Upload the extracted CSVs to /f1/raw, overwriting existing files (-f) and
# requesting a replication factor of 1 for the uploaded files.
!hdfs dfs -Ddfs.replication=1 -put -f /data/dataset_cluster/f1/*.csv /f1/raw/

In [6]:
# List /f1/raw to verify the upload.
!hdfs dfs -ls /f1/raw

Found 14 items
-rw-r--r--   1 cluster supergroup      10104 2025-04-27 20:55 /f1/raw/circuits.csv
-rw-r--r--   1 cluster supergroup     219365 2025-04-27 20:55 /f1/raw/constructor_results.csv
-rw-r--r--   1 cluster supergroup     317206 2025-04-27 20:55 /f1/raw/constructor_standings.csv
-rw-r--r--   1 cluster supergroup      17478 2025-04-27 20:55 /f1/raw/constructors.csv
-rw-r--r--   1 cluster supergroup     883771 2025-04-27 20:55 /f1/raw/driver_standings.csv
-rw-r--r--   1 cluster supergroup      94367 2025-04-27 20:55 /f1/raw/drivers.csv
-rw-r--r--   1 cluster supergroup   17622395 2025-04-27 20:55 /f1/raw/lap_times.csv
-rw-r--r--   1 cluster supergroup     443719 2025-04-27 20:55 /f1/raw/pit_stops.csv
-rw-r--r--   1 cluster supergroup     465231 2025-04-27 20:55 /f1/raw/qualifying.csv
-rw-r--r--   1 cluster supergroup     164344 2025-04-27 20:55 /f1/raw/races.csv
-rw-r--r--   1 cluster supergroup    1721961 2025-04-27 20:55 /f1/raw/results.csv
-rw-r--r--   1 cluster supergroup    

# Convert CSVs to Parquet

In [7]:
# Create the HDFS output directory for the Parquet datasets
# (-p: no error if it already exists).
!hdfs dfs -mkdir -p /f1/data

In [8]:
# Convert every CSV file found in RAW_DIR into a Parquet dataset with the same
# base name under PARQUET_DIR, overwriting any dataset already at that path.
RAW_DIR = "/f1/raw"
PARQUET_DIR = "/f1/data"
CSV_SUFFIX = ".csv"

# Reach the Hadoop FileSystem API through Spark's JVM gateway to list RAW_DIR.
hadoop = spark._jvm.org.apache.hadoop
fs = hadoop.fs.FileSystem
conf = hadoop.conf.Configuration()
path = hadoop.fs.Path(RAW_DIR)
for f in fs.get(conf).listStatus(path):
    # getPath().getName() belongs to the generic FileStatus/Path API;
    # getLocalName() is not part of the base FileStatus class, so relying on it
    # ties the loop to HDFS-specific status objects.
    csv_filename = f.getPath().getName()
    if not csv_filename.endswith(CSV_SUFFIX):
        continue
    # Swap only the trailing extension; str.replace would also rewrite any
    # ".csv" occurring earlier in the file name.
    parquet_filename = csv_filename[:-len(CSV_SUFFIX)] + ".parquet"
    # header=True: first row holds column names; inferSchema=True: let Spark
    # derive column types from the data instead of reading everything as string.
    df = spark.read.csv(f"{RAW_DIR}/{csv_filename}", header=True, inferSchema=True)
    df.write.mode('overwrite').parquet(f"{PARQUET_DIR}/{parquet_filename}")

                                                                                

In [9]:
# List /f1/raw again now that the conversion cell has run.
!hdfs dfs -ls /f1/raw

Found 14 items
-rw-r--r--   1 cluster supergroup      10104 2025-04-27 20:55 /f1/raw/circuits.csv
-rw-r--r--   1 cluster supergroup     219365 2025-04-27 20:55 /f1/raw/constructor_results.csv
-rw-r--r--   1 cluster supergroup     317206 2025-04-27 20:55 /f1/raw/constructor_standings.csv
-rw-r--r--   1 cluster supergroup      17478 2025-04-27 20:55 /f1/raw/constructors.csv
-rw-r--r--   1 cluster supergroup     883771 2025-04-27 20:55 /f1/raw/driver_standings.csv
-rw-r--r--   1 cluster supergroup      94367 2025-04-27 20:55 /f1/raw/drivers.csv
-rw-r--r--   1 cluster supergroup   17622395 2025-04-27 20:55 /f1/raw/lap_times.csv
-rw-r--r--   1 cluster supergroup     443719 2025-04-27 20:55 /f1/raw/pit_stops.csv
-rw-r--r--   1 cluster supergroup     465231 2025-04-27 20:55 /f1/raw/qualifying.csv
-rw-r--r--   1 cluster supergroup     164344 2025-04-27 20:55 /f1/raw/races.csv
-rw-r--r--   1 cluster supergroup    1721961 2025-04-27 20:55 /f1/raw/results.csv
-rw-r--r--   1 cluster supergroup    

In [10]:
# List the contents of each dataset directory under /f1/data.
# The pattern is quoted so the local shell passes it through unexpanded and
# HDFS resolves the glob, rather than the shell matching it against the local
# filesystem first.
!hdfs dfs -ls '/f1/data/**'

Found 2 items
-rw-r--r--   2 cluster supergroup          0 2025-04-27 20:55 /f1/data/circuits.parquet/_SUCCESS
-rw-r--r--   2 cluster supergroup       9248 2025-04-27 20:55 /f1/data/circuits.parquet/part-00000-8b924ea1-b33f-4184-8526-163dfdc799eb-c000.snappy.parquet
Found 2 items
-rw-r--r--   2 cluster supergroup          0 2025-04-27 17:08 /f1/data/cleaned_circuits.parquet/_SUCCESS
-rw-r--r--   2 cluster supergroup       6904 2025-04-27 17:08 /f1/data/cleaned_circuits.parquet/part-00000-99e32ab9-9e0f-42b2-a6d3-42d4ef687d18-c000.snappy.parquet
Found 2 items
-rw-r--r--   2 cluster supergroup          0 2025-04-27 17:30 /f1/data/cleaned_constructor_results.parquet/_SUCCESS
-rw-r--r--   2 cluster supergroup      80657 2025-04-27 17:30 /f1/data/cleaned_constructor_results.parquet/part-00000-23998559-62d1-4e57-baa4-d4d192fefdfc-c000.snappy.parquet
Found 2 items
-rw-r--r--   2 cluster supergroup          0 2025-04-27 17:33 /f1/data/cleaned_constructor_standings.parquet/_SUCCESS
-rw-r--r--   