In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, trim, lower, when, coalesce, first, last,
    concat_ws, concat, udf, collect_list, array_join, explode,
    min as spark_min, max as spark_max, expr
)
from pyspark.sql.types import *
import pyspark.sql.functions as F

# ========== 1. Start spark ==========
# getOrCreate() hands back the already-active session if there is one,
# otherwise it builds a new session with the app name below.
spark = (
    SparkSession.builder
    .appName("F1 Pitstop Pipeline 2018-2024")
    .getOrCreate()
)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/21 14:58:54 WARN Utils: Your hostname, Divyanshs-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.33.74.20 instead (on interface en0)
25/11/21 14:58:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/21 14:58:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
conda install -c conda-forge openjdk=17

[1;33mJupyter detected[0m[1;33m...[0m
[1;32m2[0m[1;32m channel Terms of Service accepted[0m
Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
doneecting package metadata (repodata.json): - 
doneing environment: / 


    current version: 25.7.0
    latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [3]:
# ========== 2. Paths (update to your repo / S3 / HDFS) ==========
# Root folders, both relative to the current working directory.
RAW_DIR = "./data/raw"        # raw input CSVs are read from here
OUT_DIR = "./data/processed"  # final outputs are written here

# Raw event files, one constant per CSV under RAW_DIR.
PIT1_PATH = RAW_DIR + "/pitstop_1st.csv"
PIT2_PATH = RAW_DIR + "/pitstop_2nd.csv"
SC_PATH = RAW_DIR + "/safety_cars.csv"
RF_PATH = RAW_DIR + "/red_flags.csv"

# Optional lookup tables (download from Ergast/Kaggle or supply your own).
# races.csv:   maps raceId -> Season, Round, RaceName, Date
# drivers.csv: maps driverId -> driverName, abbreviation
RACES_PATH = RAW_DIR + "/races.csv"
DRIVERS_PATH = RAW_DIR + "/drivers.csv"

Next, we check that every input file exists at its expected path before loading anything.

In [4]:
import os

# Sanity check before loading: show where the notebook is running (all paths
# above are relative to it) and whether each expected input file is present.
print("Current working directory:", os.getcwd())
print("\nChecking file existence:")
files_to_check = {
    "Pitstops 1st": PIT1_PATH,
    "Pitstops 2nd": PIT2_PATH,
    "Safety Car": SC_PATH,
    "Red Flag": RF_PATH,
    "Races Lookup": RACES_PATH,
    "Drivers Lookup": DRIVERS_PATH
}

for name, path in files_to_check.items():
    # os.path.exists only inspects the local filesystem, so this check is
    # meaningful for local paths only (not S3 / HDFS URIs).
    exists = os.path.exists(path)
    print(f"{'yes' if exists else 'no'} {name}: {path}")

# Also list the raw directory so unexpected or misnamed files are easy to spot.
print(f"\nContents of {RAW_DIR}:")
try:
    # os.listdir returns entries in arbitrary order; sort them so the output
    # is stable between runs and machines.
    for item in sorted(os.listdir(RAW_DIR)):
        print(f"  - {item}")
except FileNotFoundError:
    print(f"Directory {RAW_DIR} does not exist!")
except OSError as e:
    # e.g. RAW_DIR is a regular file, or permission is denied.
    print(f"Error listing directory: {e}")

Current working directory: /Users/divyanshdoshi/Documents/GitHub/Cloud_Formula1_Data_Cleaning_Pipeline

Checking file existence:
yes Pitstops 1st: ./data/raw/pitstop_1st.csv
yes Pitstops 2nd: ./data/raw/pitstop_2nd.csv
yes Safety Car: ./data/raw/safety_cars.csv
yes Red Flag: ./data/raw/red_flags.csv
yes Races Lookup: ./data/raw/races.csv
yes Drivers Lookup: ./data/raw/drivers.csv

Contents of ./data/raw:
  - pitstop_1st.csv
  - safety_cars.csv
  - drivers.csv
  - red_flags.csv
  - races.csv
  - pitstop_2nd.csv


We can see that all the dataset files are present, so we can now start cleaning and joining the datasets.

The main aim of this pipeline is to produce clean pit stop data. The raw dataset has several problems, such as inconsistent or mis-encoded names and missing values in some places, which we will tackle using Spark.
Our final aim is to use this data to predict better pit stop strategies. For that, we also need to know when a pit stop was made under a safety car, because a safety car can change the whole dynamic of a race and force a complete change of strategy.

In [5]:

# No schema is supplied and inferSchema is left at its default (off), so Spark
# reads every CSV column as a string.
# multiLine=True lets quoted fields span several lines; escape='"' makes a
# doubled quote inside a quoted field read as a literal quote.
pit2 = spark.read.options(header=True, multiLine=True, escape='"').csv(PIT2_PATH)
pit1 = spark.read.options(header=True, multiLine=True, escape='"').csv(PIT1_PATH)
safety = spark.read.options(header=True).csv(SC_PATH)
redflag = spark.read.options(header=True).csv(RF_PATH)


def _read_optional_lookup(label, path):
    """Read an optional lookup CSV with a header row.

    Args:
        label: Short name used in the message printed on failure.
        path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or None if the read raised an exception
        (the reason is printed rather than re-raised).
    """
    try:
        return spark.read.options(header=True).csv(path)
    except Exception as exc:  # deliberate best-effort: lookups are optional
        print(f"Lookup '{label}' not loaded from {path}: {exc}")
        return None


# Optional lookups (if present). Each one is read independently so that a
# problem with one file does not prevent the other from being loaded.
races_df = _read_optional_lookup("races", RACES_PATH)
drivers_df = _read_optional_lookup("drivers", DRIVERS_PATH)
if races_df is None or drivers_df is None:
    print("Lookups not provided yet; pipeline will attempt fuzzy joins or require you to add lookups.")


This code loads your raw data files (pit stops, safety car, red flag data, races, drivers) into Spark DataFrames so we can process them.



In [6]:
def basic_clean(df, col_names):
    """Trim whitespace in the given columns and turn empty strings into nulls.

    Args:
        df: Spark DataFrame to clean.
        col_names: Iterable of column names to clean. Names that are not in
            ``df.columns`` are skipped.

    Returns:
        A DataFrame with the same columns as ``df`` in which every selected
        column has been trimmed and any value that is empty after trimming
        has been replaced by null. ``df`` itself is returned unchanged when
        no requested column is present.
    """
    present = set(df.columns)
    cleaned = {}
    for c in col_names:
        if c in present:
            trimmed = trim(col(c))
            cleaned[c] = when(trimmed == "", None).otherwise(trimmed)

    if not cleaned:
        return df

    # Apply all replacements in a single projection. Calling withColumn once
    # (or twice) per column in a loop adds a projection each time and can
    # produce very large query plans on wide tables.
    # withColumns requires Spark 3.3+.
    return df.withColumns(cleaned)

# Run the same trim / empty-string-to-null pass over every column of each
# raw DataFrame, rebinding each name to its cleaned version.
pit2, pit1, safety, redflag = (
    basic_clean(frame, frame.columns)
    for frame in (pit2, pit1, safety, redflag)
)

# Exact mis-encoded (mojibake) driver-name values mapped to the correct spelling.
encoding_fixes = {
    "Kimi R√É∆í√Ç¬§ikk√É∆í√Ç¬∂nen": "Kimi Räikkönen",
    "RÃ¤ikkÃ¶nen": "Kimi Räikkönen"
}
# UDF form of the same lookup (null stays null, unknown values pass through).
# It is no longer applied below, but the name is kept defined in case other
# cells reference it.
fix_udf = F.udf(lambda s: encoding_fixes.get(s, s) if s is not None else None, StringType())

# DataFrame.replace performs the same exact-value substitution natively, without
# sending every row through a Python UDF; values that are not keys of the
# mapping (including nulls) are left untouched.
if "Driver" in pit2.columns:
    pit2 = pit2.replace(encoding_fixes, subset=["Driver"])
# NOTE(review): pit1 is assumed to carry the same "Driver" column and the same
# encoding problem; the guard makes this a no-op if the column is absent.
if "Driver" in pit1.columns:
    pit1 = pit1.replace(encoding_fixes, subset=["Driver"])

