In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import rand, lead, when, col, year, avg, desc, min
from pyspark.sql.window import Window

In [2]:
# Build (or reuse) the Spark session for this notebook.
# - spark.jars.packages pulls in the PostgreSQL JDBC driver used by the JDBC read below.
# - the spark.driver.* options fix the driver's port, advertised host and bind address
#   (presumably so executors on the standalone cluster can reach it — confirm with the deployment).
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("F1_Lap_Prediction")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.20")
    .config("spark.driver.port", "7078")
    .config("spark.driver.host", "pyspark")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .getOrCreate()
)


In [3]:
# Use the public IPython.display API (IPython.core.display is an internal module) and
# import `display` explicitly instead of relying on the name the notebook kernel injects.
from IPython.display import HTML, display

# Stop <pre> blocks from wrapping so wide DataFrame.show() tables stay aligned.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [4]:
import os

# Connection settings for the PostgreSQL source.
# Each value can be overridden through an environment variable so real credentials do not
# have to live in the notebook; the fallbacks are the values this notebook used previously,
# so behaviour is unchanged when the variables are not set.
jdbc_url = os.environ.get("F1_JDBC_URL", "jdbc:postgresql://postgres:5432/mydatabase")
jdbc_properties = {
    "user": os.environ.get("F1_DB_USER", "admin"),
    "password": os.environ.get("F1_DB_PASSWORD", "admin_password"),
    "driver": "org.postgresql.Driver"
}
table_name = "f1_consolidated.features"

# Load the feature table over JDBC into a Spark DataFrame.
df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=jdbc_properties)

In [5]:

# Discard laps slower than 120% of the best lap for the same circuit, year and driver.
group_keys = ["circuit_key", "year_col", "driver_number"]

# 1. Derive 'year_col' from 'date' so laps can be grouped per year.
df = df.withColumn("year_col", year("date"))

# 2. Minimum 'current_lap_time' for every (circuit, year, driver) group.
#    NOTE: `min` here is pyspark.sql.functions.min (imported at the top), not the builtin.
min_lap_df = df.groupBy(*group_keys).agg(min("current_lap_time").alias("min_lap_time"))

# 3-5. Attach 'min_lap_time' to every row, keep laps at or below 1.20x that minimum,
#      then drop the helper columns that are no longer needed.
df = (
    df.join(min_lap_df, on=group_keys, how="inner")
      .filter(col("current_lap_time") <= 1.20 * col("min_lap_time"))
      .drop("min_lap_time", "year_col")
)

In [6]:
# Data cleaning: drop rows missing any of the timing columns, then drop rows whose
# current_tire is "TEST_UNKNOWN".
required_cols = ["current_lap_time", "previous_lap_time", "lap_time_delta", "accumulated_time"]
df = df.dropna(subset=required_cols)
df = df.where(col("current_tire") != "TEST_UNKNOWN")

In [7]:
# Add the "next lap" column, which is the value the model will predict.

# Window per session and driver, ordered by lap number.
windowSpec = Window.partitionBy("session_key", "driver_number").orderBy("lap_number")

# Values taken from the following row inside the window.
upcoming_tire_laps = lead("laps_on_current_tire").over(windowSpec)
upcoming_lap_time = lead("current_lap_time").over(windowSpec)

# next_lap_time is the following row's current_lap_time, but only when that row's
# laps_on_current_tire is exactly one more than this row's (i.e. same stint).
# With no otherwise() branch, when() yields null for every other row.
df = df.withColumn(
    "next_lap_time",
    when(upcoming_tire_laps == col("laps_on_current_tire") + 1, upcoming_lap_time),
)

# Keep only rows that have a next_lap_time (drops e.g. the last lap or a stint change).
df_filtered = df.where(col("next_lap_time").isNotNull())

# Show a few rows to check the calculation.
df_filtered.select(
    "session_key",
    "driver_number",
    "lap_number",
    "current_lap_time",
    "laps_on_current_tire",
    "next_lap_time",
).show(10)

+-----------+-------------+----------+--------------------+--------------------+--------------------+
|session_key|driver_number|lap_number|    current_lap_time|laps_on_current_tire|       next_lap_time|
+-----------+-------------+----------+--------------------+--------------------+--------------------+
|       7763|            1|        20|97.99600000000000...|                   2|97.47200000000000...|
|       7763|            1|        21|97.47200000000000...|                   3|97.98600000000000...|
|       7763|            1|        22|97.98600000000000...|                   4|97.91300000000000...|
|       7763|            1|        23|97.91300000000000...|                   5|97.89800000000000...|
|       7763|            1|        24|97.89800000000000...|                   6|97.87500000000000...|
|       7763|            1|        25|97.87500000000000...|                   7|98.01500000000000...|
|       7763|            1|        26|98.01500000000000...|                   8|97

In [8]:
# Index the "current_tire" string column so it can be included in the assembled vector.
indexer = StringIndexer(inputCol="current_tire", outputCol="current_tire_index")

In [9]:
# Fit the tire indexer on df_filtered (the full filtered data set, before the train/test split).
indexerModel = indexer.fit(df_filtered)

In [10]:
# Apply the fitted indexer; this adds the "current_tire_index" column.
indexed_df = indexerModel.transform(df_filtered)

In [11]:
# Print the first rows (show() defaults) to check the new index column.
indexed_df.show()

+-----------+-------------+--------------------+-----------+----------+--------------------+-------------------------+------------+--------------------+---------+--------------------+--------------------+--------------------+----------------+---------------------------+--------------------+--------------------+--------------------+-------------+------------------------+------------+--------------------+--------------------+------------------+
|circuit_key|driver_number|                date|session_key|lap_number|    current_lap_time|race_percentage_completed|current_tire|laps_on_current_tire|box_stops|   previous_lap_time|      lap_time_delta|    accumulated_time|position_in_race|time_difference_with_leader|       sector_1_time|       sector_2_time|       sector_3_time|box_stop_time|time_since_last_box_stop|session_type|          created_at|       next_lap_time|current_tire_index|
+-----------+-------------+--------------------+-----------+----------+--------------------+--------------

In [12]:
# One-hot encode the tire index into the vector column "current_tire_vec".
encoder = OneHotEncoder(inputCols=["current_tire_index"], outputCols=["current_tire_vec"])

In [13]:
# Fit the encoder on the indexed data, then apply it to the same DataFrame.
encoder_model = encoder.fit(indexed_df)
encoded_df = encoder_model.transform(indexed_df)

In [14]:
# Print the first rows without truncating cell contents, to check the encoded column.
encoded_df.show(truncate=False)

+-----------+-------------+-----------------------+-----------+----------+---------------------+-------------------------+------------+--------------------+---------+-----------------------+------------------------+------------------------+----------------+---------------------------+---------------------+---------------------+---------------------+-------------+------------------------+------------+--------------------------+----------------------+------------------+----------------+
|circuit_key|driver_number|date                   |session_key|lap_number|current_lap_time     |race_percentage_completed|current_tire|laps_on_current_tire|box_stops|previous_lap_time      |lap_time_delta          |accumulated_time        |position_in_race|time_difference_with_leader|sector_1_time        |sector_2_time        |sector_3_time        |box_stop_time|time_since_last_box_stop|session_type|created_at                |next_lap_time         |current_tire_index|current_tire_vec|
+-----------+-------

In [15]:
# Assemble the numeric inputs and the one-hot tire vector into a single "features" column.
assembler = VectorAssembler(
    inputCols=["previous_lap_time","current_lap_time", "lap_time_delta", "accumulated_time", "laps_on_current_tire", "box_stops", "current_tire_vec"],
    outputCol="features"
)
# The result is assigned back to encoded_df, so on a re-run of this cell the input would
# already contain "features" and transform() would fail because the output column exists.
# Dropping it first makes the cell safe to re-run; drop() is a no-op when the column is absent.
encoded_df = assembler.transform(encoded_df.drop("features"))

In [16]:
# Show a few rows to check the indexed, encoded and assembled columns.
encoded_df.select(
    "current_tire",
    "current_tire_index",
    "current_tire_vec",
    "features",
).show(10)

+------------+------------------+----------------+--------------------+
|current_tire|current_tire_index|current_tire_vec|            features|
+------------+------------------+----------------+--------------------+
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
|      MEDIUM|               1.0|   (5,[1],[1.0])|(11,[0,1,2,3,4,7]...|
+------------+------------------+----------------+--------------

In [17]:
# Split the data into train and test sets to guard against overfitting.
# The split is done per (circuit_key, session_key), so all laps of a session go to one side.

# 1. Extract the unique combinations.
unique_sessions_df = df_filtered.select("circuit_key", "session_key").distinct()

# 2. Randomly assign each (circuit_key, session_key) to train or test.
#    rand(seed=99) is used for reproducibility, but rand() is only deterministic for a fixed
#    row order and distinct() shuffles the rows. Without caching, the two filters below would
#    each recompute rand_val and could disagree, putting a session in both sets or in neither.
#    cache() pins a single assignment for both filters (best effort: evicted partitions are
#    recomputed).
unique_sessions_df = unique_sessions_df.withColumn("rand_val", rand(seed=99)).cache()

train_sessions = unique_sessions_df.filter("rand_val <= 0.8").select("circuit_key", "session_key")
test_sessions  = unique_sessions_df.filter("rand_val > 0.8").select("circuit_key", "session_key")

# 3. Join back to the feature rows to separate the data.
train_data = encoded_df.join(train_sessions, on=["circuit_key", "session_key"], how="inner")
test_data  = encoded_df.join(test_sessions,  on=["circuit_key", "session_key"], how="inner")

print("Train set count:", train_data.count())
print("Test set count: ", test_data.count())

Train set count: 47457
Test set count:  8916


In [18]:
# Linear regression from the assembled "features" vector to "next_lap_time";
# every other estimator parameter is left at its default.
lr = LinearRegression(
    labelCol="next_lap_time",
    featuresCol="features",
)

# Fit on the training sessions only.
lr_model = lr.fit(train_data)

In [19]:
# RMSE evaluator comparing the "prediction" column against the "next_lap_time" label.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="next_lap_time",
    predictionCol="prediction",
)

# Score the held-out sessions and report the error.
predictions = lr_model.transform(test_data)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 1.9977244201347477


In [23]:
# Inspect predictions for rows whose session_type is "Race".
race_preview_cols = [
    "session_key",
    "lap_number",
    "driver_number",
    "current_lap_time",
    "next_lap_time",
    "prediction",
    "features",
]
predictions.filter(col("session_type") == "Race").select(*race_preview_cols).show(10)


+-----------+----------+-------------+--------------------+--------------------+-----------------+--------------------+
|session_key|lap_number|driver_number|    current_lap_time|       next_lap_time|       prediction|            features|
+-----------+----------+-------------+--------------------+--------------------+-----------------+--------------------+
|       9110|         3|            1|77.76700000000000...|77.20300000000000...| 77.9971445561281|(11,[0,1,2,3,4,7]...|
|       9110|         4|            1|77.20300000000000...|77.37100000000000...|77.44727415967628|(11,[0,1,2,3,4,7]...|
|       9110|         5|            1|77.37100000000000...|77.60700000000000...|77.61802895745497|(11,[0,1,2,3,4,7]...|
|       9110|         6|            1|77.60700000000000...|77.56500000000000...|77.85685908818851|(11,[0,1,2,3,4,7]...|
|       9110|         9|            1|77.49500000000000...|77.35900000000000...|77.78847344861995|(11,[0,1,2,3,4,7]...|
|       9110|        10|            1|77

In [21]:
# Model persistence is currently disabled; the path below looks like a placeholder
# (TODO confirm the real destination before enabling).
# lr_model.save("path_to_save_model")
