In [1]:
import os

from IPython.display import HTML, display
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

In [2]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    # Cluster master URL and the application name registered with it.
    .master("spark://spark-master:7077")
    .appName("F1_Lap_Prediction")
    # PostgreSQL JDBC driver package, needed for the spark.read.jdbc call.
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.20")
    # Driver networking: fixed port, advertised host name, and bind address.
    .config("spark.driver.port", "7078")
    .config("spark.driver.host", "pyspark")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .getOrCreate()
)


In [3]:
# Inject a CSS rule that stops <pre> output from wrapping, so wide show()
# tables stay on one line per row. HTML and display come from the public
# IPython.display module (imported with the other imports in the first cell)
# rather than the internal IPython.core.display path.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [4]:
# JDBC connection settings for the PostgreSQL database that holds the features
# table. The URL and credentials are read from environment variables so real
# secrets do not have to be stored in the notebook; the fallback values are the
# ones this notebook originally used, so it still runs unchanged when the
# variables are not set.
jdbc_url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://postgres:5432/mydatabase")
jdbc_properties = {
    "user": os.environ.get("POSTGRES_USER", "admin"),
    "password": os.environ.get("POSTGRES_PASSWORD", "admin_password"),
    "driver": "org.postgresql.Driver",
}
table_name = "f1_consolidated.features"

# Load the whole table as a Spark DataFrame.
df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=jdbc_properties)

In [5]:
# Data cleaning: drop rows with a null in any of the columns listed below,
# then keep only rows whose current_tire is not "TEST_UNKNOWN".
required_columns = ["current_lap_time", "previous_lap_time", "lap_time_delta", "accumulated_time"]
df = df.dropna(subset=required_columns)
df = df.where(df["current_tire"] != "TEST_UNKNOWN")

In [6]:
# Index the "current_tire" column so it can be included in the assembled vector.
indexer = (
    StringIndexer()
    .setInputCol("current_tire")
    .setOutputCol("current_tire_index")
)

In [7]:
# Fit the indexer on the cleaned DataFrame to learn the tire label -> index mapping.
indexerModel = indexer.fit(dataset=df)

In [8]:
# Apply the fitted indexer, adding the "current_tire_index" column.
indexed_df = indexerModel.transform(dataset=df)

In [9]:
# Preview the indexed rows (first 20, long values truncated).
indexed_df.show(n=20, truncate=True)

+--------------------+-----------+-----------+-------------+----------+--------------------+-------------------------+------------+--------------------+---------+--------------------+--------------------+--------------------+----------------+---------------------------+--------------------+--------------------+--------------------+-------------+------------------------+------------+--------------------+------------------+
|                date|circuit_key|session_key|driver_number|lap_number|    current_lap_time|race_percentage_completed|current_tire|laps_on_current_tire|box_stops|   previous_lap_time|      lap_time_delta|    accumulated_time|position_in_race|time_difference_with_leader|       sector_1_time|       sector_2_time|       sector_3_time|box_stop_time|time_since_last_box_stop|session_type|          created_at|current_tire_index|
+--------------------+-----------+-----------+-------------+----------+--------------------+-------------------------+------------+-----------------

In [10]:
# One-hot encode the tire index into the "current_tire_vec" vector column.
tire_index_cols = ["current_tire_index"]
tire_vector_cols = ["current_tire_vec"]
encoder = OneHotEncoder(inputCols=tire_index_cols, outputCols=tire_vector_cols)

In [11]:
# Fit the encoder on the indexed data, then apply it to the same data.
encoder_model = encoder.fit(indexed_df)
encoded_df = encoder_model.transform(indexed_df)

In [12]:
# Preview the encoded rows (first 20) without truncating cell contents.
encoded_df.show(n=20, truncate=False)

+-----------------------+-----------+-----------+-------------+----------+-----------------------+-------------------------+------------+--------------------+---------+-----------------------+------------------------+-----------------------+----------------+---------------------------+-----------------------+-----------------------+-----------------------+-------------+------------------------+------------+--------------------------+------------------+----------------+
|date                   |circuit_key|session_key|driver_number|lap_number|current_lap_time       |race_percentage_completed|current_tire|laps_on_current_tire|box_stops|previous_lap_time      |lap_time_delta          |accumulated_time       |position_in_race|time_difference_with_leader|sector_1_time          |sector_2_time          |sector_3_time          |box_stop_time|time_since_last_box_stop|session_type|created_at                |current_tire_index|current_tire_vec|
+-----------------------+-----------+-----------+---

In [13]:
# Combine the numeric predictors and the one-hot tire vector into a single
# "features" vector column for the regression model.
# NOTE(review): the RMSE reported further down (~2e-4 on lap times around 100)
# is suspiciously low. Presumably lap_time_delta is derived from the label
# (current_lap_time - previous_lap_time), which would leak the target into the
# features; confirm against the table definition before trusting the metric.
assembler = VectorAssembler(
    inputCols=["previous_lap_time", "lap_time_delta", "accumulated_time", "laps_on_current_tire", "box_stops", "current_tire_vec"],
    outputCol="features"
)
# Drop any "features" column left over from a previous run of this cell
# (a no-op the first time). Without this, re-running the cell fails because
# the assembler's output column already exists.
encoded_df = assembler.transform(encoded_df.drop("features"))

In [14]:
# Show a few rows to verify the tire encoding and the assembled vector.
preview_columns = ["current_tire", "current_tire_index", "current_tire_vec", "features"]
encoded_df.select(preview_columns).show(n=10)

+------------+------------------+----------------+--------------------+
|current_tire|current_tire_index|current_tire_vec|            features|
+------------+------------------+----------------+--------------------+
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
|        SOFT|               1.0|   (5,[1],[1.0])|(10,[0,1,2,3,6],[...|
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
|      MEDIUM|               0.0|   (5,[0],[1.0])|(10,[0,1,2,3,5],[...|
+------------+------------------+----------------+--------------

In [15]:
# Random 80/20 train/test split with a fixed seed.
train_data, test_data = encoded_df.randomSplit(weights=[0.8, 0.2], seed=99)


In [16]:
# Linear regression predicting "current_lap_time" from the "features" vector,
# fitted on the training split.
lr = LinearRegression().setFeaturesCol("features").setLabelCol("current_lap_time")
lr_model = lr.fit(dataset=train_data)

In [17]:
# Score the test split with the fitted model.
predictions = lr_model.transform(test_data)

# Compare the "prediction" column against the label using RMSE.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="current_lap_time", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.00020439707673612013


In [18]:
# Show label, prediction and feature vector side by side for the first 10 rows.
comparison_columns = ["current_lap_time", "prediction", "features"]
predictions.select(comparison_columns).show(n=10)


+--------------------+------------------+--------------------+
|    current_lap_time|        prediction|            features|
+--------------------+------------------+--------------------+
|127.8930000000000...|127.89299703797752|(10,[0,1,2,3,6],[...|
|98.32300000000000...| 98.32300029280648|(10,[0,1,2,3,7],[...|
|1678.930000000000...| 1678.927895660881|(10,[0,1,2,3,6],[...|
|98.77400000000000...| 98.77404399518154|(10,[0,1,2,3,5],[...|
|98.65100000000000...| 98.65100288589494|(10,[0,1,2,3,7],[...|
|125.2620000000000...| 125.2619610229797|(10,[0,1,2,3,9],[...|
|96.82000000000000...| 96.81999808591506|(10,[0,1,2,3,5],[...|
|94.89500000000000...| 94.89499488437501|(10,[0,1,2,3,5],[...|
|817.3780000000000...| 817.3779733580088|(10,[0,1,2,3,7],[...|
|98.58200000000000...| 98.58199898870245|(10,[0,1,2,3,5],[...|
+--------------------+------------------+--------------------+
only showing top 10 rows



In [None]:
# Persist the fitted model. write().overwrite() replaces an existing model at
# the target path, so this cell can be re-run; a plain save() raises when the
# path already exists.
# NOTE(review): "path_to_save_model" looks like a placeholder; confirm the
# intended destination.
lr_model.write().overwrite().save("path_to_save_model")
