In [1]:
from pyspark.sql.functions import col, split, concat_ws, when, size, min, trim, round, mean, lit

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
import findspark
# Run findspark before importing SparkSession so the local Spark install is located first.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session on all available cores; getOrCreate() hands back a live
# session if one already exists instead of building a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TrackTimePrediction")
    .getOrCreate()
)

spark

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/13 18:00:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# NOTE(review): this stops the SparkSession built above. The cells below call
# spark.read.csv(...), so run this only once the rest of the notebook is finished.
spark.stop()

Machine learning time

In [34]:
# Read the merged results CSV (header row, inferred column types) and keep only
# rows whose PR_seconds is at least 700.
file_path1 = "./final_database/merged_half_thresh.csv"
df = (
    spark.read
    .csv(file_path1, header=True, inferSchema=True)
    .where(col("PR_seconds") >= 700)
)
print(df.count())
df.show(1)

2104
+-------------+-----+----------+--------------------+-----+-----+------+-----+-----+------+------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+-------+-----------------+
| athlete_name|  _c0|Unnamed: 0|      Athlete/School|Grade|State|Gender|FR_5k|SO_5k| JR_5k| SR_5k|FR_3200|SO_3200|JR_3200|SR_3200|FR_1600|SO_1600|JR_1600|SR_1600|FR_800|SO_800|JR_800|SR_800|     id|       PR_seconds|
+-------------+-----+----------+--------------------+-----+-----+------+-----+-----+------+------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+-------+-----------------+
|Camron Gaddis|66230|      3024|Camron Gaddis Col...| 2022|   GA|  Boys| NULL| NULL|1093.4|1037.4|   NULL|  683.8|   NULL|  623.9|   NULL|  303.4|  294.6|  287.2|  NULL| 138.5|  NULL| 126.5|8265308|932.5400009155273|
+-------------+-----+----------+--------------------+-----+-----+------+-----+-----+------+------+-------+-------+-------+-----

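To deal with NULLs, I will try two approaches: (1) imputing each missing value with the column mean and adding an indicator column that marks which values were missing, and (2) simply setting the NULLs to a very large value. The first is more robust, but I worry that setting NULLs to the mean will mask the trends I am hoping to capture.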

In [35]:
# Sentinel imputation: overwrite every NULL race time with one very large value.
# The 16 columns are named <grade>_<distance>; the comprehension emits them
# distance by distance (5k, 3200, 1600, 800), each in FR/SO/JR/SR order.
df = df.fillna(
    9999999,
    subset=[
        f"{grade}_{distance}"
        for distance in ("5k", "3200", "1600", "800")
        for grade in ("FR", "SO", "JR", "SR")
    ],
)
df.show(5)

+-------------+-----+----------+--------------------+-----+-----+------+---------+---------+-----------------+-----------------+---------+-------+---------+-------+---------+---------+---------+-------+---------+---------+---------+---------+-------+------------------+
| athlete_name|  _c0|Unnamed: 0|      Athlete/School|Grade|State|Gender|    FR_5k|    SO_5k|            JR_5k|            SR_5k|  FR_3200|SO_3200|  JR_3200|SR_3200|  FR_1600|  SO_1600|  JR_1600|SR_1600|   FR_800|   SO_800|   JR_800|   SR_800|     id|        PR_seconds|
+-------------+-----+----------+--------------------+-----+-----+------+---------+---------+-----------------+-----------------+---------+-------+---------+-------+---------+---------+---------+-------+---------+---------+---------+---------+-------+------------------+
|Camron Gaddis|66230|      3024|Camron Gaddis Col...| 2022|   GA|  Boys|9999999.0|9999999.0|           1093.4|           1037.4|9999999.0|  683.8|9999999.0|  623.9|9999999.0|    303.4|    29

In [36]:
# Features are the 16 per-grade race-time columns (<grade>_<distance>),
# ordered 5k, 3200, 1600, 800 and FR/SO/JR/SR within each distance.
feature_cols = [
    f"{grade}_{distance}"
    for distance in ("5k", "3200", "1600", "800")
    for grade in ("FR", "SO", "JR", "SR")
]
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="keep",
)
model_df = assembler.transform(df)
model_df.select("features", "PR_seconds").show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|features                                                                                                                                                   |PR_seconds        |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|[9999999.0,9999999.0,1093.4,1037.4,9999999.0,683.8,9999999.0,623.9,9999999.0,303.4,294.6,287.2,9999999.0,138.5,9999999.0,126.5]                            |932.5400009155273 |
|[984.3,995.1,978.2,941.3,585.8,594.9,557.2,573.5,269.6,268.8,257.3,268.7,127.1,9999999.0,118.4,9999999.0]                                                  |872.3899993896484 |
|[9999999.0,1012.77,956.8666666666668,933.9266666666666,638.2,589.3,575.9,567.9,9999999.0,9999999.0,9999999.0,262.2

In [37]:
# 80/20 train/test partition of the assembled frame, using a fixed seed (27).
train_data, test_data = model_df.randomSplit([0.8, 0.2], seed=27)
print(f"Training Dataset Count: {train_data.count()}")
print(f"Test Dataset Count: {test_data.count()}")

Training Dataset Count: 1674
Test Dataset Count: 430


In [38]:
# Gradient-boosted tree regressor predicting PR_seconds from the assembled
# feature vector; every other hyperparameter is left at its default.
gbt = GBTRegressor(labelCol="PR_seconds", featuresCol="features")
print("Training the GBT model...")
gbt_model = gbt.fit(train_data)
print("Training complete.")

Training the GBT model...
Training complete.


In [39]:
# Score the held-out split and show actual vs. predicted PR times side by side.
predictions = gbt_model.transform(test_data)
predictions.select(col("PR_seconds"), col("prediction")).show(10)

+-----------------+-----------------+
|       PR_seconds|       prediction|
+-----------------+-----------------+
|941.0400009155273|972.2497672368311|
|959.6399993896484|954.3213327322514|
|923.2700004577637|883.7600439040624|
|830.5099983215332|900.0275340992043|
|859.1499996185303|884.8728660098963|
|861.7199993133545|907.1132864650324|
|898.8899993896484|869.9006605862978|
|853.1199998855591|898.4769341416701|
| 953.560001373291|952.4027269978859|
|          1032.25|895.0839188482045|
+-----------------+-----------------+
only showing top 10 rows


In [40]:
# One evaluator per metric. setMetricName() mutates the evaluator in place and
# returns that same object, so deriving the R2 evaluator from `evaluator` would
# leave `evaluator` silently reporting R2 (not RMSE) once this cell has run.
evaluator = RegressionEvaluator(
    labelCol="PR_seconds",
    predictionCol="prediction",
    metricName="rmse",
)
r2_evaluator = RegressionEvaluator(
    labelCol="PR_seconds",
    predictionCol="prediction",
    metricName="r2",
)

rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse:.2f} seconds")
r2 = r2_evaluator.evaluate(predictions)
print(f"R-squared (R2) on test data = {r2:.2f}")


Root Mean Squared Error (RMSE) on test data = 49.50 seconds
R-squared (R2) on test data = 0.18


This model performs poorly: in the first run it was off by 62.94 seconds (RMSE). Next I will try imputing with indicator columns, and then add more features such as progression. Realistically, though, I don't yet have enough data to build a model broad enough to account for all the factors at play. I will also try the dataset with a lower threshold. (After removing the 100s 5k times, RMSE = 49.50 seconds, which is the value printed above.)

In [20]:
# Mean-impute every race-time column and record which values were originally missing.
#
# Start again from the raw CSV: by this point `df` has already had its NULLs
# replaced with the 9999999 sentinel, which would make every indicator 0 and
# inflate every mean. The indicator-based feature list further down needs a
# `<column>_was_missing` flag for all 16 race columns, not only SR_800, so all
# of them are built here.
race_cols = [
    f"{grade}_{distance}"
    for distance in ("5k", "3200", "1600", "800")
    for grade in ("FR", "SO", "JR", "SR")
]
df = (
    spark.read
    .csv(file_path1, header=True, inferSchema=True)
    .filter(col("PR_seconds") >= 700)  # same row filter as the initial load
)

# Compute all column means in a single aggregation; mean() skips NULLs.
col_means = df.select([mean(col(c)).alias(c) for c in race_cols]).first().asDict()
mean_val = col_means["SR_800"]  # kept so any later use of `mean_val` still works

# Derive the flags while the NULLs are still present, i.e. before fillna.
# Appending them after "*" keeps the original columns first.
df = df.select(
    "*",
    *[
        when(col(c).isNull(), 1).otherwise(0).alias(f"{c}_was_missing")
        for c in race_cols
    ],
)

# A column with no observed values has a None mean; skip it rather than hand
# None to fillna, which only accepts concrete replacement values.
df = df.fillna({c: m for c, m in col_means.items() if m is not None})
df.show(1)

+-------------+-----+----------+--------------------+-----+-----+------+------------------+----------------+------+------+-----------------+-------+-----------------+-------+-----------------+-------+-------+-------+-----------------+------+------------------+------+-------+-----------------+-----------------+-----------------+-----------------+-----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+
| athlete_name|  _c0|Unnamed: 0|      Athlete/School|Grade|State|Gender|             FR_5k|           SO_5k| JR_5k| SR_5k|          FR_3200|SO_3200|          JR_3200|SR_3200|          FR_1600|SO_1600|JR_1600|SR_1600|           FR_800|SO_800|            JR_800|SR_800|     id|       PR_seconds|FR_5k_was_missing|SO_5k_was_missing|JR_5k_was_missing|SR_5k_was_missing|FR_3200_was_missing|SO_3200_was_m

In [21]:
# Write the current frame as one CSV part file (with header) under ./final_database.
# NOTE(review): mode 'append' adds another part file on every run -- confirm
# that accumulating outputs in this directory is intended.
output_path = './final_database'
(
    df.coalesce(1)
    .write
    .option('header', 'true')
    .mode('append')
    .csv(output_path)
)

In [28]:
# Features: the 16 race-time columns followed by their 16 `_was_missing` flags,
# in the same column order.
feature_cols = [
    f"{grade}_{distance}"
    for distance in ("5k", "3200", "1600", "800")
    for grade in ("FR", "SO", "JR", "SR")
]
feature_cols = feature_cols + [f"{name}_was_missing" for name in feature_cols]

# Drop rows with PR_seconds below 700 before assembling the feature vector.
df = df.filter(col("PR_seconds") >= 700)
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="keep",
)
model_df = assembler.transform(df)
model_df.select("features", "PR_seconds").show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|features                                                                                                                                                                                                                                                                                        |PR_seconds        |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|[1020.0831104356615,998.493021612634,1093.4,1037.4,624.0475238095241,

In [29]:
# Same 80/20 partition and seed (27) as the sentinel-imputation run.
train_data, test_data = model_df.randomSplit([0.8, 0.2], seed=27)
print(f"Training Dataset Count: {train_data.count()}")
print(f"Test Dataset Count: {test_data.count()}")

Training Dataset Count: 1674
Test Dataset Count: 430


In [30]:
# Refit the default-configured GBT regressor, this time on the mean-imputed
# features plus missing-value indicators.
gbt = GBTRegressor(labelCol="PR_seconds", featuresCol="features")
print("Training the GBT model...")
gbt_model = gbt.fit(train_data)
print("Training complete.")

Training the GBT model...
Training complete.


In [31]:
# Score the held-out split and list the first 20 actual/predicted pairs.
predictions = gbt_model.transform(test_data)
predictions.select(col("PR_seconds"), col("prediction")).show(20)

+-----------------+-----------------+
|       PR_seconds|       prediction|
+-----------------+-----------------+
|941.0400009155273| 977.333144883867|
|959.6399993896484| 928.208166833752|
|923.2700004577637|893.8190946081899|
|830.5099983215332|860.7848673273243|
|859.1499996185303|889.2774269624757|
|861.7199993133545| 930.300700889342|
|898.8899993896484|874.7672352746899|
|853.1199998855591|903.1277896541882|
| 953.560001373291|934.2828194711383|
|          1032.25|898.5539524793979|
|895.4799995422363| 856.389278438852|
|879.1500015258789| 894.046618535767|
|916.7800006866455|952.2994359247733|
|869.8500003814697|901.9905071474188|
|865.8600006103516|894.1194575416987|
|878.4000015258789|865.9630206378258|
|862.3500003814697|818.4050534173716|
|937.2700004577637|931.6598153699335|
|993.7700004577637|934.7215854978413|
|896.4000015258789|880.9584785200562|
+-----------------+-----------------+
only showing top 20 rows


In [32]:
# One evaluator per metric. setMetricName() mutates the evaluator in place and
# returns that same object, so deriving the R2 evaluator from `evaluator` would
# leave `evaluator` silently reporting R2 (not RMSE) once this cell has run.
evaluator = RegressionEvaluator(
    labelCol="PR_seconds",
    predictionCol="prediction",
    metricName="rmse",
)
r2_evaluator = RegressionEvaluator(
    labelCol="PR_seconds",
    predictionCol="prediction",
    metricName="r2",
)

rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse:.2f} seconds")
r2 = r2_evaluator.evaluate(predictions)
print(f"R-squared (R2) on test data = {r2:.2f}")

Root Mean Squared Error (RMSE) on test data = 49.93 seconds
R-squared (R2) on test data = 0.16


Imputing with the mean and marking which values were missing was worse: an RMSE of 74.15 seconds in the first run. I will test the other dataset next and add new features soon. (I had accidentally included some 800m results as 5k times; after fixing that, the new RMSE is 49.93 seconds.)