In [1]:
from pyspark.sql.functions import col, split, concat_ws, when, size, min, trim, round, mean, lit, max

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [28]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("TrackTimePrediction") \
    .getOrCreate()

spark

25/06/14 12:24:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [27]:
spark.stop()

Machine learning time

In [29]:
file_path1 = "./final_database/merged_five_thresh.csv"
df = spark.read.csv(file_path1, header=True, inferSchema=True)
df = df.filter(col("PR_seconds") >= 700)
print(df.count())
df.show(1)

3675
+--------------+---+----------+--------------------+-----+-----+------+-----+-----+-----------------+-----------------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+-------+-----------------+
|  athlete_name|_c0|Unnamed: 0|      Athlete/School|Grade|State|Gender|FR_5k|SO_5k|            JR_5k|            SR_5k|FR_3200|SO_3200|JR_3200|SR_3200|FR_1600|SO_1600|JR_1600|SR_1600|FR_800|SO_800|JR_800|SR_800|     id|       PR_seconds|
+--------------+---+----------+--------------------+-----+-----+------+-----+-----+-----------------+-----------------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+-------+-----------------+
|Aakersh Mathur| 16|        16|Aakersh Mathur Mo...| 2021|   CA|  Boys| NULL| NULL|961.5166666666668|952.7333333333336|   NULL|  600.4|   NULL|  559.4|   NULL|   NULL|  269.0|  260.7|  NULL|  NULL|  NULL| 123.9|8073383|919.0499992370605|
+--------------+---+----------+------------

To deal with NULLs, will try imputing with an indicator column, and simply setting the NULLs to a very large value. The first is more robust, but I worry about setting nulls to the mean since I am hoping to capture trends

In [30]:
df = df.fillna(9999999, subset=['FR_5k', 'SO_5k', 'JR_5k', 'SR_5k', 'FR_3200', 'SO_3200', 'JR_3200', 'SR_3200', 'FR_1600', 'SO_1600', 'JR_1600', 'SR_1600', 'FR_800', 'SO_800', 'JR_800', 'SR_800'])
df.show(5)

+----------------+---+----------+--------------------+-----+-----+------+---------+-----------------+-----------------+------------------+---------+---------+---------+-------+---------+---------+---------+-------+---------+---------+---------+---------+-------+------------------+
|    athlete_name|_c0|Unnamed: 0|      Athlete/School|Grade|State|Gender|    FR_5k|            SO_5k|            JR_5k|             SR_5k|  FR_3200|  SO_3200|  JR_3200|SR_3200|  FR_1600|  SO_1600|  JR_1600|SR_1600|   FR_800|   SO_800|   JR_800|   SR_800|     id|        PR_seconds|
+----------------+---+----------+--------------------+-----+-----+------+---------+-----------------+-----------------+------------------+---------+---------+---------+-------+---------+---------+---------+-------+---------+---------+---------+---------+-------+------------------+
|  Aakersh Mathur| 16|        16|Aakersh Mathur Mo...| 2021|   CA|  Boys|9999999.0|        9999999.0|961.5166666666668| 952.7333333333336|9999999.0|    60

In [31]:
feature_cols = ['FR_5k', 'SO_5k', 'JR_5k', 'SR_5k', 'FR_3200', 'SO_3200', 'JR_3200', 'SR_3200', 'FR_1600', 'SO_1600', 'JR_1600', 'SR_1600', 'FR_800', 'SO_800', 'JR_800', 'SR_800']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid='keep')
model_df = assembler.transform(df)
model_df.select('features', 'PR_seconds').show(5, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|features                                                                                                                                                     |PR_seconds        |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|[9999999.0,9999999.0,961.5166666666668,952.7333333333336,9999999.0,600.4,9999999.0,559.4,9999999.0,9999999.0,269.0,260.7,9999999.0,9999999.0,9999999.0,123.9]|919.0499992370605 |
|[9999999.0,985.8,926.9,915.0166666666668,9999999.0,597.0,574.8,567.8,9999999.0,9999999.0,267.6,264.6,9999999.0,9999999.0,9999999.0,9999999.0]                |873.5900001525879 |
|[9999999.0,9999999.0,9999999.0,956.8666666666668,9999999.0,615.7,602.8,564.7,9999999.0,9999999.0,9999999

In [32]:
train_data, test_data = model_df.randomSplit(weights=[0.8, 0.2], seed=27)
print(f"Training Dataset Count: {train_data.count()}")
print(f"Test Dataset Count: {test_data.count()}")

Training Dataset Count: 2914
Test Dataset Count: 761


In [33]:
gbt = GBTRegressor(featuresCol='features', labelCol='PR_seconds')
print("Training the GBT model...")
gbt_model = gbt.fit(train_data)
print("Training complete.")

Training the GBT model...
Training complete.


In [34]:
predictions = gbt_model.transform(test_data)
predictions.select("PR_seconds", "prediction").show(10)

+-----------------+-----------------+
|       PR_seconds|       prediction|
+-----------------+-----------------+
|917.2800006866455|929.2156087794947|
|919.0499992370605|899.3989735109267|
|909.7799997329712|911.3758098588826|
|940.8600006103516|922.3412881672614|
|953.8800010681152|937.4624097182023|
|955.7400016784668|977.1422265497764|
|876.2299995422363|874.8827581131434|
|859.1499996185303|873.0455639779069|
|898.8899993896484|880.6333919090546|
|893.9300003051758| 947.107842502953|
+-----------------+-----------------+
only showing top 10 rows


In [35]:
evaluator = RegressionEvaluator(
    labelCol="PR_seconds",
    predictionCol="prediction",
    metricName="rmse"  
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse:.2f} seconds")
r2_evaluator = evaluator.setMetricName("r2")
r2 = r2_evaluator.evaluate(predictions)
print(f"R-squared (R2) on test data = {r2:.2f}")


Root Mean Squared Error (RMSE) on test data = 46.91 seconds
R-squared (R2) on test data = 0.27


This model sucks. It is on average 62.94 seconds off. Will try to impute with indicators next. Then add more features like progression, but realistically, I just don't have enough data yet to make a broad enough model to account for all the factors at play. Will also try using the dataset with a lower threshold. (After removing the 100s 5k times, RMSE=49.50 seconds)

Using 5 thresh, RMSE = 46.91

In [20]:
max_val = df.select(max(col("SR_800"))).first()[0]
df = df.withColumn(
    "SR_800_was_missing",
    when(col("SR_800").isNull(), 1).otherwise(0)
)
df = df.fillna(max_val, subset=["SR_800"])
df.show(1)

+-------------+-----+----------+--------------------+-----+-----+------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+-------+-----------------+-----------------+-----------------+-----------------+-----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+
| athlete_name|  _c0|Unnamed: 0|      Athlete/School|Grade|State|Gender| FR_5k| SO_5k| JR_5k| SR_5k|FR_3200|SO_3200|JR_3200|SR_3200|FR_1600|SO_1600|JR_1600|SR_1600|FR_800|SO_800|JR_800|SR_800|     id|       PR_seconds|FR_5k_was_missing|SO_5k_was_missing|JR_5k_was_missing|SR_5k_was_missing|FR_3200_was_missing|SO_3200_was_missing|JR_3200_was_missing|SR_3200_was_missing|FR_1600_was_missing|SO_1600_was_missing|JR_1600_was_missing|SR_1600_was_missing|FR_800_was_missing|SO_8

In [26]:
output_path = './final_database'
df.coalesce(1).write \
    .option('header', 'true') \
    .mode('append') \
    .csv(output_path) 

In [21]:
feature_cols = ['FR_5k', 'SO_5k', 'JR_5k', 'SR_5k', 'FR_3200', 'SO_3200', 'JR_3200', 'SR_3200', 'FR_1600', 'SO_1600', 'JR_1600', 'SR_1600', 'FR_800', 'SO_800', 'JR_800', 'SR_800', 'FR_5k_was_missing', 'SO_5k_was_missing', 'JR_5k_was_missing', 'SR_5k_was_missing', 'FR_3200_was_missing', 'SO_3200_was_missing', 'JR_3200_was_missing', 'SR_3200_was_missing', 'FR_1600_was_missing', 'SO_1600_was_missing', 'JR_1600_was_missing', 'SR_1600_was_missing', 'FR_800_was_missing', 'SO_800_was_missing', 'JR_800_was_missing', 'SR_800_was_missing']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid='keep')
df = df.filter(col("PR_seconds") >= 700)
model_df = assembler.transform(df)
model_df.select('features', 'PR_seconds').show(5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|features                                                                                                                                                                                    |PR_seconds        |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|[1153.1,1223.6,1093.4,1037.4,782.5,683.8,707.1,623.9,331.5,303.4,294.6,287.2,144.1,138.5,138.9,126.5,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0]                       |932.5400009155273 |
|(32,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,29,31],[984.3,995.1,978.2,941.3,585.8,594.9,557.2,573.5,269.6,268.8,257.3,268.7,127.1,140.3,118.4,135.9,1.0,1.0])   

In [22]:
train_data, test_data = model_df.randomSplit(weights=[0.8, 0.2], seed=27)
print(f"Training Dataset Count: {train_data.count()}")
print(f"Test Dataset Count: {test_data.count()}")

Training Dataset Count: 1674
Test Dataset Count: 430


In [23]:
gbt = GBTRegressor(featuresCol='features', labelCol='PR_seconds')
print("Training the GBT model...")
gbt_model = gbt.fit(train_data)
print("Training complete.")

Training the GBT model...
Training complete.


In [24]:
predictions = gbt_model.transform(test_data)
predictions.select("PR_seconds", "prediction").show(20)

+-----------------+-----------------+
|       PR_seconds|       prediction|
+-----------------+-----------------+
|941.0400009155273|972.0293763635137|
|959.6399993896484| 954.321076293701|
|923.2700004577637|885.3536942310076|
|830.5099983215332|900.0437968598303|
|859.1499996185303|884.8728660098963|
|861.7199993133545|907.1250123096258|
|898.8899993896484|870.3778061449188|
|853.1199998855591|898.4769341416701|
| 953.560001373291|952.4025915209158|
|          1032.25|895.1001816088304|
|895.4799995422363|891.7858613886225|
|879.1500015258789|938.9552115966691|
|916.7800006866455|  916.76716748641|
|869.8500003814697|892.2175248938125|
|865.8600006103516|881.2351022432816|
|878.4000015258789| 856.853246470625|
|862.3500003814697|822.8055194531361|
|937.2700004577637|954.9042800982579|
|993.7700004577637|946.2786449783655|
|896.4000015258789|882.4368087909785|
+-----------------+-----------------+
only showing top 20 rows


25/06/14 11:59:41 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [25]:
evaluator = RegressionEvaluator(
    labelCol="PR_seconds",
    predictionCol="prediction",
    metricName="rmse"  
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse:.2f} seconds")
r2_evaluator = evaluator.setMetricName("r2")
r2 = r2_evaluator.evaluate(predictions)
print(f"R-squared (R2) on test data = {r2:.2f}")

Root Mean Squared Error (RMSE) on test data = 49.28 seconds
R-squared (R2) on test data = 0.19


Imputing with the mean and marking which rows were missing was worse: 74.15 seconds off on average. Will test the other data set next and add new features soon. (I accidentally had some 800m results as 5k times, new RMSE = 49.93). 49.28 using MAX instead of mean.