In [85]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import expm1
from pyspark.sql.functions import log1p
from pyspark.sql.functions import monotonically_increasing_id
from xgboost.spark import SparkXGBRegressor

In [86]:
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .getOrCreate()
)

In [87]:
data = spark.read.parquet('../data/curated/')
data.columns

['hourly_timestamp',
 'PULocationID',
 'pickup_hour_of_day',
 'pickup_day_of_week',
 'pickup_month',
 'pickup_borough',
 'pickup_at_airport',
 'num_trips',
 'pickup_num_businesses',
 'temperature_2m',
 'relative_humidity_2m',
 'rain',
 'snowfall',
 'wind_speed_10m']

Since the number of trips is a highly skewed count variable, we will use the log of the number of trips as the target variable. We will also add 1 to the number of trips before taking the log to avoid taking the log of 0.

In [88]:
data = data.withColumn("log_num_trips", log1p("num_trips"))

In [89]:
train_data = data.filter(data['pickup_month'] <= 5)  # Months 1-5 for training
test_data = data.filter(data['pickup_month'] > 5)  # Month 6 for testing

In [90]:
# Index and One-Hot Encode categorical feature
indexers = [
    StringIndexer(inputCol=column, outputCol=f"{column}_index").setHandleInvalid("keep")
    for column in ['pickup_day_of_week', 'pickup_borough']
]

encoders = [
    OneHotEncoder(inputCol=f"{column}_index", outputCol=f"{column}_ohe").setHandleInvalid("keep")
    for column in ['pickup_day_of_week', 'pickup_borough']
]

In [91]:
assembler = VectorAssembler(
    inputCols=[
        'pickup_hour_of_day',
        "pickup_at_airport",
        'pickup_num_businesses',
        'temperature_2m',
        'relative_humidity_2m',
        'rain',
        'snowfall',
        'wind_speed_10m',
        'pickup_day_of_week_ohe',
        'pickup_borough_ohe'
    ],
    outputCol="features"
).setHandleInvalid("keep")

In [92]:
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

## Linear Regression
with Lasso Regularization

In [93]:
lr = LinearRegression(featuresCol='scaled_features', labelCol='log_num_trips', elasticNetParam=1)

In [94]:
pipeline_lr = Pipeline(stages=indexers + encoders + [assembler, scaler, lr])

In [95]:
model = pipeline_lr.fit(train_data)

24/08/25 01:25:07 WARN Instrumentation: [c8506234] regParam is zero, which might cause numerical instability and overfitting.
24/08/25 01:25:09 WARN Instrumentation: [c8506234] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

In [96]:
# Print the columns and their corresponding coefficients
model.stages[-1].coefficients

DenseVector([0.3536, 0.2348, 0.5884, 0.016, 0.0135, -0.0045, -0.0206, -0.0115, 0.011, -0.0354, -0.0385, 0.0501, 0.04, -0.0025, -0.0245, 0.0, -0.1609, 0.3033, 0.0138, 0.0246, -0.2976, 0.0])

In [97]:
predictions = model.transform(test_data)
predictions = predictions.withColumn("lr_prediction", expm1(predictions["prediction"]))

In [98]:
evaluator = RegressionEvaluator(
    labelCol="num_trips", predictionCol="lr_prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
print(f"R2 on test data = {r2}")

                                                                                

Root Mean Squared Error (RMSE) on test data = 91.8940812998167
R2 on test data = 0.30423306429577246


                                                                                

In [99]:
errors = predictions.select('pickup_hour_of_day', 'pickup_day_of_week', 'pickup_month', 'num_trips', 'lr_prediction',
                            'pickup_num_businesses', "PULocationID", 'pickup_borough')

## XGBoost

In [100]:
xgb_regressor = SparkXGBRegressor(
    features_col="scaled_features",
    label_col="log_num_trips",
    num_workers=2,
)

In [101]:
pipeline_xgb = Pipeline(stages=indexers + encoders + [assembler, scaler, xgb_regressor])

In [102]:
model = pipeline_xgb.fit(train_data)

2024-08-25 01:25:17,679 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 2 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-25 01:25:22,589 INFO XGBoost-PySpark: _train_booster Training on CPUs 2]
[01:25:23] Task 0 got rank 0
[01:25:23] Task 1 got rank 1
2024-08-25 01:25:27,678 INFO XGBoost-PySpark: _fit Finished xgboost training!   


In [103]:
predictions = model.transform(test_data)
predictions = predictions.withColumn("xgb_prediction", expm1(predictions["prediction"]))

In [104]:
evaluator = RegressionEvaluator(
    labelCol="num_trips", predictionCol="xgb_prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
print(f"R2 on test data = {r2}")

2024-08-25 01:25:31,745 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-08-25 01:25:32,843 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
[Stage 254:>                                                        (0 + 8) / 8]

Root Mean Squared Error (RMSE) on test data = 42.471751332779114
R2 on test data = 0.8513759659990753


                                                                                

In [105]:
# Add XGBoost predictions to errors dataframe based on index
errors = errors.withColumn("index", monotonically_increasing_id())
predictions = predictions.withColumn("index", monotonically_increasing_id())
errors = errors.join(predictions.select("index", "xgb_prediction"), "index", "left").drop("index")

## Decision Tree Regressor

In [106]:
decision_tree_regressor = DecisionTreeRegressor(
    featuresCol="scaled_features",
    labelCol="log_num_trips"
)

In [107]:
pipeline_dt = Pipeline(stages=indexers + encoders + [assembler, scaler, decision_tree_regressor])

In [108]:
model = pipeline_dt.fit(train_data)

                                                                                

In [109]:
predictions = model.transform(test_data)
predictions = predictions.withColumn("dt_prediction", expm1(predictions["prediction"]))

In [110]:
evaluator = RegressionEvaluator(
    labelCol="num_trips", predictionCol="dt_prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
print(f"R2 on test data = {r2}")

                                                                                

Root Mean Squared Error (RMSE) on test data = 85.71769970948638
R2 on test data = 0.39461769641560984


                                                                                

In [111]:
# Add Decision Tree Regressor predictions to errors dataframe based on index
errors = errors.withColumn("index", monotonically_increasing_id())
predictions = predictions.withColumn("index", monotonically_increasing_id())
errors = errors.join(predictions.select("index", "dt_prediction"), "index", "left").drop("index")

In [112]:
# Write errors dataframe to parquet for further analysis
errors.write.mode('overwrite').parquet('../data/errors/')

2024-08-25 01:25:58,047 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                