In [307]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import expm1
from pyspark.sql.functions import log1p
from pyspark.sql.functions import col
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql.functions import sin
from pyspark.sql.functions import cos
from pyspark.sql.functions import monotonically_increasing_id
import math
from xgboost.spark import SparkXGBRegressor

In [308]:
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .getOrCreate()
)

In [309]:
data = spark.read.parquet('../data/combined_data/')
data.columns

['hourly_timestamp',
 'PULocationID',
 'pickup_hour_of_day',
 'pickup_day_of_week',
 'pickup_month',
 'pickup_borough',
 'is_weekend',
 'pickup_at_airport',
 'num_trips',
 'pickup_num_businesses',
 'temperature_2m',
 'relative_humidity_2m',
 'rain',
 'snowfall',
 'wind_speed_10m']

In [310]:
data = data.withColumn("log_num_trips", log1p("num_trips"))

In [311]:
# data = data.filter(col("PULocationID")== 102)
# data = data.filter(col("pickup_borough") == "Manhattan")

In [312]:
train_data = data.filter(data['pickup_month'] <= 5)  # Months 1-5 for training
test_data = data.filter(data['pickup_month'] > 5)  # Month 6 for testing

In [313]:
# Index and One-Hot Encode categorical features if necessary
indexers = [
    StringIndexer(inputCol=column, outputCol=f"{column}_index").setHandleInvalid("keep")
    for column in ['pickup_day_of_week']
]

encoders = [
    OneHotEncoder(inputCol=f"{column}_index", outputCol=f"{column}_ohe").setHandleInvalid("keep")
    for column in ['pickup_day_of_week']
]

In [314]:
assembler = VectorAssembler(
    inputCols=[
        'pickup_hour_of_day',
        'pickup_day_of_week_ohe',
        'is_weekend',
        "pickup_at_airport",
        'pickup_num_businesses',
        'temperature_2m',
        'relative_humidity_2m',
        'rain',
        'snowfall',
        'wind_speed_10m'
    ],
    outputCol="features"
).setHandleInvalid("keep")

In [315]:
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

In [316]:
lr = LinearRegression(featuresCol='scaled_features', labelCol='log_num_trips', elasticNetParam=1)

In [317]:
pipeline_lr = Pipeline(stages=indexers + encoders + [assembler, scaler, lr])

In [318]:
model = pipeline_lr.fit(train_data)

24/08/22 12:53:50 WARN Instrumentation: [3deb1c8c] regParam is zero, which might cause numerical instability and overfitting.
24/08/22 12:53:52 WARN Instrumentation: [3deb1c8c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

In [319]:
model.stages[-1].coefficients

DenseVector([0.3516, -0.0033, -0.0296, -0.0324, 0.0357, 0.0453, 0.0034, -0.0186, 0.0, 0.0251, 0.2007, 0.5941, 0.0158, 0.0132, -0.0042, -0.0203, -0.0113])

In [320]:
predictions = model.transform(test_data)
predictions = predictions.withColumn("lr_prediction", expm1(predictions["prediction"]))

In [321]:
evaluator = RegressionEvaluator(
    labelCol="num_trips", predictionCol="lr_prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
print(f"R2 on test data = {r2}")

[Stage 496:>                                                        (0 + 8) / 8]

Root Mean Squared Error (RMSE) on test data = 108.75150745926629
R2 on test data = 0.025550634897025026


                                                                                

In [322]:
errors = predictions.select('pickup_hour_of_day', 'pickup_day_of_week', 'pickup_month', 'num_trips', 'lr_prediction',
                            'pickup_num_businesses', "PULocationID")

In [323]:
xgb_regressor = SparkXGBRegressor(
    features_col="scaled_features",
    label_col="log_num_trips",
    num_workers=2,
)

In [324]:
pipeline_xgb = Pipeline(stages=indexers + encoders + [assembler, scaler, xgb_regressor])

In [325]:
model = pipeline_xgb.fit(train_data)

2024-08-22 12:53:58,199 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 2 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-22 12:54:02,806 INFO XGBoost-PySpark: _train_booster Training on CPUs 2]
[12:54:03] Task 0 got rank 0
[12:54:03] Task 1 got rank 1
2024-08-22 12:54:07,625 INFO XGBoost-PySpark: _fit Finished xgboost training!   


In [326]:
predictions = model.transform(test_data)
predictions = predictions.withColumn("xgb_prediction", expm1(predictions["prediction"]))

In [327]:
evaluator = RegressionEvaluator(
    labelCol="num_trips", predictionCol="xgb_prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
print(f"R2 on test data = {r2}")

2024-08-22 12:54:10,314 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-08-22 12:54:10,965 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
[Stage 509:>                                                        (0 + 8) / 8]

Root Mean Squared Error (RMSE) on test data = 45.34797838647364
R2 on test data = 0.8305644379452911


                                                                                

In [328]:
# Add XGBoost predictions to errors dataframe based on index
errors = errors.withColumn("index", monotonically_increasing_id())
predictions = predictions.withColumn("index", monotonically_increasing_id())
errors = errors.join(predictions.select("index", "xgb_prediction"), "index", "left").drop("index")

In [329]:
decision_tree_regressor = DecisionTreeRegressor(
    featuresCol="scaled_features",
    labelCol="log_num_trips"
)

In [330]:
pipeline_dt = Pipeline(stages=indexers + encoders + [assembler, scaler, decision_tree_regressor])

In [331]:
model = pipeline_dt.fit(train_data)

                                                                                

In [332]:
predictions = model.transform(test_data)
predictions = predictions.withColumn("dt_prediction", expm1(predictions["prediction"]))

In [333]:
evaluator = RegressionEvaluator(
    labelCol="num_trips", predictionCol="dt_prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
print(f"R2 on test data = {r2}")

Root Mean Squared Error (RMSE) on test data = 90.82584896577947
R2 on test data = 0.3203150749645355


In [334]:
# Add Decision Tree Regressor predictions to errors dataframe based on index
errors = errors.withColumn("index", monotonically_increasing_id())
predictions = predictions.withColumn("index", monotonically_increasing_id())
errors = errors.join(predictions.select("index", "dt_prediction"), "index", "left").drop("index")

In [335]:
errors.write.mode('overwrite').parquet('../data/errors/')

2024-08-22 12:54:28,984 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                