In [80]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, hour
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import log1p
from pyspark.sql.functions import avg

In [81]:
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .config("spark.sql.parquet.enableVectorizedReader","false")
    .getOrCreate()
)

In [82]:
df = spark.read.parquet('../data/combined_data/')

In [83]:
window_spec = Window.orderBy("hourly_timestamp").partitionBy("PULocationID")
df = df.withColumn("num_trips", log1p("num_trips"))
df = df.withColumn("num_trips_lag_1", lag("num_trips", 1).over(window_spec))
df = df.withColumn("num_trips_lag_2", lag("num_trips", 2).over(window_spec))
df.show(10)

+------------+-------------------+------------------+------------------+------------+------------------+-------------------+---------------------+------------------+------------------+
|PULocationID|   hourly_timestamp|pickup_hour_of_day|pickup_day_of_week|pickup_month|         num_trips|pickup_num_stations|pickup_daytime_routes|   num_trips_lag_1|   num_trips_lag_2|
+------------+-------------------+------------------+------------------+------------+------------------+-------------------+---------------------+------------------+------------------+
|           7|2022-01-01 00:00:00|                 0|                 7|           1| 5.921578419643816|                  5|                    4|              NULL|              NULL|
|           7|2022-01-01 01:00:00|                 1|                 7|           1| 6.240275845170769|                  5|                    4| 5.921578419643816|              NULL|
|           7|2022-01-01 02:00:00|                 2|                 7|   

In [84]:
# drop rows with null values
df = df.dropna()

In [85]:
df_location_138 = df.filter(col("PULocationID") == 7)
hourly_avg_df = df_location_138.groupBy("pickup_hour_of_day").agg(avg("num_trips").alias("avg_hourly_demand"))
df_location_138 = df_location_138.join(hourly_avg_df, on="pickup_hour_of_day", how="left")

In [86]:
feature_cols = ["pickup_day_of_week", "num_trips_lag_1", "num_trips_lag_2", 'avg_hourly_demand']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data_df = assembler.transform(df_location_138)
data_df = data_df.withColumnRenamed("num_trips", "label")

In [87]:
from pyspark.sql.functions import expm1

train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=42)

# Initialize and train the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_df)

# Make predictions on the test set
log_predictions = lr_model.transform(test_df)
# log_predictions = log_predictions.withColumn("prediction", expm1(col("prediction")))

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(log_predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

# Show sample predictions
log_predictions.select("features", "label", "prediction").show(10)

24/08/21 09:32:48 WARN Instrumentation: [13656ae0] regParam is zero, which might cause numerical instability and overfitting.


Root Mean Squared Error (RMSE) on test data: 0.2739300539336457
+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|[3.0,4.6821312271...| 4.465908118654584| 4.694485874147117|
|[7.0,5.6664266881...| 5.641907070938114|5.6085650734971475|
|[2.0,5.0172798368...| 4.941642422609304|4.9848500800784326|
|[7.0,5.9162020626...|  5.84354441703136| 5.830129887509408|
|[6.0,5.1298987149...| 5.153291594497779|5.0811802764988805|
|[3.0,4.8675344504...| 4.418840607796598|   4.8813030093079|
|[2.0,5.2678581590...| 5.187385805840755| 5.117397516021878|
|[1.0,6.1290502100...|6.1903154058531475|   5.8500955622472|
|[4.0,5.0998664278...| 4.955827057601261| 5.024327205177554|
|[5.0,5.3181199938...| 4.983606621708336| 5.261399064255764|
+--------------------+------------------+------------------+
only showing top 10 rows


In [88]:
from pyspark.sql.functions import col

# Create the 0R (persistence) model predictions by shifting the demand column
zero_r_df = df_location_138.withColumn("zero_r_prediction", col("num_trips_lag_1"))

# Drop the first hour since it will have a null previous hour demand
zero_r_df = zero_r_df.dropna()

# Select the actual and 0R predicted values
zero_r_predictions = zero_r_df.select(col("num_trips").alias("label"), col("zero_r_prediction").alias("prediction").astype("double"))

# Evaluate the 0R model using RMSE
zero_r_rmse = evaluator.evaluate(zero_r_predictions)
print(f"Root Mean Squared Error (RMSE) for 0R model: {zero_r_rmse}")

Root Mean Squared Error (RMSE) for 0R model: 0.24756905671615428
