In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, to_date, lag, avg

# Initialize Spark session with increased memory allocation and limited cores
spark = SparkSession.builder \
    .appName("TimeSeriesPrediction") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.cores", "2") \
    .config("spark.sql.shuffle.partitions", "8") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.rpc.askTimeout", "600s") \
    .config("spark.network.timeout", "600s") \
    .getOrCreate()

# Load dataset
file_path = "./Stock Data-New.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True).limit(1000)

# Check if the data is loaded correctly
df.show(5)
df.printSchema()

24/08/01 22:37:24 WARN Utils: Your hostname, blue-nbjupyterhub8 resolves to a loopback address: 127.0.0.1; using 10.0.0.91 instead (on interface ens5)
24/08/01 22:37:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/01 22:37:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+--------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+
|    date|TL BASED ISE|USD BASED ISE|   imkb_x SP|         DAX|        FTSE|      NIKKEI|     BOVESPA|          EU|          EM|
+--------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+
|5-Jan-09| 0.035753708|  0.038376187|-0.004679315| 0.002193419| 0.003894376|         0.0| 0.031190229| 0.012698039| 0.028524462|
|6-Jan-09| 0.025425873|  0.031812743| 0.007786738| 0.008455341| 0.012865611| 0.004162452|  0.01891958| 0.011340652| 0.008772644|
|7-Jan-09| -0.02886173| -0.026352966|-0.030469134|-0.017833062|-0.028734593| 0.017292932|-0.035898576|-0.017072795|-0.020015412|
|8-Jan-09|-0.062208079| -0.084715902| 0.003391364|-0.011726277| -4.65999E-4|-0.040061309| 0.028283152|-0.005560959|-0.019423778|
|9-Jan-09| 0.009859905|  0.009658112|-0.021533208|-0.019872754|-0.012709717|-0.004473502| -0.0097

In [3]:
# Convert date column to date type
df = df.withColumn("date", to_date(col("date"), "d-MMM-yy"))
print("After converting date column:")
df.show(5)

After converting date column:
+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+
|      date|TL BASED ISE|USD BASED ISE|   imkb_x SP|         DAX|        FTSE|      NIKKEI|     BOVESPA|          EU|          EM|
+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+
|2009-01-05| 0.035753708|  0.038376187|-0.004679315| 0.002193419| 0.003894376|         0.0| 0.031190229| 0.012698039| 0.028524462|
|2009-01-06| 0.025425873|  0.031812743| 0.007786738| 0.008455341| 0.012865611| 0.004162452|  0.01891958| 0.011340652| 0.008772644|
|2009-01-07| -0.02886173| -0.026352966|-0.030469134|-0.017833062|-0.028734593| 0.017292932|-0.035898576|-0.017072795|-0.020015412|
|2009-01-08|-0.062208079| -0.084715902| 0.003391364|-0.011726277| -4.65999E-4|-0.040061309| 0.028283152|-0.005560959|-0.019423778|
|2009-01-09| 0.009859905|  0.009658112|-0.021533208|-

In [4]:
# Repartition the data to optimize performance
df = df.repartition(4)

In [5]:
# Create lag features for USD BASED ISE
windowSpec = Window.orderBy("date")
df = df.withColumn("Lag_USD_BASED_ISE", lag("`USD BASED ISE`", 1).over(windowSpec))
print("After creating lag feature for USD BASED ISE:")
df.show(5)

After creating lag feature for USD BASED ISE:


24/08/01 22:37:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-----------------+
|      date|TL BASED ISE|USD BASED ISE|   imkb_x SP|         DAX|        FTSE|      NIKKEI|     BOVESPA|          EU|          EM|Lag_USD_BASED_ISE|
+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-----------------+
|2009-01-05| 0.035753708|  0.038376187|-0.004679315| 0.002193419| 0.003894376|         0.0| 0.031190229| 0.012698039| 0.028524462|             NULL|
|2009-01-06| 0.025425873|  0.031812743| 0.007786738| 0.008455341| 0.012865611| 0.004162452|  0.01891958| 0.011340652| 0.008772644|      0.038376187|
|2009-01-07| -0.02886173| -0.026352966|-0.030469134|-0.017833062|-0.028734593| 0.017292932|-0.035898576|-0.017072795|-0.020015412|      0.031812743|
|2009-01-08|-0.062208079| -0.084715902| 0.003391364|-0.011726277| -4.65999E-4|-0.040061309| 0.028283152|-0

In [6]:
# Create rolling average feature
df = df.withColumn("Rolling_Avg_USD_BASED_ISE", avg("`USD BASED ISE`").over(Window.orderBy("date").rowsBetween(-5, 0)))
print("After creating rolling average feature for USD BASED ISE:")
df.show(5)

After creating rolling average feature for USD BASED ISE:


24/08/01 22:37:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-----------------+-------------------------+
|      date|TL BASED ISE|USD BASED ISE|   imkb_x SP|         DAX|        FTSE|      NIKKEI|     BOVESPA|          EU|          EM|Lag_USD_BASED_ISE|Rolling_Avg_USD_BASED_ISE|
+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-----------------+-------------------------+
|2009-01-05| 0.035753708|  0.038376187|-0.004679315| 0.002193419| 0.003894376|         0.0| 0.031190229| 0.012698039| 0.028524462|             NULL|              0.038376187|
|2009-01-06| 0.025425873|  0.031812743| 0.007786738| 0.008455341| 0.012865611| 0.004162452|  0.01891958| 0.011340652| 0.008772644|      0.038376187|              0.035094465|
|2009-01-07| -0.02886173| -0.026352966|-0.030469134|-0.017833062|-0.028734593| 0.017292932|-0.035898576|-0.017072795|-0.02001

In [7]:
# Drop rows with null values (due to lag/rolling calculations)
df = df.dropna(subset=["Lag_USD_BASED_ISE", "Rolling_Avg_USD_BASED_ISE"])
print("After dropping rows with null values:")
df.show(5)

After dropping rows with null values:


24/08/01 22:37:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:37:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-----------------+-------------------------+
|      date|TL BASED ISE|USD BASED ISE|   imkb_x SP|         DAX|        FTSE|      NIKKEI|     BOVESPA|          EU|          EM|Lag_USD_BASED_ISE|Rolling_Avg_USD_BASED_ISE|
+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-----------------+-------------------------+
|2009-01-06| 0.025425873|  0.031812743| 0.007786738| 0.008455341| 0.012865611| 0.004162452|  0.01891958| 0.011340652| 0.008772644|      0.038376187|              0.035094465|
|2009-01-07| -0.02886173| -0.026352966|-0.030469134|-0.017833062|-0.028734593| 0.017292932|-0.035898576|-0.017072795|-0.020015412|      0.031812743|              0.014611988|
|2009-01-08|-0.062208079| -0.084715902| 0.003391364|-0.011726277| -4.65999E-4|-0.040061309| 0.028283152|-0.005560959|-0.01942

In [8]:
# Verify data before creating feature vector
print("Data before creating feature vector:")
df.select("date", "Lag_USD_BASED_ISE", "Rolling_Avg_USD_BASED_ISE", "USD BASED ISE").show(5)

Data before creating feature vector:


24/08/01 22:38:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

+----------+-----------------+-------------------------+-------------+
|      date|Lag_USD_BASED_ISE|Rolling_Avg_USD_BASED_ISE|USD BASED ISE|
+----------+-----------------+-------------------------+-------------+
|2009-01-06|      0.038376187|              0.035094465|  0.031812743|
|2009-01-07|      0.031812743|              0.014611988| -0.026352966|
|2009-01-08|     -0.026352966|            -0.0102199845| -0.084715902|
|2009-01-09|     -0.084715902|     -0.00624436519999...|  0.009658112|
|2009-01-12|      0.009658112|     -0.01226383016666...| -0.042361155|
+----------+-----------------+-------------------------+-------------+
only showing top 5 rows



In [9]:
# Create feature vector
assembler = VectorAssembler(inputCols=["Lag_USD_BASED_ISE", "Rolling_Avg_USD_BASED_ISE"], outputCol="features")
df = assembler.transform(df)
print("After creating feature vector:")
df.show(5)

After creating feature vector:


24/08/01 22:38:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-----------------+-------------------------+--------------------+
|      date|TL BASED ISE|USD BASED ISE|   imkb_x SP|         DAX|        FTSE|      NIKKEI|     BOVESPA|          EU|          EM|Lag_USD_BASED_ISE|Rolling_Avg_USD_BASED_ISE|            features|
+----------+------------+-------------+------------+------------+------------+------------+------------+------------+------------+-----------------+-------------------------+--------------------+
|2009-01-06| 0.025425873|  0.031812743| 0.007786738| 0.008455341| 0.012865611| 0.004162452|  0.01891958| 0.011340652| 0.008772644|      0.038376187|              0.035094465|[0.038376187,0.03...|
|2009-01-07| -0.02886173| -0.026352966|-0.030469134|-0.017833062|-0.028734593| 0.017292932|-0.035898576|-0.017072795|-0.020015412|      0.031812743|              0.014611988|[0.031812743,0.01...|
|2009-01-08|-0.06220

In [10]:
# Select relevant columns for the model
df = df.select("date", "features", col("`USD BASED ISE`").alias("label"))
print("Final prepared data:")
df.show(5)

Final prepared data:


24/08/01 22:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

+----------+--------------------+------------+
|      date|            features|       label|
+----------+--------------------+------------+
|2009-01-06|[0.038376187,0.03...| 0.031812743|
|2009-01-07|[0.031812743,0.01...|-0.026352966|
|2009-01-08|[-0.026352966,-0....|-0.084715902|
|2009-01-09|[-0.084715902,-0....| 0.009658112|
|2009-01-12|[0.009658112,-0.0...|-0.042361155|
+----------+--------------------+------------+
only showing top 5 rows



In [11]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [12]:
# Split the data into training and test sets
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Initialize a simpler model (Linear Regression) for testing
lr = LinearRegression(featuresCol="features", labelCol="label")

# Create a pipeline
pipeline = Pipeline(stages=[lr])

# Train the model
model = pipeline.fit(train_df)

# Make predictions
predictions = model.transform(test_df)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print(f"R-squared on test data: {r2}")

evaluator_mae = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
mae = evaluator_mae.evaluate(predictions)
print(f"Mean Absolute Error (MAE) on test data: {mae}")

evaluator_mse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mse")
mse = evaluator_mse.evaluate(predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Show some predictions
predictions.select("date", "label", "prediction").show(5)

24/08/01 22:38:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2



24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Root Mean Squared Error (RMSE) on test data: 0.021502033887108557


24/08/01 22:38:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

R-squared on test data: 0.18614354102080677


24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Mean Absolute Error (MAE) on test data: 0.016548623556085593


24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Mean Squared Error (MSE) on test data: 0.0004623374612823648


24/08/01 22:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------+--------------------+
|      date|       label|          prediction|
+----------+------------+--------------------+
|2009-01-08|-0.084715902|-0.00643978408811...|
|2009-01-14| -0.03555248|-0.03133917814488528|
|2009-01-16| 0.032278032|-0.00648583386749...|
|2009-01-23|-0.010007954|-0.00850227769311...|
|2009-02-02|-0.035607494|0.009043012641493094|
+----------+------------+--------------------+
only showing top 5 rows



24/08/01 22:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [13]:
# Split the data into training and test sets
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Initialize the Random Forest model
rf = RandomForestRegressor(featuresCol="features", labelCol="label")

# Create a pipeline
pipeline_rf = Pipeline(stages=[rf])

# Train the model
rf_model = pipeline_rf.fit(train_df)

# Make predictions
predictions_rf = rf_model.transform(test_df)

# Initialize the evaluator
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Evaluate Random Forest model
rmse_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "rmse"})
mae_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "mae"})
r2_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "r2"})

print(f"Random Forest - RMSE: {rmse_rf}")
print(f"Random Forest - MAE: {mae_rf}")
print(f"Random Forest - R2: {r2_rf}")

24/08/01 22:38:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2



java.lang.IllegalStateException: problem in scala.concurrent internal callback
	at scala.concurrent.Future$InternalCallbackExecutor$.reportFailure(Future.scala:877)
	at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:72)
	at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1(Promise.scala:288)
	at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1$adapted(Promise.scala:288)
	at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:288)
	at scala.concurrent.Promise.complete(Promise.scala:53)
	at scala.concurrent.Promise.complete$(Promise.scala:52)
	at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:187)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(

Random Forest - RMSE: 0.02205546668379079
Random Forest - MAE: 0.016469202237425897
Random Forest - R2: 0.14370928682636175


In [14]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# Split the data into training and test sets
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Initialize the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="label")

# Create a pipeline for Linear Regression
pipeline_lr = Pipeline(stages=[lr])

# Train the Linear Regression model
lr_model = pipeline_lr.fit(train_df)

# Evaluate Linear Regression model
predictions_lr = lr_model.transform(test_df)

# Define evaluators
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_mae = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse_lr = evaluator_rmse.evaluate(predictions_lr)
mae_lr = evaluator_mae.evaluate(predictions_lr)
r2_lr = evaluator_r2.evaluate(predictions_lr)

print(f"Linear Regression - RMSE: {rmse_lr}")
print(f"Linear Regression - MAE: {mae_lr}")
print(f"Linear Regression - R2: {r2_lr}")

# Initialize the Random Forest model
rf = RandomForestRegressor(featuresCol="features", labelCol="label")

# Create a pipeline for Random Forest
pipeline_rf = Pipeline(stages=[rf])

# Train the Random Forest model
rf_model = pipeline_rf.fit(train_df)

# Evaluate Random Forest model
predictions_rf = rf_model.transform(test_df)

rmse_rf = evaluator_rmse.evaluate(predictions_rf)
mae_rf = evaluator_mae.evaluate(predictions_rf)
r2_rf = evaluator_r2.evaluate(predictions_rf)

print(f"Random Forest - RMSE: {rmse_rf}")
print(f"Random Forest - MAE: {mae_rf}")
print(f"Random Forest - R2: {r2_rf}")

24/08/01 22:38:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Linear Regression - RMSE: 0.021502033887108557
Linear Regression - MAE: 0.016548623556085593
Linear Regression - R2: 0.18614354102080677


24/08/01 22:38:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:38:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Random Forest - RMSE: 0.02205546668379079
Random Forest - MAE: 0.016469202237425897
Random Forest - R2: 0.14370928682636175


In [15]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Initialize the models
lr = LinearRegression(featuresCol="features", labelCol="label")
rf = RandomForestRegressor(featuresCol="features", labelCol="label")
gbt = GBTRegressor(featuresCol="features", labelCol="label")

# Define the parameter grid for Linear Regression
paramGrid_lr = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# TrainValidationSplit for Linear Regression
tvs_lr = TrainValidationSplit(estimator=lr,
                              estimatorParamMaps=paramGrid_lr,
                              evaluator=RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse"),
                              trainRatio=0.8)

# Train the Linear Regression model with hyperparameter tuning
tvs_lr_model = tvs_lr.fit(train_df)

# Evaluate the tuned Linear Regression model
predictions_tvs_lr = tvs_lr_model.transform(test_df)

# Initialize the evaluators
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_mae = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

# Evaluate the model
rmse_tvs_lr = evaluator_rmse.evaluate(predictions_tvs_lr)
mae_tvs_lr = evaluator_mae.evaluate(predictions_tvs_lr)
r2_tvs_lr = evaluator_r2.evaluate(predictions_tvs_lr)

print(f"Tuned Linear Regression - RMSE: {rmse_tvs_lr}")
print(f"Tuned Linear Regression - MAE: {mae_tvs_lr}")
print(f"Tuned Linear Regression - R2: {r2_tvs_lr}")

# Define the parameter grid for Random Forest
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.numTrees, [5, 10]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

# TrainValidationSplit for Random Forest
tvs_rf = TrainValidationSplit(estimator=rf,
                              estimatorParamMaps=paramGrid_rf,
                              evaluator=RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse"),
                              trainRatio=0.8)

# Train the Random Forest model with hyperparameter tuning
tvs_rf_model = tvs_rf.fit(train_df)

# Evaluate the tuned Random Forest model
predictions_tvs_rf = tvs_rf_model.transform(test_df)

# Evaluate the model
rmse_tvs_rf = evaluator_rmse.evaluate(predictions_tvs_rf)
mae_tvs_rf = evaluator_mae.evaluate(predictions_tvs_rf)
r2_tvs_rf = evaluator_r2.evaluate(predictions_tvs_rf)

print(f"Tuned Random Forest - RMSE: {rmse_tvs_rf}")
print(f"Tuned Random Forest - MAE: {mae_tvs_rf}")
print(f"Tuned Random Forest - R2: {r2_tvs_rf}")

# Define the parameter grid for Gradient-Boosted Trees (GBT)
paramGrid_gbt = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 10]) \
    .addGrid(gbt.maxIter, [10, 20]) \
    .build()

# TrainValidationSplit for Gradient-Boosted Trees
tvs_gbt = TrainValidationSplit(estimator=gbt,
                               estimatorParamMaps=paramGrid_gbt,
                               evaluator=RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse"),
                               trainRatio=0.8)

# Train the Gradient-Boosted Trees model with hyperparameter tuning
tvs_gbt_model = tvs_gbt.fit(train_df)

# Evaluate the tuned Gradient-Boosted Trees model
predictions_tvs_gbt = tvs_gbt_model.transform(test_df)

# Evaluate the model
rmse_tvs_gbt = evaluator_rmse.evaluate(predictions_tvs_gbt)
mae_tvs_gbt = evaluator_mae.evaluate(predictions_tvs_gbt)
r2_tvs_gbt = evaluator_r2.evaluate(predictions_tvs_gbt)

print(f"Tuned GBT - RMSE: {rmse_tvs_gbt}")
print(f"Tuned GBT - MAE: {mae_tvs_gbt}")
print(f"Tuned GBT - R2: {r2_tvs_gbt}")

24/08/01 22:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Tuned Linear Regression - RMSE: 0.02204780241524362
Tuned Linear Regression - MAE: 0.016301465560962895
Tuned Linear Regression - R2: 0.1443043049940934


24/08/01 22:39:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Tuned Random Forest - RMSE: 0.021829621851032117
Tuned Random Forest - MAE: 0.016635754032512422
Tuned Random Forest - R2: 0.1611560902548561


24/08/01 22:39:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:39:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Tuned GBT - RMSE: 0.022974524878587615
Tuned GBT - MAE: 0.01782750984709159
Tuned GBT - R2: 0.07085860715306247


In [16]:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, lag, when 
from pyspark.sql.types import FloatType
from pyspark.sql.window import Window

# Define the index of the "USD BASED ISE" feature in the vector
usd_based_ise_index = 1  # Update this index according to the actual position

# UDF to extract specific element from vector
extract_element_udf = udf(lambda v: float(v[usd_based_ise_index]), FloatType())

# Extract "USD BASED ISE" feature into a separate column
df = df.withColumn("USD_BASED_ISE", extract_element_udf(col("features")))

# Define window specification
window = Window.orderBy("date")

# Calculate previous value and class label
df = df.withColumn("prev_value", lag(col("USD_BASED_ISE")).over(window))
df = df.withColumn("class_label", when(col("USD_BASED_ISE") > col("prev_value"), 1).otherwise(0))

# Display some rows to check the result
df.show()

24/08/01 22:40:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:40:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:40:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:40:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:40:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:40:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

+----------+--------------------+------------+-------------+-------------+-----------+
|      date|            features|       label|USD_BASED_ISE|   prev_value|class_label|
+----------+--------------------+------------+-------------+-------------+-----------+
|2009-01-06|[0.038376187,0.03...| 0.031812743|  0.035094466|         NULL|          0|
|2009-01-07|[0.031812743,0.01...|-0.026352966|  0.014611988|  0.035094466|          0|
|2009-01-08|[-0.026352966,-0....|-0.084715902| -0.010219985|  0.014611988|          0|
|2009-01-09|[-0.084715902,-0....| 0.009658112| -0.006244365| -0.010219985|          1|
|2009-01-12|[0.009658112,-0.0...|-0.042361155|  -0.01226383| -0.006244365|          0|
|2009-01-13|[-0.042361155,-0....| -2.72183E-4| -0.018705225|  -0.01226383|          0|
|2009-01-14|[-2.72183E-4,-0.0...| -0.03555248| -0.029932762| -0.018705225|          0|
|2009-01-15|[-0.03555248,-0.0...|-0.017267844| -0.028418574| -0.029932762|          1|
|2009-01-16|[-0.017267844,-0....| 0.0322780

24/08/01 22:40:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:40:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [17]:
# Import necessary libraries for classification
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Assuming your classification target column is 'class_label'

# Split the data into training and test sets for classification
train_df_class, test_df_class = df.randomSplit([0.8, 0.2], seed=42)

# Initialize classifiers
lr_class = LogisticRegression(featuresCol="features", labelCol="class_label")
rf_class = RandomForestClassifier(featuresCol="features", labelCol="class_label")
gbt_class = GBTClassifier(featuresCol="features", labelCol="class_label")

# Example: Hyperparameter tuning for Logistic Regression Classifier
paramGrid_lr_class = ParamGridBuilder() \
    .addGrid(lr_class.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr_class.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

cv_lr_class = CrossValidator(estimator=lr_class,
                             estimatorParamMaps=paramGrid_lr_class,
                             evaluator=MulticlassClassificationEvaluator(labelCol="class_label", predictionCol="prediction", metricName="accuracy"),
                             numFolds=3)

cv_lr_class_model = cv_lr_class.fit(train_df_class)
predictions_lr_class = cv_lr_class_model.transform(test_df_class)

evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="class_label", predictionCol="prediction", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="class_label", predictionCol="prediction", metricName="f1")

accuracy_lr_class = evaluator_accuracy.evaluate(predictions_lr_class)
f1_lr_class = evaluator_f1.evaluate(predictions_lr_class)

print(f"Logistic Regression Classifier - Accuracy: {accuracy_lr_class}")
print(f"Logistic Regression Classifier - F1 Score: {f1_lr_class}")

24/08/01 22:42:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:42:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:42:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:42:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:42:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:42:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

Logistic Regression Classifier - Accuracy: 0.5365853658536586
Logistic Regression Classifier - F1 Score: 0.5365853658536585


In [None]:
# Hyperparameter tuning for Random Forest Classifier
paramGrid_rf_class = ParamGridBuilder() \
    .addGrid(rf_class.numTrees, [10, 20, 30]) \
    .addGrid(rf_class.maxDepth, [5, 10, 15]) \
    .build()

cv_rf_class = CrossValidator(estimator=rf_class,
                             estimatorParamMaps=paramGrid_rf_class,
                             evaluator=MulticlassClassificationEvaluator(labelCol="class_label", predictionCol="prediction", metricName="accuracy"),
                             numFolds=3)

cv_rf_class_model = cv_rf_class.fit(train_df_class)
predictions_rf_class = cv_rf_class_model.transform(test_df_class)

accuracy_rf_class = evaluator_accuracy.evaluate(predictions_rf_class)
f1_rf_class = evaluator_f1.evaluate(predictions_rf_class)

print(f"Random Forest Classifier - Accuracy: {accuracy_rf_class}")
print(f"Random Forest Classifier - F1 Score: {f1_rf_class}")

24/08/01 22:43:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:43:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:43:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:43:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:43:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 22:43:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/08/01 2

In [26]:
# Hyperparameter tuning for Gradient-Boosted Trees Classifier
paramGrid_gbt_class = ParamGridBuilder() \
    .addGrid(gbt_class.maxDepth, [5, 10]) \
    .addGrid(gbt_class.maxIter, [10, 20]) \
    .build()

cv_gbt_class = CrossValidator(estimator=gbt_class,
                              estimatorParamMaps=paramGrid_gbt_class,
                              evaluator=MulticlassClassificationEvaluator(labelCol="class_label", predictionCol="prediction", metricName="accuracy"),
                              numFolds=3)

cv_gbt_class_model = cv_gbt_class.fit(train_df_class)
predictions_gbt_class = cv_gbt_class_model.transform(test_df_class)

accuracy_gbt_class = evaluator_accuracy.evaluate(predictions_gbt_class)
f1_gbt_class = evaluator_f1.evaluate(predictions_gbt_class)

print(f"GBT Classifier - Accuracy: {accuracy_gbt_class}")
print(f"GBT Classifier - F1 Score: {f1_gbt_class}")

24/07/30 14:04:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/30 14:04:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/30 14:04:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/30 14:04:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/30 14:04:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/30 14:04:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/07/30 1

GBT Classifier - Accuracy: 0.573170731707317
GBT Classifier - F1 Score: 0.5729800410438952


24/07/30 14:06:13 ERROR BlockManagerMasterEndpoint: Fail to know the executor driver is alive or not.
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint$$anonfun$handleBlockRemovalFailure$1.applyOrElse(BlockManag

In [None]:
# Stop the current Spark session
spark.stop()