In [1]:
# Import required libraries
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import sys

In [2]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Sentiment Calculation RDD") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.minExecutors", "3") \
    .config("spark.dynamicAllocation.maxExecutors", "9") \
    .config("spark.dynamicAllocation.initialExecutors", "3") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.extraPythonPackages", "vaderSentiment") \
    .config("spark.driver.extraPythonPackages", "vaderSentiment") \
    .config("spark.executorEnv.PYTHONPATH", ":".join(sys.path)) \
    .getOrCreate()

# Set log level for cleaner outputs
sc = spark.sparkContext
sc.setLogLevel("INFO")


In [3]:
# HDFS path for the dataset
file_path = "hdfs://namenode:9000/data/cleaned_dataset.parquet"

# Read the Parquet file into a DataFrame
df = spark.read.parquet(file_path)

# Display schema and sample rows
df.printSchema()
df.show(5, truncate=False)

12:20:49.232 [Thread-4] INFO  org.apache.spark.sql.internal.SharedState - Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
12:20:49.239 [Thread-4] INFO  org.apache.spark.sql.internal.SharedState - Warehouse path is 'file:/home/ubuntu/project/cluster-notebooks/spark-warehouse'.
12:20:49.248 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - Adding filter to /SQL: org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
12:20:49.250 [Thread-4] INFO  org.sparkproject.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@5178ac23{/SQL,null,AVAILABLE,@Spark}
12:20:49.250 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - Adding filter to /SQL/json: org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
12:20:49.251 [Thread-4] INFO  org.sparkproject.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@50e9ad63{/SQL/json,null,AVAILABLE,@Spark}
12:20:49.251 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - A

[Stage 0:>                                                          (0 + 1) / 1]

12:20:52.180 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 0.0 (TID 0) in 1805 ms on datanode2 (executor 1) (1/1)
12:20:52.183 [task-result-getter-0] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 0.0, whose tasks have all completed, from pool 
12:20:52.188 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 0 (parquet at NativeMethodAccessorImpl.java:0) finished in 1.923 s
12:20:52.192 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 0 is finished. Cancelling potential speculative or zombie tasks for this job
12:20:52.193 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 0: Stage finished
12:20:52.195 [Thread-4] INFO  org.apache.spark.scheduler.DAGScheduler - Job 0 finished: parquet at NativeMethodAccessorImpl.java:0, took 2.000233 s


                                                                                

12:20:52.422 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_0_piece0 on namenode:40137 in memory (size: 46.9 KiB, free: 2004.6 MiB)
12:20:52.438 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_0_piece0 on datanode2:39793 in memory (size: 46.9 KiB, free: 912.3 MiB)
root
 |-- link: string (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- score: long (nullable = true)
 |-- sub_reddit: string (nullable = true)
 |-- post_id: string (nullable = true)
 |-- comment_id: string (nullable = true)
 |-- body: string (nullable = true)

12:20:53.210 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pushed Filters: 
12:20:53.211 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: 
12:20:53.554 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 185.934852 ms


[Stage 1:>                                                          (0 + 1) / 1]

12:20:55.049 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 1.0 (TID 1) in 1358 ms on datanode2 (executor 1) (1/1)
12:20:55.049 [task-result-getter-1] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 1.0, whose tasks have all completed, from pool 
12:20:55.050 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 1 (showString at NativeMethodAccessorImpl.java:0) finished in 1.399 s
12:20:55.051 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 1 is finished. Cancelling potential speculative or zombie tasks for this job
12:20:55.051 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 1: Stage finished
12:20:55.052 [Thread-4] INFO  org.apache.spark.scheduler.DAGScheduler - Job 1 finished: showString at NativeMethodAccessorImpl.java:0, took 1.415584 s


                                                                                

12:20:55.953 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 20.469073 ms
+----------------------------------------------------------------------------------------------------------------+-----------+-----+------------+-------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|link                                                          

In [4]:
# Function to clean comments
def clean_comment_spark(df, column):
    """Clean comments in the specified column."""
    return df.withColumn(
        f"{column}_clean",
        F.trim(
            F.regexp_replace(
                F.regexp_replace(
                    F.regexp_replace(
                        F.lower(F.col(column)),  # Convert to lowercase
                        r"http\S+|www\S+|https\S+", ""),  # Remove URLs
                    r"@\w+|#", ""),  # Remove mentions and hashtags
                r"[^\w\s]", ""),  # Remove special characters and punctuation
            )
        )


In [5]:
# Clean the comments and select relevant columns
df = clean_comment_spark(df, "body").select("comment_id", "body_clean")
df.show(5, truncate=False)

12:20:56.132 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pushed Filters: 
12:20:56.133 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: 
12:20:56.205 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 43.896746 ms
12:20:56.209 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_3 stored as values in memory (estimated size 408.1 KiB, free 2003.7 MiB)
12:20:56.220 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_3_piece0 stored as bytes in memory (estimated size 44.8 KiB, free 2003.7 MiB)
12:20:56.220 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_3_piece0 in memory on namenode:40137 (size: 44.8 KiB, free: 2004.5 MiB)
12:20:56.222 [Thread-4] INFO  org.apache.spark.SparkContext - Created broadcast 3 from showString at NativeMethodAccessorImpl.java

In [6]:
# Convert DataFrame to RDD for sentiment analysis
data_rdd = df.rdd

12:20:56.555 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pushed Filters: 
12:20:56.555 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: 
12:20:56.590 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 22.564544 ms
12:20:56.594 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_5 stored as values in memory (estimated size 408.1 KiB, free 2003.7 MiB)
12:20:56.601 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_5_piece0 stored as bytes in memory (estimated size 44.8 KiB, free 2003.7 MiB)
12:20:56.602 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on namenode:40137 (size: 44.8 KiB, free: 2004.5 MiB)
12:20:56.605 [Thread-4] INFO  org.apache.spark.SparkContext - Created broadcast 5 from javaToPython at NativeMethodAccessorImpl.ja

In [7]:
# Define the function to calculate sentiment for a partition
def calculate_sentiment_partition(rows):
    analyzer = SentimentIntensityAnalyzer()  # Initialize the analyzer once per partition
    results = []
    for row in rows:
        comment_id = row["comment_id"]
        text = row["body_clean"]
        sentiment_score = analyzer.polarity_scores(text)['compound'] if text else None
        results.append((comment_id, sentiment_score))
    return results

In [8]:
# Apply the function to each partition
sentiment_rdd = data_rdd.mapPartitions(calculate_sentiment_partition)

# Convert back to DataFrame
sentiment_df = sentiment_rdd.toDF(["comment_id", "sentiment"])

12:20:56.681 [Thread-4] INFO  org.apache.spark.SparkContext - Starting job: runJob at PythonRDD.scala:181
12:20:56.685 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Got job 3 (runJob at PythonRDD.scala:181) with 1 output partitions
12:20:56.686 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Final stage: ResultStage 3 (runJob at PythonRDD.scala:181)
12:20:56.686 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Parents of final stage: List()
12:20:56.688 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Missing parents: List()
12:20:56.689 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Submitting ResultStage 3 (PythonRDD[16] at RDD at PythonRDD.scala:53), which has no missing parents
12:20:56.710 [dag-scheduler-event-loop] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_6 stored as values in memory (estimated size 30.1 KiB, free 2003.

[Stage 3:>                                                          (0 + 1) / 1]

12:20:58.528 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode1:44859 (size: 44.8 KiB, free: 912.2 MiB)


[Stage 3:>                                                          (0 + 1) / 1]

12:22:27.631 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 3.0 (TID 3) in 90911 ms on datanode1 (executor 3) (1/1)
12:22:27.632 [task-result-getter-3] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 3.0, whose tasks have all completed, from pool 
12:22:27.637 [dag-scheduler-event-loop] INFO  org.apache.spark.api.python.PythonAccumulatorV2 - Connected to AccumulatorServer at host: 127.0.0.1 port: 51923
12:22:27.639 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 3 (runJob at PythonRDD.scala:181) finished in 90.944 s
12:22:27.639 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 3 is finished. Cancelling potential speculative or zombie tasks for this job
12:22:27.639 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 3: Stage finished
12:22:27.640 [Thread-4] INFO  org.apache.s

                                                                                

In [9]:
# Write the results to HDFS in Parquet format
output_path = "hdfs://namenode:9000/data/results/comment_sentiment.parquet"
sentiment_df.write.mode("overwrite").parquet(output_path)

12:22:27.798 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.parquet.ParquetUtils - Using default output committer for Parquet: org.apache.parquet.hadoop.ParquetOutputCommitter
12:22:27.811 [Thread-4] INFO  org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - File Output Committer Algorithm version is 1
12:22:27.811 [Thread-4] INFO  org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
12:22:27.812 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol - Using user defined output committer class org.apache.parquet.hadoop.ParquetOutputCommitter
12:22:27.812 [Thread-4] INFO  org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - File Output Committer Algorithm version is 1
12:22:27.812 [Thread-4] INFO  org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - FileOutputCommitter skip cleanup _temporary f

[Stage 4:>                                                         (0 + 6) / 45]

12:22:28.782 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode2:39793 (size: 44.8 KiB, free: 912.1 MiB)
12:22:28.989 [spark-dynamic-executor-allocation] INFO  org.apache.spark.ExecutorAllocationManager - Requesting 1 new executor because tasks are backlogged (new desired total will be 4 for resource profile id: 0)
12:22:30.004 [spark-dynamic-executor-allocation] INFO  org.apache.spark.ExecutorAllocationManager - Requesting 2 new executors because tasks are backlogged (new desired total will be 6 for resource profile id: 0)
12:22:30.010 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode3:33879 (size: 44.8 KiB, free: 912.2 MiB)
12:22:31.018 [spark-dynamic-executor-allocation] INFO  org.apache.spark.ExecutorAllocationManager - Requesting 3 new executors because tasks are backlogged (new desired total will be 9 for resource profile id:

[Stage 4:>                                                         (0 + 8) / 45]

12:22:33.266 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_7_piece0 in memory on datanode2:38315 (size: 106.3 KiB, free: 912.2 MiB)
12:22:33.351 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnDriverEndpoint - Registered executor NettyRpcEndpointRef(spark-client://Executor) (192.168.12.196:48010) with ID 6,  ResourceProfileId 0
12:22:33.352 [spark-listener-group-executorManagement] INFO  org.apache.spark.scheduler.dynalloc.ExecutorMonitor - New executor 6 has registered (new total is 5)
12:22:33.408 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Registering block manager datanode1:42309 with 912.3 MiB RAM, BlockManagerId(6, datanode1, 42309, None)
12:22:33.448 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 8.0 in stage 4.0 (TID 12) (datanode1, executor 6, partition 8, NODE_LOCAL, 9690 

[Stage 4:>                                                        (0 + 10) / 45]

12:22:33.725 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_7_piece0 in memory on datanode1:42309 (size: 106.3 KiB, free: 912.2 MiB)
12:22:34.577 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnDriverEndpoint - Registered executor NettyRpcEndpointRef(spark-client://Executor) (192.168.12.118:60710) with ID 5,  ResourceProfileId 0
12:22:34.579 [spark-listener-group-executorManagement] INFO  org.apache.spark.scheduler.dynalloc.ExecutorMonitor - New executor 5 has registered (new total is 6)
12:22:34.738 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Registering block manager datanode3:36221 with 912.3 MiB RAM, BlockManagerId(5, datanode3, 36221, None)
12:22:34.821 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 10.0 in stage 4.0 (TID 14) (datanode3, executor 5, partition 10, NODE_LOCAL, 969

[Stage 4:>                                                        (0 + 12) / 45]

12:22:35.231 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_7_piece0 in memory on datanode3:36221 (size: 106.3 KiB, free: 912.2 MiB)
12:22:35.492 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode2:38315 (size: 44.8 KiB, free: 912.2 MiB)
12:22:35.716 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode1:42309 (size: 44.8 KiB, free: 912.2 MiB)
12:22:37.602 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode3:36221 (size: 44.8 KiB, free: 912.2 MiB)


[Stage 4:>                                                        (0 + 12) / 45]

12:24:01.963 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 12.0 in stage 4.0 (TID 16) (datanode1, executor 3, partition 12, NODE_LOCAL, 9690 bytes) 
12:24:01.967 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 4.0 in stage 4.0 (TID 8) in 94094 ms on datanode1 (executor 3) (1/45)


[Stage 4:=>                                                       (1 + 12) / 45]

12:24:04.214 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 13.0 in stage 4.0 (TID 17) (datanode1, executor 3, partition 13, NODE_LOCAL, 9690 bytes) 
12:24:04.215 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 1.0 in stage 4.0 (TID 5) in 96343 ms on datanode1 (executor 3) (2/45)


[Stage 4:==>                                                      (2 + 12) / 45]

12:24:04.622 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 14.0 in stage 4.0 (TID 18) (datanode2, executor 1, partition 14, NODE_LOCAL, 9690 bytes) 
12:24:04.622 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 4.0 (TID 4) in 96750 ms on datanode2 (executor 1) (3/45)


[Stage 4:===>                                                     (3 + 12) / 45]

12:24:06.073 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 15.0 in stage 4.0 (TID 19) (datanode3, executor 2, partition 15, NODE_LOCAL, 9690 bytes) 
12:24:06.074 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 5.0 in stage 4.0 (TID 9) in 98201 ms on datanode3 (executor 2) (4/45)


[Stage 4:=====>                                                   (4 + 12) / 45]

12:24:07.007 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 16.0 in stage 4.0 (TID 20) (datanode2, executor 1, partition 16, NODE_LOCAL, 9690 bytes) 
12:24:07.008 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 3.0 in stage 4.0 (TID 7) in 99135 ms on datanode2 (executor 1) (5/45)




12:24:08.245 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 17.0 in stage 4.0 (TID 21) (datanode2, executor 4, partition 17, NODE_LOCAL, 9690 bytes) 
12:24:08.246 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 7.0 in stage 4.0 (TID 11) in 95333 ms on datanode2 (executor 4) (6/45)




12:24:09.066 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 18.0 in stage 4.0 (TID 22) (datanode3, executor 2, partition 18, NODE_LOCAL, 9690 bytes) 
12:24:09.067 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 2.0 in stage 4.0 (TID 6) in 101195 ms on datanode3 (executor 2) (7/45)




12:24:10.558 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 19.0 in stage 4.0 (TID 23) (datanode2, executor 4, partition 19, NODE_LOCAL, 9690 bytes) 
12:24:10.559 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 6.0 in stage 4.0 (TID 10) in 97647 ms on datanode2 (executor 4) (8/45)




12:24:11.136 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 20.0 in stage 4.0 (TID 24) (datanode1, executor 6, partition 20, NODE_LOCAL, 9690 bytes) 
12:24:11.137 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 8.0 in stage 4.0 (TID 12) in 97690 ms on datanode1 (executor 6) (9/45)




12:24:12.008 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 21.0 in stage 4.0 (TID 25) (datanode1, executor 6, partition 21, NODE_LOCAL, 9690 bytes) 
12:24:12.010 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 9.0 in stage 4.0 (TID 13) in 98562 ms on datanode1 (executor 6) (10/45)




12:24:12.256 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 22.0 in stage 4.0 (TID 26) (datanode3, executor 5, partition 22, NODE_LOCAL, 9690 bytes) 
12:24:12.257 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 11.0 in stage 4.0 (TID 15) in 97436 ms on datanode3 (executor 5) (11/45)
12:24:15.367 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 23.0 in stage 4.0 (TID 27) (datanode3, executor 5, partition 23, NODE_LOCAL, 9690 bytes) 
12:24:15.367 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 10.0 in stage 4.0 (TID 14) in 100547 ms on datanode3 (executor 5) (12/45)




12:25:32.056 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 24.0 in stage 4.0 (TID 28) (datanode1, executor 3, partition 24, NODE_LOCAL, 9690 bytes) 
12:25:32.057 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 12.0 in stage 4.0 (TID 16) in 90095 ms on datanode1 (executor 3) (13/45)




12:25:37.397 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 25.0 in stage 4.0 (TID 29) (datanode2, executor 1, partition 25, NODE_LOCAL, 9690 bytes) 
12:25:37.399 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 16.0 in stage 4.0 (TID 20) in 90392 ms on datanode2 (executor 1) (14/45)
12:25:37.586 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 26.0 in stage 4.0 (TID 30) (datanode2, executor 1, partition 26, NODE_LOCAL, 9690 bytes) 
12:25:37.590 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 14.0 in stage 4.0 (TID 18) in 92968 ms on datanode2 (executor 1) (15/45)




12:25:40.886 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 27.0 in stage 4.0 (TID 31) (datanode2, executor 4, partition 27, NODE_LOCAL, 9690 bytes) 
12:25:40.889 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 17.0 in stage 4.0 (TID 21) in 92645 ms on datanode2 (executor 4) (16/45)
12:25:40.909 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 28.0 in stage 4.0 (TID 32) (datanode1, executor 3, partition 28, NODE_LOCAL, 9690 bytes) 
12:25:40.910 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 13.0 in stage 4.0 (TID 17) in 96697 ms on datanode1 (executor 3) (17/45)




12:25:41.315 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 29.0 in stage 4.0 (TID 33) (datanode1, executor 6, partition 29, NODE_LOCAL, 9690 bytes) 
12:25:41.317 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 21.0 in stage 4.0 (TID 25) in 89309 ms on datanode1 (executor 6) (18/45)




12:25:41.741 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 30.0 in stage 4.0 (TID 34) (datanode3, executor 2, partition 30, NODE_LOCAL, 9690 bytes) 
12:25:41.742 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 15.0 in stage 4.0 (TID 19) in 95670 ms on datanode3 (executor 2) (19/45)




12:25:42.091 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 31.0 in stage 4.0 (TID 35) (datanode3, executor 2, partition 31, NODE_LOCAL, 9690 bytes) 
12:25:42.092 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 18.0 in stage 4.0 (TID 22) in 93026 ms on datanode3 (executor 2) (20/45)




12:25:44.572 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 32.0 in stage 4.0 (TID 36) (datanode3, executor 5, partition 32, NODE_LOCAL, 9690 bytes) 
12:25:44.573 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 22.0 in stage 4.0 (TID 26) in 92317 ms on datanode3 (executor 5) (21/45)




12:25:45.353 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 33.0 in stage 4.0 (TID 37) (datanode1, executor 6, partition 33, NODE_LOCAL, 9690 bytes) 
12:25:45.354 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 20.0 in stage 4.0 (TID 24) in 94219 ms on datanode1 (executor 6) (22/45)




12:25:46.590 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 34.0 in stage 4.0 (TID 38) (datanode2, executor 4, partition 34, NODE_LOCAL, 9690 bytes) 
12:25:46.591 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 19.0 in stage 4.0 (TID 23) in 96033 ms on datanode2 (executor 4) (23/45)




12:25:50.584 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 35.0 in stage 4.0 (TID 39) (datanode3, executor 5, partition 35, NODE_LOCAL, 9690 bytes) 
12:25:50.586 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 23.0 in stage 4.0 (TID 27) in 95220 ms on datanode3 (executor 5) (24/45)




12:27:06.585 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 36.0 in stage 4.0 (TID 40) (datanode1, executor 3, partition 36, NODE_LOCAL, 9690 bytes) 
12:27:06.586 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 24.0 in stage 4.0 (TID 28) in 94531 ms on datanode1 (executor 3) (25/45)




12:27:08.874 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 37.0 in stage 4.0 (TID 41) (datanode2, executor 1, partition 37, NODE_LOCAL, 9690 bytes) 
12:27:08.875 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 25.0 in stage 4.0 (TID 29) in 91478 ms on datanode2 (executor 1) (26/45)




12:27:09.825 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 38.0 in stage 4.0 (TID 42) (datanode1, executor 3, partition 38, NODE_LOCAL, 9690 bytes) 
12:27:09.825 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 28.0 in stage 4.0 (TID 32) in 88916 ms on datanode1 (executor 3) (27/45)




12:27:10.562 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 39.0 in stage 4.0 (TID 43) (datanode3, executor 2, partition 39, NODE_LOCAL, 9690 bytes) 
12:27:10.563 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 31.0 in stage 4.0 (TID 35) in 88473 ms on datanode3 (executor 2) (28/45)




12:27:12.108 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 40.0 in stage 4.0 (TID 44) (datanode1, executor 6, partition 40, NODE_LOCAL, 9690 bytes) 
12:27:12.109 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 29.0 in stage 4.0 (TID 33) in 90794 ms on datanode1 (executor 6) (29/45)




12:27:12.528 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 41.0 in stage 4.0 (TID 45) (datanode2, executor 4, partition 41, NODE_LOCAL, 9690 bytes) 
12:27:12.529 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 34.0 in stage 4.0 (TID 38) in 85940 ms on datanode2 (executor 4) (30/45)




12:27:13.338 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 42.0 in stage 4.0 (TID 46) (datanode2, executor 4, partition 42, NODE_LOCAL, 9690 bytes) 
12:27:13.339 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 27.0 in stage 4.0 (TID 31) in 92454 ms on datanode2 (executor 4) (31/45)
12:27:13.370 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 43.0 in stage 4.0 (TID 47) (datanode2, executor 1, partition 43, NODE_LOCAL, 9690 bytes) 
12:27:13.371 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 26.0 in stage 4.0 (TID 30) in 95785 ms on datanode2 (executor 1) (32/45)




12:27:14.411 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 44.0 in stage 4.0 (TID 48) (datanode1, executor 6, partition 44, NODE_LOCAL, 9864 bytes) 
12:27:14.412 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 33.0 in stage 4.0 (TID 37) in 89059 ms on datanode1 (executor 6) (33/45)




12:27:17.768 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 35.0 in stage 4.0 (TID 39) in 87184 ms on datanode3 (executor 5) (34/45)
12:27:17.797 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 30.0 in stage 4.0 (TID 34) in 96056 ms on datanode3 (executor 2) (35/45)




12:27:19.818 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 32.0 in stage 4.0 (TID 36) in 95246 ms on datanode3 (executor 5) (36/45)




12:28:19.860 [spark-dynamic-executor-allocation] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Requesting to kill executor(s) 5
12:28:19.866 [spark-dynamic-executor-allocation] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Actual list of executor(s) to be killed is 5
12:28:19.924 [spark-dynamic-executor-allocation] INFO  org.apache.spark.ExecutorAllocationManager - Executors 5 removed due to idle timeout.




12:28:21.242 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnDriverEndpoint - Disabling executor 5.
12:28:21.250 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Executor lost: 5 (epoch 0)
12:28:21.251 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Trying to remove executor 5 from BlockManagerMaster.
12:28:21.252 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Removing block manager BlockManagerId(5, datanode3, 36221, None)
12:28:21.253 [dag-scheduler-event-loop] INFO  org.apache.spark.storage.BlockManagerMaster - Removed 5 successfully in removeExecutor
12:28:21.253 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Shuffle files lost for executor: 5 (epoch 0)
12:28:21.273 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Executor 5 on datanode3 killed 



12:28:37.185 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 38.0 in stage 4.0 (TID 42) in 87361 ms on datanode1 (executor 3) (38/45)
12:28:37.291 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 41.0 in stage 4.0 (TID 45) in 84764 ms on datanode2 (executor 4) (39/45)




12:28:37.525 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 37.0 in stage 4.0 (TID 41) in 88652 ms on datanode2 (executor 1) (40/45)




12:28:38.194 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 42.0 in stage 4.0 (TID 46) in 84856 ms on datanode2 (executor 4) (41/45)




12:28:40.417 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 43.0 in stage 4.0 (TID 47) in 87047 ms on datanode2 (executor 1) (42/45)




12:28:41.655 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 40.0 in stage 4.0 (TID 44) in 89547 ms on datanode1 (executor 6) (43/45)




12:28:51.447 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 39.0 in stage 4.0 (TID 43) in 100885 ms on datanode3 (executor 2) (44/45)




12:29:37.279 [spark-dynamic-executor-allocation] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Requesting to kill executor(s) 3
12:29:37.280 [spark-dynamic-executor-allocation] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Actual list of executor(s) to be killed is 3
12:29:37.301 [spark-dynamic-executor-allocation] INFO  org.apache.spark.ExecutorAllocationManager - Executors 3 removed due to idle timeout.
12:29:37.836 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnDriverEndpoint - Disabling executor 3.
12:29:37.837 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Executor lost: 3 (epoch 1)
12:29:37.838 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Trying to remove executor 3 from BlockManagerMaster.
12:29:37.838 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Removing bl



12:29:54.519 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 44.0 in stage 4.0 (TID 48) in 160109 ms on datanode1 (executor 6) (45/45)
12:29:54.520 [task-result-getter-0] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 4.0, whose tasks have all completed, from pool 
12:29:54.523 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 4 (parquet at NativeMethodAccessorImpl.java:0) finished in 446.681 s
12:29:54.524 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 4 is finished. Cancelling potential speculative or zombie tasks for this job
12:29:54.524 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 4: Stage finished
12:29:54.525 [Thread-4] INFO  org.apache.spark.scheduler.DAGScheduler - Job 4 finished: parquet at NativeMethodAccessorImpl.java:0, took 446.694885 s
12:29:54.532 [Thread-4] INFO  o

                                                                                

12:29:54.773 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileFormatWriter - Write Job 2fce8e30-8a0b-4f49-84f0-124ad39ef127 committed. Elapsed time: 238 ms.
12:29:54.778 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileFormatWriter - Finished processing stats for write job 2fce8e30-8a0b-4f49-84f0-124ad39ef127.


In [10]:
# Verify results
result_df = spark.read.parquet(output_path)
result_df.show(10, truncate=False)
result_count = result_df.count()
print(f"Total records processed: {result_count}")

12:29:54.831 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.InMemoryFileIndex - It took 16 ms to list leaf files for 1 paths.
12:29:54.862 [Thread-4] INFO  org.apache.spark.SparkContext - Starting job: parquet at NativeMethodAccessorImpl.java:0
12:29:54.863 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Got job 5 (parquet at NativeMethodAccessorImpl.java:0) with 1 output partitions
12:29:54.863 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Final stage: ResultStage 5 (parquet at NativeMethodAccessorImpl.java:0)
12:29:54.863 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Parents of final stage: List()
12:29:54.863 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Missing parents: List()
12:29:54.864 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Submitting ResultStage 5 (MapPartitionsRDD[24] at parquet at NativeMethodAccessorImpl.java:0), 

In [11]:
sc.stop()

12:29:56.110 [Thread-4] INFO  org.apache.spark.SparkContext - SparkContext is stopping with exitCode 0.
12:29:56.131 [Thread-4] INFO  org.sparkproject.jetty.server.AbstractConnector - Stopped Spark@1768c80a{HTTP/1.1, (http/1.1)}{0.0.0.0:4040}
12:29:56.138 [Thread-4] INFO  org.apache.spark.ui.SparkUI - Stopped Spark web UI at http://namenode:4040
12:29:56.157 [YARN application state monitor] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Interrupting monitor thread
12:29:56.179 [Thread-4] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Shutting down all executors
12:29:56.179 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnDriverEndpoint - Asking each executor to shut down
12:29:56.187 [Thread-4] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - YARN client scheduler backend Stopped
12:29:56.231 [dispatcher-event-loop-1] INFO  org.apache.spark.MapOutputTrackerMasterEndp