In [1]:
# Import required libraries
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import sys

In [2]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Sentiment Calculation UDF") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.minExecutors", "3") \
    .config("spark.dynamicAllocation.maxExecutors", "9") \
    .config("spark.dynamicAllocation.initialExecutors", "3") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.extraPythonPackages", "pandas,vaderSentiment") \
    .config("spark.driver.extraPythonPackages", "pandas,vaderSentiment") \
    .config("spark.executorEnv.PYTHONPATH", ":".join(sys.path)) \
    .getOrCreate()

# Set log level for cleaner outputs
sc = spark.sparkContext
sc.setLogLevel("INFO")


In [3]:
# HDFS path for the dataset
file_path = "hdfs://namenode:9000/data/cleaned_dataset.parquet"

# Read the Parquet file into a DataFrame
df = spark.read.parquet(file_path)

# Display schema and sample rows
df.printSchema()
df.show(5, truncate=False)

12:04:36.723 [Thread-4] INFO  org.apache.spark.sql.internal.SharedState - Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
12:04:36.730 [Thread-4] INFO  org.apache.spark.sql.internal.SharedState - Warehouse path is 'file:/home/ubuntu/project/cluster-notebooks/spark-warehouse'.
12:04:36.740 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - Adding filter to /SQL: org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
12:04:36.742 [Thread-4] INFO  org.sparkproject.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@164a6dac{/SQL,null,AVAILABLE,@Spark}
12:04:36.742 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - Adding filter to /SQL/json: org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
12:04:36.743 [Thread-4] INFO  org.sparkproject.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@2a7b5f23{/SQL/json,null,AVAILABLE,@Spark}
12:04:36.743 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - A

[Stage 0:>                                                          (0 + 1) / 1]

12:04:39.901 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 0.0 (TID 0) in 1968 ms on datanode3 (executor 1) (1/1)
12:04:39.903 [task-result-getter-0] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 0.0, whose tasks have all completed, from pool 
12:04:39.908 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 0 (parquet at NativeMethodAccessorImpl.java:0) finished in 2.095 s
12:04:39.912 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 0 is finished. Cancelling potential speculative or zombie tasks for this job
12:04:39.913 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 0: Stage finished
12:04:39.915 [Thread-4] INFO  org.apache.spark.scheduler.DAGScheduler - Job 0 finished: parquet at NativeMethodAccessorImpl.java:0, took 2.191466 s


                                                                                

12:04:40.160 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_0_piece0 on namenode:37805 in memory (size: 46.9 KiB, free: 2004.6 MiB)
12:04:40.170 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_0_piece0 on datanode3:42353 in memory (size: 46.9 KiB, free: 912.3 MiB)
root
 |-- link: string (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- score: long (nullable = true)
 |-- sub_reddit: string (nullable = true)
 |-- post_id: string (nullable = true)
 |-- comment_id: string (nullable = true)
 |-- body: string (nullable = true)

12:04:41.022 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pushed Filters: 
12:04:41.023 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: 
12:04:41.408 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 230.862231 ms


[Stage 1:>                                                          (0 + 1) / 1]

12:04:42.804 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 1.0 (TID 1) in 1192 ms on datanode3 (executor 1) (1/1)
12:04:42.804 [task-result-getter-1] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 1.0, whose tasks have all completed, from pool 
12:04:42.805 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 1 (showString at NativeMethodAccessorImpl.java:0) finished in 1.241 s
12:04:42.806 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 1 is finished. Cancelling potential speculative or zombie tasks for this job
12:04:42.806 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 1: Stage finished
12:04:42.806 [Thread-4] INFO  org.apache.spark.scheduler.DAGScheduler - Job 1 finished: showString at NativeMethodAccessorImpl.java:0, took 1.268451 s


                                                                                

12:04:43.817 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 23.704362 ms
+----------------------------------------------------------------------------------------------------------------+-----------+-----+------------+-------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|link                                                          

In [4]:
# Function to clean comments
def clean_comment_spark(df, column):
    """Clean comments in the specified column."""
    return df.withColumn(
        f"{column}_clean",
        F.trim(
            F.regexp_replace(
                F.regexp_replace(
                    F.regexp_replace(
                        F.lower(F.col(column)),  # Convert to lowercase
                        r"http\S+|www\S+|https\S+", ""),  # Remove URLs
                    r"@\w+|#", ""),  # Remove mentions and hashtags
                r"[^\w\s]", ""),  # Remove special characters and punctuation
            )
        )


In [5]:
# Clean the comments and select relevant columns
df = clean_comment_spark(df, "body").select("comment_id", "body_clean")
df.show(5, truncate=False)

12:04:44.032 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pushed Filters: 
12:04:44.033 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: 
12:04:44.119 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 55.662525 ms
12:04:44.125 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_3 stored as values in memory (estimated size 408.2 KiB, free 2003.7 MiB)
12:04:44.140 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_3_piece0 stored as bytes in memory (estimated size 44.9 KiB, free 2003.7 MiB)
12:04:44.141 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_3_piece0 in memory on namenode:37805 (size: 44.9 KiB, free: 2004.5 MiB)
12:04:44.144 [Thread-4] INFO  org.apache.spark.SparkContext - Created broadcast 3 from showString at NativeMethodAccessorImpl.java

[Stage 2:>                                                          (0 + 1) / 1]

12:04:45.299 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_3_piece0 in memory on datanode1:41313 (size: 44.9 KiB, free: 912.2 MiB)
12:04:45.459 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_2_piece0 on namenode:37805 in memory (size: 6.9 KiB, free: 2004.5 MiB)
12:04:45.463 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_2_piece0 on datanode3:42353 in memory (size: 6.9 KiB, free: 912.3 MiB)
12:04:45.473 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_1_piece0 on namenode:37805 in memory (size: 45.0 KiB, free: 2004.5 MiB)
12:04:45.476 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_1_piece0 on datanode3:42353 in memory (size: 45.0 KiB, free: 912.3 MiB)
12:04:46.837 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetMan

                                                                                

In [6]:
# Create and broadcast the SentimentIntensityAnalyzer
analyzer_broadcast = sc.broadcast(SentimentIntensityAnalyzer())

12:04:46.920 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_5 stored as values in memory (estimated size 312.0 B, free 2004.1 MiB)
12:04:46.931 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_5_piece0 stored as bytes in memory (estimated size 379.2 KiB, free 2003.8 MiB)
12:04:46.933 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on namenode:37805 (size: 379.2 KiB, free: 2004.2 MiB)
12:04:46.934 [Thread-4] INFO  org.apache.spark.SparkContext - Created broadcast 5 from broadcast at NativeMethodAccessorImpl.java:0


In [7]:
# Define a UDF for sentiment analysis
def calculate_sentiment_udf(text):
    analyzer = analyzer_broadcast.value
    if text:
        return analyzer.polarity_scores(text)['compound']
    else:
        return None

In [8]:
# Register the UDF
sentiment_udf = F.udf(calculate_sentiment_udf, FloatType())

# Apply the UDF to calculate sentiment
df = df.withColumn("sentiment", sentiment_udf(F.col("body_clean")))

In [9]:
# Write the results to HDFS in Parquet format
output_path = "hdfs://namenode:9000/data/results/comment_sentiment.parquet"
df.select("comment_id", "sentiment").write.mode("overwrite").parquet(output_path)

12:04:47.143 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pushed Filters: 
12:04:47.143 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: 
12:04:47.199 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.parquet.ParquetUtils - Using default output committer for Parquet: org.apache.parquet.hadoop.ParquetOutputCommitter
12:04:47.216 [Thread-4] INFO  org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - File Output Committer Algorithm version is 1
12:04:47.216 [Thread-4] INFO  org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
12:04:47.216 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol - Using user defined output committer class org.apache.parquet.hadoop.ParquetOutputCommitter
12:04:47.217 [Thread-4] INFO  org.apache.hadoop

[Stage 3:>                                                         (0 + 6) / 45]

12:04:48.199 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode3:42353 (size: 379.2 KiB, free: 911.8 MiB)
12:04:48.221 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_python on disk on datanode3:42353 (size: 822.8 KiB)
12:04:48.243 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_6_piece0 in memory on datanode3:42353 (size: 44.9 KiB, free: 911.8 MiB)
12:04:48.386 [spark-dynamic-executor-allocation] INFO  org.apache.spark.ExecutorAllocationManager - Requesting 1 new executor because tasks are backlogged (new desired total will be 4 for resource profile id: 0)
12:04:48.402 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode1:41313 (size: 379.2 KiB, free: 911.8 MiB)
12:04:48.435 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.stor

[Stage 3:>                                                         (0 + 8) / 45]

12:04:52.275 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_7_piece0 in memory on datanode2:35337 (size: 104.9 KiB, free: 912.2 MiB)
12:04:53.495 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnDriverEndpoint - Registered executor NettyRpcEndpointRef(spark-client://Executor) (192.168.12.118:39120) with ID 6,  ResourceProfileId 0
12:04:53.497 [spark-listener-group-executorManagement] INFO  org.apache.spark.scheduler.dynalloc.ExecutorMonitor - New executor 6 has registered (new total is 5)
12:04:53.583 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Registering block manager datanode3:40487 with 912.3 MiB RAM, BlockManagerId(6, datanode3, 40487, None)
12:04:53.637 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 8.0 in stage 3.0 (TID 11) (datanode3, executor 6, partition 8, NODE_LOCAL, 9690 

[Stage 3:>                                                        (0 + 12) / 45]

12:04:54.034 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_7_piece0 in memory on datanode3:40487 (size: 104.9 KiB, free: 912.2 MiB)
12:04:54.232 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_7_piece0 in memory on datanode1:35917 (size: 104.9 KiB, free: 912.2 MiB)
12:04:54.815 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_piece0 in memory on datanode2:35337 (size: 379.2 KiB, free: 911.8 MiB)
12:04:54.873 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_5_python on disk on datanode2:35337 (size: 822.8 KiB)
12:04:55.082 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_6_piece0 in memory on datanode2:35337 (size: 44.9 KiB, free: 911.8 MiB)
12:04:56.349 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added

[Stage 3:>                                                        (0 + 12) / 45]

12:06:25.200 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 12.0 in stage 3.0 (TID 15) (datanode2, executor 2, partition 12, NODE_LOCAL, 9690 bytes) 
12:06:25.208 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 5.0 in stage 3.0 (TID 8) in 97841 ms on datanode2 (executor 2) (1/45)
12:06:25.212 [dag-scheduler-event-loop] INFO  org.apache.spark.api.python.PythonAccumulatorV2 - Connected to AccumulatorServer at host: 127.0.0.1 port: 60387


[Stage 3:=>                                                       (1 + 12) / 45]

12:06:25.966 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 13.0 in stage 3.0 (TID 16) (datanode1, executor 3, partition 13, NODE_LOCAL, 9690 bytes) 
12:06:25.967 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 4.0 in stage 3.0 (TID 7) in 98601 ms on datanode1 (executor 3) (2/45)


[Stage 3:==>                                                      (2 + 12) / 45]

12:06:27.042 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 14.0 in stage 3.0 (TID 17) (datanode2, executor 2, partition 14, NODE_LOCAL, 9690 bytes) 
12:06:27.042 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 2.0 in stage 3.0 (TID 5) in 99680 ms on datanode2 (executor 2) (3/45)


[Stage 3:===>                                                     (3 + 12) / 45]

12:06:27.992 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 15.0 in stage 3.0 (TID 18) (datanode3, executor 1, partition 15, NODE_LOCAL, 9690 bytes) 
12:06:27.993 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 3.0 (TID 3) in 100632 ms on datanode3 (executor 1) (4/45)


[Stage 3:=====>                                                   (4 + 12) / 45]

12:06:28.647 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 16.0 in stage 3.0 (TID 19) (datanode1, executor 3, partition 16, NODE_LOCAL, 9690 bytes) 
12:06:28.648 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 1.0 in stage 3.0 (TID 4) in 101287 ms on datanode1 (executor 3) (5/45)




12:06:30.243 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 17.0 in stage 3.0 (TID 20) (datanode2, executor 4, partition 17, NODE_LOCAL, 9690 bytes) 
12:06:30.244 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 7.0 in stage 3.0 (TID 10) in 98316 ms on datanode2 (executor 4) (6/45)




12:06:30.967 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 18.0 in stage 3.0 (TID 21) (datanode3, executor 1, partition 18, NODE_LOCAL, 9690 bytes) 
12:06:30.968 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 3.0 in stage 3.0 (TID 6) in 103606 ms on datanode3 (executor 1) (7/45)




12:06:31.849 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 19.0 in stage 3.0 (TID 22) (datanode2, executor 4, partition 19, NODE_LOCAL, 9690 bytes) 
12:06:31.850 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 6.0 in stage 3.0 (TID 9) in 99926 ms on datanode2 (executor 4) (8/45)




12:06:33.903 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 20.0 in stage 3.0 (TID 23) (datanode3, executor 6, partition 20, NODE_LOCAL, 9690 bytes) 
12:06:33.904 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 8.0 in stage 3.0 (TID 11) in 100267 ms on datanode3 (executor 6) (9/45)
12:06:33.907 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 21.0 in stage 3.0 (TID 24) (datanode1, executor 5, partition 21, NODE_LOCAL, 9690 bytes) 
12:06:33.908 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 11.0 in stage 3.0 (TID 14) in 100014 ms on datanode1 (executor 5) (10/45)




12:06:34.908 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 22.0 in stage 3.0 (TID 25) (datanode3, executor 6, partition 22, NODE_LOCAL, 9690 bytes) 
12:06:34.909 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 9.0 in stage 3.0 (TID 12) in 101272 ms on datanode3 (executor 6) (11/45)




12:06:36.969 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 23.0 in stage 3.0 (TID 26) (datanode1, executor 5, partition 23, NODE_LOCAL, 9690 bytes) 
12:06:36.970 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 10.0 in stage 3.0 (TID 13) in 103077 ms on datanode1 (executor 5) (12/45)




12:07:57.393 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 24.0 in stage 3.0 (TID 27) (datanode2, executor 2, partition 24, NODE_LOCAL, 9690 bytes) 
12:07:57.395 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 12.0 in stage 3.0 (TID 15) in 92196 ms on datanode2 (executor 2) (13/45)




12:08:02.101 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 25.0 in stage 3.0 (TID 28) (datanode2, executor 2, partition 25, NODE_LOCAL, 9690 bytes) 
12:08:02.101 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 14.0 in stage 3.0 (TID 17) in 95060 ms on datanode2 (executor 2) (14/45)




12:08:02.801 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 26.0 in stage 3.0 (TID 29) (datanode1, executor 3, partition 26, NODE_LOCAL, 9690 bytes) 
12:08:02.802 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 16.0 in stage 3.0 (TID 19) in 94155 ms on datanode1 (executor 3) (15/45)




12:08:04.100 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 27.0 in stage 3.0 (TID 30) (datanode3, executor 1, partition 27, NODE_LOCAL, 9690 bytes) 
12:08:04.103 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 15.0 in stage 3.0 (TID 18) in 96111 ms on datanode3 (executor 1) (16/45)




12:08:05.846 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 28.0 in stage 3.0 (TID 31) (datanode3, executor 1, partition 28, NODE_LOCAL, 9690 bytes) 
12:08:05.847 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 18.0 in stage 3.0 (TID 21) in 94880 ms on datanode3 (executor 1) (17/45)




12:08:06.182 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 29.0 in stage 3.0 (TID 32) (datanode2, executor 4, partition 29, NODE_LOCAL, 9690 bytes) 
12:08:06.183 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 17.0 in stage 3.0 (TID 20) in 95940 ms on datanode2 (executor 4) (18/45)




12:08:06.887 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 30.0 in stage 3.0 (TID 33) (datanode1, executor 3, partition 30, NODE_LOCAL, 9690 bytes) 
12:08:06.888 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 13.0 in stage 3.0 (TID 16) in 100923 ms on datanode1 (executor 3) (19/45)




12:08:07.524 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 31.0 in stage 3.0 (TID 34) (datanode1, executor 5, partition 31, NODE_LOCAL, 9690 bytes) 
12:08:07.525 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 21.0 in stage 3.0 (TID 24) in 93619 ms on datanode1 (executor 5) (20/45)




12:08:08.711 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 32.0 in stage 3.0 (TID 35) (datanode3, executor 6, partition 32, NODE_LOCAL, 9690 bytes) 
12:08:08.713 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 22.0 in stage 3.0 (TID 25) in 93806 ms on datanode3 (executor 6) (21/45)




12:08:12.609 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 33.0 in stage 3.0 (TID 36) (datanode3, executor 6, partition 33, NODE_LOCAL, 9690 bytes) 
12:08:12.610 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 20.0 in stage 3.0 (TID 23) in 98708 ms on datanode3 (executor 6) (22/45)




12:08:13.092 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 19.0 in stage 3.0 (TID 22) in 101242 ms on datanode2 (executor 4) (23/45)
12:08:13.093 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 34.0 in stage 3.0 (TID 37) (datanode2, executor 4, partition 34, NODE_LOCAL, 9690 bytes) 




12:08:13.473 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 35.0 in stage 3.0 (TID 38) (datanode1, executor 5, partition 35, NODE_LOCAL, 9690 bytes) 
12:08:13.475 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 23.0 in stage 3.0 (TID 26) in 96506 ms on datanode1 (executor 5) (24/45)




12:09:31.797 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 36.0 in stage 3.0 (TID 39) (datanode2, executor 2, partition 36, NODE_LOCAL, 9690 bytes) 
12:09:31.800 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 24.0 in stage 3.0 (TID 27) in 94407 ms on datanode2 (executor 2) (25/45)




12:09:36.129 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 37.0 in stage 3.0 (TID 40) (datanode2, executor 2, partition 37, NODE_LOCAL, 9690 bytes) 
12:09:36.130 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 25.0 in stage 3.0 (TID 28) in 94030 ms on datanode2 (executor 2) (26/45)




12:09:38.088 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 38.0 in stage 3.0 (TID 41) (datanode1, executor 5, partition 38, NODE_LOCAL, 9690 bytes) 
12:09:38.089 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 31.0 in stage 3.0 (TID 34) in 90566 ms on datanode1 (executor 5) (27/45)




12:09:38.938 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 39.0 in stage 3.0 (TID 42) (datanode2, executor 4, partition 39, NODE_LOCAL, 9690 bytes) 
12:09:38.940 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 29.0 in stage 3.0 (TID 32) in 92757 ms on datanode2 (executor 4) (28/45)
12:09:39.088 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 40.0 in stage 3.0 (TID 43) (datanode3, executor 1, partition 40, NODE_LOCAL, 9690 bytes) 
12:09:39.090 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 28.0 in stage 3.0 (TID 31) in 93244 ms on datanode3 (executor 1) (29/45)




12:09:40.871 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 41.0 in stage 3.0 (TID 44) (datanode3, executor 1, partition 41, NODE_LOCAL, 9690 bytes) 
12:09:40.872 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 27.0 in stage 3.0 (TID 30) in 96771 ms on datanode3 (executor 1) (30/45)




12:09:42.670 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 42.0 in stage 3.0 (TID 45) (datanode3, executor 6, partition 42, NODE_LOCAL, 9690 bytes) 
12:09:42.671 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 33.0 in stage 3.0 (TID 36) in 90063 ms on datanode3 (executor 6) (31/45)
12:09:42.773 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 43.0 in stage 3.0 (TID 46) (datanode1, executor 5, partition 43, NODE_LOCAL, 9690 bytes) 
12:09:42.775 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 35.0 in stage 3.0 (TID 38) in 89302 ms on datanode1 (executor 5) (32/45)




12:09:43.944 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.TaskSetManager - Starting task 44.0 in stage 3.0 (TID 47) (datanode1, executor 3, partition 44, NODE_LOCAL, 9864 bytes) 
12:09:43.946 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 30.0 in stage 3.0 (TID 33) in 97059 ms on datanode1 (executor 3) (33/45)
12:09:44.033 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 34.0 in stage 3.0 (TID 37) in 90940 ms on datanode2 (executor 4) (34/45)




12:09:45.007 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 26.0 in stage 3.0 (TID 29) in 102207 ms on datanode1 (executor 3) (35/45)




12:09:46.737 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 32.0 in stage 3.0 (TID 35) in 98026 ms on datanode3 (executor 6) (36/45)




12:10:54.478 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 36.0 in stage 3.0 (TID 39) in 82682 ms on datanode2 (executor 2) (37/45)




12:11:01.511 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 37.0 in stage 3.0 (TID 40) in 85382 ms on datanode2 (executor 2) (38/45)




12:11:02.827 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 38.0 in stage 3.0 (TID 41) in 84740 ms on datanode1 (executor 5) (39/45)




12:11:03.472 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 41.0 in stage 3.0 (TID 44) in 82602 ms on datanode3 (executor 1) (40/45)




12:11:05.726 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 40.0 in stage 3.0 (TID 43) in 86638 ms on datanode3 (executor 1) (41/45)




12:11:06.149 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 42.0 in stage 3.0 (TID 45) in 83479 ms on datanode3 (executor 6) (42/45)




12:11:07.391 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 43.0 in stage 3.0 (TID 46) in 84618 ms on datanode1 (executor 5) (43/45)




12:11:22.107 [task-result-getter-2] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 39.0 in stage 3.0 (TID 42) in 103169 ms on datanode2 (executor 4) (44/45)




12:12:01.519 [spark-dynamic-executor-allocation] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Requesting to kill executor(s) 2
12:12:01.523 [spark-dynamic-executor-allocation] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Actual list of executor(s) to be killed is 2
12:12:01.566 [spark-dynamic-executor-allocation] INFO  org.apache.spark.ExecutorAllocationManager - Executors 2 removed due to idle timeout.
12:12:03.598 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnDriverEndpoint - Disabling executor 2.
12:12:03.607 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Executor lost: 2 (epoch 0)
12:12:03.610 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Trying to remove executor 2 from BlockManagerMaster.
12:12:03.611 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerMasterEndpoint - Removing bl



12:12:25.252 [task-result-getter-3] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 44.0 in stage 3.0 (TID 47) in 161309 ms on datanode1 (executor 3) (45/45)
12:12:25.252 [task-result-getter-3] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 3.0, whose tasks have all completed, from pool 
12:12:25.255 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 3 (parquet at NativeMethodAccessorImpl.java:0) finished in 457.929 s
12:12:25.256 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 3 is finished. Cancelling potential speculative or zombie tasks for this job
12:12:25.256 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 3: Stage finished
12:12:25.259 [Thread-4] INFO  org.apache.spark.scheduler.DAGScheduler - Job 3 finished: parquet at NativeMethodAccessorImpl.java:0, took 457.959577 s
12:12:25.267 [Thread-4] INFO  o

                                                                                

12:12:25.465 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileFormatWriter - Write Job 12bc96c1-6666-4d0b-b5b6-474bfd07717c committed. Elapsed time: 196 ms.
12:12:25.474 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileFormatWriter - Finished processing stats for write job 12bc96c1-6666-4d0b-b5b6-474bfd07717c.


In [10]:
# Read back the results and display a preview
result_df = spark.read.parquet(output_path)
result_df.show(10, truncate=False)
result_count = result_df.count()
print(f"Total records processed: {result_count}")

12:12:25.532 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.InMemoryFileIndex - It took 13 ms to list leaf files for 1 paths.
12:12:25.562 [Thread-4] INFO  org.apache.spark.SparkContext - Starting job: parquet at NativeMethodAccessorImpl.java:0
12:12:25.563 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Got job 4 (parquet at NativeMethodAccessorImpl.java:0) with 1 output partitions
12:12:25.563 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Final stage: ResultStage 4 (parquet at NativeMethodAccessorImpl.java:0)
12:12:25.563 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Parents of final stage: List()
12:12:25.563 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Missing parents: List()
12:12:25.564 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Submitting ResultStage 4 (MapPartitionsRDD[18] at parquet at NativeMethodAccessorImpl.java:0), 

In [11]:
sc.stop()

12:12:26.863 [Thread-4] INFO  org.apache.spark.SparkContext - SparkContext is stopping with exitCode 0.
12:12:26.875 [Thread-4] INFO  org.sparkproject.jetty.server.AbstractConnector - Stopped Spark@367c2f5d{HTTP/1.1, (http/1.1)}{0.0.0.0:4040}
12:12:26.878 [Thread-4] INFO  org.apache.spark.ui.SparkUI - Stopped Spark web UI at http://namenode:4040
12:12:26.885 [YARN application state monitor] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Interrupting monitor thread
12:12:26.896 [Thread-4] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - Shutting down all executors
12:12:26.897 [dispatcher-CoarseGrainedScheduler] INFO  org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnDriverEndpoint - Asking each executor to shut down
12:12:26.901 [Thread-4] INFO  org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend - YARN client scheduler backend Stopped
12:12:26.941 [dispatcher-event-loop-0] INFO  org.apache.spark.MapOutputTrackerMasterEndp