In [1]:
# Generate Spark Session
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Reading from Sockets")
    .getOrCreate()
)

spark

25/05/22 13:08:27 INFO  SharedState:60 Setting hive.metastore.warehouse.dir ('/opt/spark/spark-warehouse/') to the value of spark.sql.warehouse.dir.
25/05/22 13:08:27 INFO  SharedState:60 Warehouse path is 'file:/opt/spark/spark-warehouse'.
25/05/22 13:08:28 WARN  SparkSession:72 Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Read input data

df_raw = spark.readStream.format("socket").option("host","localhost").option("port", "9999").load()

25/05/22 13:08:29 WARN  TextSocketSourceProvider:72 The socket source should not be used for production applications! It does not support recovery.
25/05/22 13:08:29 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Registered executor NettyRpcEndpointRef(spark-client://Executor) (172.18.0.12:58624) with ID 1,  ResourceProfileId 0
25/05/22 13:08:29 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Registered executor NettyRpcEndpointRef(spark-client://Executor) (172.18.0.13:51334) with ID 2,  ResourceProfileId 0
25/05/22 13:08:29 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Registered executor NettyRpcEndpointRef(spark-client://Executor) (172.18.0.10:47394) with ID 0,  ResourceProfileId 0
25/05/22 13:08:29 INFO  BlockManagerMasterEndpoint:60 Registering block manager 172.18.0.12:44339 with 1048.8 MiB RAM, BlockManagerId(1, 172.18.0.12, 44339, None)
25/05/22 13:08:29 INFO  BlockManagerMasterEndpoint:60 Registering block manager 172.18.0.13:45205 wit

In [3]:
df_raw.printSchema()

root
 |-- value: string (nullable = true)



In [4]:
# Split the line into words
from pyspark.sql.functions import split

df_words = df_raw.withColumn("words", split("value", " "))

In [5]:
# Explode the list of words
from pyspark.sql.functions import explode

df_explode = df_words.withColumn("word", explode("words")).drop("value", "words")


In [6]:
# Aggregate the words to generate count
from pyspark.sql.functions import count, lit

df_agg = df_explode.groupBy("word").agg(count(lit(1)).alias("cnt"))

In [7]:
# Write the output to console streaming

df_agg.writeStream.format("console").outputMode("complete").start().awaitTermination()


25/05/22 13:13:10 INFO  StateStoreCoordinatorRef:60 Registered StateStoreCoordinator endpoint
25/05/22 13:13:10 WARN  ResolveWriteToStream:72 Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/05/22 13:13:10 INFO  ResolveWriteToStream:60 Checkpoint root file:///tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f resolved to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f.
25/05/22 13:13:10 WARN  ResolveWriteToStream:72 spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/22 13:13:10 INFO  CheckpointFileManager:60 Writing atomically to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/metadata using temp file file:/tmp/temporary-6ca37

-------------------------------------------
Batch: 0
-------------------------------------------
+----+---+
|word|cnt|
+----+---+
+----+---+



25/05/22 13:13:14 INFO  WriteToDataSourceV2Exec:60 Data source write support MicroBatchWrite[epoch: 0, writer: ConsoleWriter[numRows=20, truncate=true]] committed.
25/05/22 13:13:14 INFO  CheckpointFileManager:60 Writing atomically to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/0 using temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/.0.261eec87-7f24-4cb8-95f8-8ccb28f55646.tmp
25/05/22 13:13:14 INFO  CheckpointFileManager:60 Renamed temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/.0.261eec87-7f24-4cb8-95f8-8ccb28f55646.tmp to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/0
25/05/22 13:13:14 INFO  MicroBatchExecution:60 Streaming query made progress: {
  "id" : "60857abf-57a3-4ea2-bd64-5bbefc3250b3",
  "runId" : "725a483f-578a-4f03-aa77-bb043e1d3c7c",
  "name" : null,
  "timestamp" : "2025-05-22T13:13:10.657Z",
  "batchId" : 0,
  "numInputRows" : 0,
  "inputRowsPerSecond" : 0.0,
  "processedRows

-------------------------------------------
Batch: 1
-------------------------------------------


25/05/22 13:13:27 INFO  CodeGenerator:60 Code generated in 5.893008 ms
25/05/22 13:13:27 INFO  WriteToDataSourceV2Exec:60 Data source write support MicroBatchWrite[epoch: 1, writer: ConsoleWriter[numRows=20, truncate=true]] committed.
25/05/22 13:13:27 INFO  CheckpointFileManager:60 Writing atomically to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/1 using temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/.1.67fc2491-c7a4-4a14-951e-60539f6e0712.tmp
25/05/22 13:13:27 INFO  CheckpointFileManager:60 Renamed temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/.1.67fc2491-c7a4-4a14-951e-60539f6e0712.tmp to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/1
25/05/22 13:13:27 INFO  MicroBatchExecution:60 Streaming query made progress: {
  "id" : "60857abf-57a3-4ea2-bd64-5bbefc3250b3",
  "runId" : "725a483f-578a-4f03-aa77-bb043e1d3c7c",
  "name" : null,
  "timestamp" : "2025-05-22T13:13:24.677Z",
  "batchId" : 

+-----+---+
| word|cnt|
+-----+---+
|hello|  1|
|world|  1|
+-----+---+



25/05/22 13:13:34 INFO  CheckpointFileManager:60 Writing atomically to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/offsets/2 using temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/offsets/.2.efc6a451-ff9e-4733-877f-f30ff55bb7e0.tmp
25/05/22 13:13:34 INFO  CheckpointFileManager:60 Renamed temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/offsets/.2.efc6a451-ff9e-4733-877f-f30ff55bb7e0.tmp to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/offsets/2
25/05/22 13:13:34 INFO  MicroBatchExecution:60 Committed offsets for batch 2. Metadata OffsetSeqMetadata(0,1747919614668,Map(spark.sql.streaming.stateStore.providerClass -> org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider, spark.sql.streaming.join.stateFormatVersion -> 2, spark.sql.streaming.stateStore.compression.codec -> lz4, spark.sql.optimizer.pruneFiltersCanPruneStreamingSubplan -> false, spark.sql.streaming.stateStore.rocksdb.formatVersion -> 5, spark.sq

-------------------------------------------
Batch: 2
-------------------------------------------
+-----+---+
| word|cnt|
+-----+---+
|hello|  1|
| mine|  1|
|   is|  1|
|world|  2|
+-----+---+



25/05/22 13:13:46 INFO  MicroBatchExecution:60 Streaming query has been idle and waiting for new data more than 10000 ms.
25/05/22 13:13:48 INFO  CheckpointFileManager:60 Writing atomically to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/offsets/3 using temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/offsets/.3.d567ecfe-bc09-4c15-bbad-3c6dac7f0754.tmp
25/05/22 13:13:48 INFO  CheckpointFileManager:60 Renamed temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/offsets/.3.d567ecfe-bc09-4c15-bbad-3c6dac7f0754.tmp to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/offsets/3
25/05/22 13:13:48 INFO  MicroBatchExecution:60 Committed offsets for batch 3. Metadata OffsetSeqMetadata(0,1747919628088,Map(spark.sql.streaming.stateStore.providerClass -> org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider, spark.sql.streaming.join.stateFormatVersion -> 2, spark.sql.streaming.stateStore.compression.codec -> lz4, spark.sql.op

-------------------------------------------
Batch: 3
-------------------------------------------
+-----+---+
| word|cnt|
+-----+---+
|  you|  1|
|hello|  1|
| mine|  1|
|   is|  1|
|thank|  1|
|world|  2|
+-----+---+



25/05/22 13:13:49 INFO  CheckpointFileManager:60 Renamed temp file file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/.3.05079aaa-17d5-4a7b-88f3-1262cb464456.tmp to file:/tmp/temporary-6ca37557-fc11-49f4-9f16-c4a714cbfc1f/commits/3
25/05/22 13:13:49 INFO  MicroBatchExecution:60 Streaming query made progress: {
  "id" : "60857abf-57a3-4ea2-bd64-5bbefc3250b3",
  "runId" : "725a483f-578a-4f03-aa77-bb043e1d3c7c",
  "name" : null,
  "timestamp" : "2025-05-22T13:13:48.088Z",
  "batchId" : 3,
  "numInputRows" : 1,
  "inputRowsPerSecond" : 90.90909090909092,
  "processedRowsPerSecond" : 0.6042296072507553,
  "durationMs" : {
    "addBatch" : 1598,
    "commitOffsets" : 21,
    "getBatch" : 0,
    "latestOffset" : 0,
    "queryPlanning" : 16,
    "triggerExecution" : 1655,
    "walCommit" : 19
  },
  "stateOperators" : [ {
    "operatorName" : "stateStoreSave",
    "numRowsTotal" : 6,
    "numRowsUpdated" : 2,
    "allUpdatesTimeMs" : 477,
    "numRowsRemoved" : 0,
    "allRemoval

-------------------------------------------
Batch: 4
-------------------------------------------
+-----+---+
| word|cnt|
+-----+---+
|  you|  1|
|hello|  1|
| mine|  1|
|   is|  1|
| exit|  1|
|thank|  1|
|world|  2|
+-----+---+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

25/05/22 13:14:01 INFO  MicroBatchExecution:60 Streaming query has been idle and waiting for new data more than 10000 ms.
