In [0]:
# Display the SparkSession object; `spark` is not defined in this notebook, so it is
# expected to be provided by the notebook environment.
spark

-------------- Create Spark Streaming Using the RDD-based DStream API (low-level API)

In [0]:
#--------------- First: run this Spark Streaming (DStream) word count -------------
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# `spark` (SparkSession, for Spark SQL / DataFrames) and `sc` (SparkContext, for the
# RDD / DStream API used here) are expected to be provided by the notebook environment.
# Build a StreamingContext on top of the existing `sc` with a batch interval of 1 second.
ssc = StreamingContext(sc, 1)

# Create a DStream that connects to hostname:port, here localhost:9999.
lines = ssc.socketTextStream("localhost", 9999)

# Split each line into words. split() without an argument splits on runs of whitespace,
# so blank lines and repeated spaces do not yield empty-string "words" that would
# otherwise be counted like real words.
words = lines.flatMap(lambda line: line.split())

# Count each word in each batch: map every word to (word, 1), then sum the ones per word.
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in this DStream to the console.
wordCounts.pprint()

ssc.start()  # Start the computation
try:
    # Blocks until the computation terminates (or the cell is interrupted).
    ssc.awaitTermination()
finally:
    # Always shut the streaming computation down when the cell exits, but keep the
    # shared SparkContext `sc` alive so the rest of the notebook can keep using it.
    ssc.stop(stopSparkContext=False)



#--------------- Second: Run Netcat As a Data Server For Passing Data -------------
# If you have already downloaded and built Spark, you can run this example as follows. You will need to run Netcat (a small utility found in most Unix-like systems) as a data server by using:

$ nc -lk 9999

------------------ Create a Spark Streaming Application Using the Structured Streaming API

In [0]:
# ------------ This Is An Example of Reading and Writing Streaming Data ---------------------
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Directory the stream reads JSON event files from. Upload one file to this path first;
# after starting this application, keep uploading new files to feed the stream.
inputPath = "/FileStore/tables/structured-streaming/events"

# Define the schema before reading the data: a nullable timestamp `time` and a
# nullable string `action`.
jsonSchema = StructType([
    StructField("time", TimestampType(), True),
    StructField("action", StringType(), True),
])

# Use readStream instead of read for streaming data. maxFilesPerTrigger=1 treats the
# sequence of files as a stream by picking one file at a time.
streamingInputDF = (
    spark.readStream
    .schema(jsonSchema)
    .option("maxFilesPerTrigger", 1)
    .json(inputPath)
)

# Count the events per action within 1-hour windows of the `time` column.
streamingCountsDF = (
    streamingInputDF
    .groupBy(streamingInputDF.action, window(streamingInputDF.time, "1 hour"))
    .count()
)

# Name shared by the streaming query and the in-memory table it writes to.
countsQueryName = "counts"

# Make the cell safe to re-run: Spark does not allow starting a query whose name is
# already used by an active query, so stop any earlier run of this query first.
for activeQuery in spark.streams.active:
    if activeQuery.name == countsQueryName:
        activeQuery.stop()

# A streaming computation starts by defining a sink and starting it. To query the counts
# interactively, the complete set of 1-hour counts is kept in an in-memory table.
query = (
    streamingCountsDF.writeStream
    .format("memory")               # memory = store in-memory table (for testing only)
    .queryName(countsQueryName)     # counts = name of the in-memory table
    .outputMode("complete")         # complete = all the counts should be in the table
    .start()
)

In [0]:
%fs
ls FileStore/tables/structured-streaming/events/

path,name,size,modificationTime
dbfs:/FileStore/tables/structured-streaming/events/file0.json,file0.json,479,1729752069000
dbfs:/FileStore/tables/structured-streaming/events/file1.json,file1.json,489,1729753135000
