In [1]:
from pyspark.streaming import StreamingContext

# Build the StreamingContext on top of the existing SparkContext `sc`
# (presumably the one provided by the PySpark shell/kernel -- confirm for your setup).
# batchDuration is the batch interval in seconds: one micro-batch every 10 seconds.
# The number of worker threads is not chosen here; it depends on how `sc` was created.
ssc = StreamingContext(sparkContext=sc, batchDuration=10)

In [2]:
# Open a text DStream fed by a TCP socket at hostname:port (here localhost:9999).
# Something must already be listening on that port; on Unix you can run
# `nc -lk 9999` in a terminal and type lines into it.
# Other sources (a file, for instance) can be used in place of the socket.
lines = ssc.socketTextStream(hostname="localhost", port=9999)

In [3]:
# Same word count as with plain RDDs, applied to every batch of the stream.
# Split each line into words. split() with no argument splits on any run of
# whitespace and drops empty strings, so repeated spaces or blank lines do not
# produce "" tokens that would otherwise be counted as a word.
words = lines.flatMap(lambda line: line.split())

# Count each word in each batch: emit (word, 1) pairs, then sum the ones per word
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

In [4]:
# Output operation: for each batch, print the first ten elements of the RDD
# generated in this DStream to the console
wordCounts.pprint(num=10)

In [5]:
ssc.start()  # Start the computation
try:
    # Blocks this cell until the computation terminates or the kernel is interrupted
    ssc.awaitTermination()
finally:
    # Stop only the streaming side so the underlying SparkContext stays usable.
    # A stopped StreamingContext cannot be restarted: create a new one to stream again.
    ssc.stop(stopSparkContext=False)

In [6]:
# Windowed word count: every 10 seconds, reduce the last 30 seconds of data.
# Run these lines BEFORE the streaming context is started: operations cannot be
# added to a context that is already running. If the streaming context is stopped
# with its default settings, the Spark context is stopped as well, and the whole
# notebook then has to be run again.
# Suggestion: use these lines in place of wordCounts.pprint().
#
# Passing an inverse function (x - y) lets Spark update each window incrementally
# instead of recomputing it, and that mode requires checkpointing to be enabled,
# so a checkpoint directory is set first. The relative path is meant for local
# runs; on a cluster use a fault-tolerant location instead.
ssc.checkpoint("checkpoint")
windowedWordCounts = pairs.reduceByKeyAndWindow(
    func=lambda x, y: x + y,     # add the counts entering the window
    invFunc=lambda x, y: x - y,  # subtract the counts leaving the window
    windowDuration=30,           # window length in seconds
    slideDuration=10,            # how often the window is evaluated, in seconds
)
windowedWordCounts.pprint()
