In [None]:
from pyspark.streaming import StreamingContext

In [None]:
# Folder passed to textFileStream as the source of the input tweets
inputFolder = "inLab11_Ex2/"
# Prefix passed to saveAsTextFiles to name the output folders
prefixOutputFolder = "out_Lab11_Ex2"

In [None]:
# Spark Streaming context built on the existing SparkContext `sc`,
# using a batch interval of 10 seconds
ssc = StreamingContext(sc, batchDuration=10)

In [None]:
# Checkpoint folder: required by the window transformations that keep state
# across batches (e.g. reduceByKeyAndWindow with an inverse function)
ssc.checkpoint("checkpointfolder")

In [None]:
# Create a DStream that monitors the input folder and reads the new text files
# added to it, one element per line (file streams do not use a receiver)
tweetsDStream = ssc.textFileStream(inputFolder)

In [None]:
# Split the content of each input line and return only the hashtags occurring in the
# text of the tweet.
# One pair (hashtag, +1) for each occurrence of a hashtag in the analyzed line
def extractHashtags(line):
    """Return one (hashtag, 1) pair per hashtag occurrence in a tweet line.

    The line is expected to hold two tab-separated fields: the user id and
    the text of the tweet. The text is split on single spaces and every word
    starting with "#" is emitted, duplicates included.

    Lines without a tab separator (empty or malformed records) yield an
    empty list instead of raising, so one bad record cannot fail the whole
    streaming batch.
    """
    # Expected layout: userId\ttext_of_the_tweet
    fields = line.split("\t")
    if len(fields) < 2:
        # No tweet text available: nothing to extract
        return []

    textTweet = fields[1]

    return [(word, 1) for word in textTweet.split(" ") if word.startswith("#")]

In [None]:
# Apply extractHashtags to every input line and flatten the returned lists into
# a single DStream of (hashtag, 1) pairs
hashtagOneDStream = tweetsDStream.flatMap(extractHashtags)

In [None]:
# Count the number of occurrences of each (extracted) hashtag by considering only the
# last 30 seconds of data (i.e., the last 30 seconds of data of the input data stream)
# windowDuration = 30s
# slideDuration = 10s
# The inverse function lets Spark update the previous window incrementally
# (add the values entering the window, subtract the ones leaving it).
# filterFunc removes the hashtags whose count dropped to 0; without it they
# would be kept in the window state indefinitely.
hashtagNumOccurrencesDStream = hashtagOneDStream.reduceByKeyAndWindow(
    lambda v1, v2: v1 + v2,
    lambda vnow, vold: vnow - vold,
    30,
    10,
    filterFunc=lambda pair: pair[1] > 0,
)

In [None]:
# Keep only the hashtags whose count in the last 30 seconds is 100 or more
selectedHashtagsDStream = hashtagNumOccurrencesDStream.filter(
    lambda hashtagCount: hashtagCount[1] >= 100
)

In [None]:
# DStreams expose no sort operation, so each batch RDD is sorted inside
# transform(); negating the count orders the pairs by decreasing number of
# occurrences
sortedHashtagsDStream = selectedHashtagsDStream.transform(
    lambda rdd: rdd.sortBy(lambda hashtagCount: -hashtagCount[1])
)

In [None]:
# Store in the output folders, sorted by decreasing number of occurrences, the
# pairs (hashtag, number of occurrences) related to the last 30 seconds of data
sortedHashtagsDStream.saveAsTextFiles(prefix=prefixOutputFolder, suffix="")

In [None]:
# Show on the standard output the top 10 hashtags by number of occurrences,
# computed over the last 30 seconds of data
sortedHashtagsDStream.pprint(num=10)

In [None]:
# Start the streaming computation: the transformations defined above are only
# executed once the context has been started
ssc.start()

In [None]:
#ssc.awaitTermination()

In [None]:
# Stop the streaming context only; stopSparkContext=False leaves the underlying
# SparkContext running
ssc.stop(stopSparkContext=False)