In [2]:
# For Spark
from pyspark import SparkContext

# Spark streaming
from pyspark.streaming import StreamingContext

* Create a local `StreamingContext` with as many working **processors** as possible and a **batch interval** of 10 seconds
* `local[*]`: run Spark locally with as many worker threads as there are logical cores on your machine. To run Spark locally with `k` worker threads instead, specify `local[k]`.

In [3]:
# Length of each streaming micro-batch, in seconds.
batch_interval = 10

# Driver context running locally with one worker thread per logical core.
sc = SparkContext(appName="WordCountApp", master="local[*]")

# Streaming context layered on top of the Spark context above.
ssc = StreamingContext(sparkContext=sc, batchDuration=batch_interval)

In [9]:
# Address of the TCP server that emits the text lines.
host = "localhost"
port = 9999

# DStream of text lines read from the socket at host:port.
lines = ssc.socketTextStream(hostname=host, port=port)

In [10]:
# Break every incoming line into its space-separated words.
words = lines.flatMap(lambda text: text.split(" "))

# Note: this prints the raw (unsplit) lines of each batch, not `words`.
lines.pprint()

In [11]:
# Pair every word with a count of one ...
pairs = words.map(lambda token: (token, 1))

# ... then add the ones up per distinct word within each batch.
wordCounts = pairs.reduceByKey(lambda total, count: total + count)

# Show the per-batch counts.
wordCounts.pprint()

In [16]:
# Load the synopsis file as an RDD with one element per line of text.
ds1 = sc.textFile(name="./unit_synopsis.txt")

# Preview the first six lines.
ds1.take(num=6)

['FIT9131',
 'This unit aims to provide students with the basic concepts involved in the development of well structured software using a programming language. It concentrates on the development of problem solving skills applicable to all stages of the development process. Students gain experience with the translation of a problem specification into a program design, and the implementation of that design into a programming language. The unit introduces software engineering topics such as maintainability, readability, testing, documentation, modularisation, and reasoning about correctness of programs. Students are expected to read and understand existing code as well as develop new code.',
 'FIT9132',
 'This unit will introduce the concept of data management in an organisation through relational database technology. Theoretical foundation of relational model, analysis and design, implementation of relational database using SQL will be covered.',
 'FIT5148',
 'Data engineering is about de

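Difference between `map()` and `flatMap()`

`map()` produces exactly one output element per input line (here, a list of words for each line), whereas `flatMap()` flattens those per-line lists into a single sequence of words.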

In [20]:
# map(): one output element per input line, so each element is a list of words.
words = ds1.map(lambda text: text.split(" "))
words.take(num=10)

[['FIT9131'],
 ['This',
  'unit',
  'aims',
  'to',
  'provide',
  'students',
  'with',
  'the',
  'basic',
  'concepts',
  'involved',
  'in',
  'the',
  'development',
  'of',
  'well',
  'structured',
  'software',
  'using',
  'a',
  'programming',
  'language.',
  'It',
  'concentrates',
  'on',
  'the',
  'development',
  'of',
  'problem',
  'solving',
  'skills',
  'applicable',
  'to',
  'all',
  'stages',
  'of',
  'the',
  'development',
  'process.',
  'Students',
  'gain',
  'experience',
  'with',
  'the',
  'translation',
  'of',
  'a',
  'problem',
  'specification',
  'into',
  'a',
  'program',
  'design,',
  'and',
  'the',
  'implementation',
  'of',
  'that',
  'design',
  'into',
  'a',
  'programming',
  'language.',
  'The',
  'unit',
  'introduces',
  'software',
  'engineering',
  'topics',
  'such',
  'as',
  'maintainability,',
  'readability,',
  'testing,',
  'documentation,',
  'modularisation,',
  'and',
  'reasoning',
  'about',
  'correctness',
  'of'

In [21]:
# flatMap(): the per-line word lists are flattened into one RDD of single words.
words = ds1.flatMap(lambda text: text.split(" "))
words.take(num=10)

['FIT9131',
 'This',
 'unit',
 'aims',
 'to',
 'provide',
 'students',
 'with',
 'the',
 'basic']

In [22]:
# Words to leave out of the word count. Entries are matched exactly as written,
# which is why both 'the' and 'The' are listed.
stopwords = [
    'a', 'this', 'to', 'as', 'such',
    'the', 'The', 'of', 'using', 'on',
    'in', 'It', 'with', 'and', 'or',
]

In [25]:
# Batch word count over the unit synopsis file, ignoring stopwords.
# Relies on `sc` and `stopwords` defined in earlier cells.
ds1 = sc.textFile("./unit_synopsis.txt")

# One RDD element per space-separated word.
words = ds1.flatMap(lambda line: line.split(" "))

# A frozenset gives constant-time membership checks instead of scanning the
# list once per word. Matching stays case-sensitive, exactly as before.
# NOTE(review): 'this' is listed but 'This' is not, so a capitalised 'This'
# is still counted - confirm whether that is intended.
stopword_set = frozenset(stopwords)
words_without_stopwords = words.filter(lambda word: word not in stopword_set)

# Pair each remaining word with 1 and sum the ones per distinct word.
pairs = words_without_stopwords.map(lambda word: (word, 1))
word_counts = pairs.reduceByKey(lambda x, y: x + y)

# Preview ten (word, count) pairs.
word_counts.take(10)

[('FIT9131', 1),
 ('provide', 1),
 ('basic', 1),
 ('concepts', 1),
 ('development', 3),
 ('programming', 2),
 ('concentrates', 1),
 ('solving', 1),
 ('stages', 1),
 ('Students', 2)]

In [None]:
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

# Standalone streaming word count: read lines from a TCP socket, count the
# words in every 1-second batch, and print the counts for up to 60 seconds.

# getOrCreate() reuses the active SparkContext or creates a new one; it never
# returns None, so no fallback branch is needed. The configuration (and with it
# the application name) is only applied when a new context has to be created.
sc = SparkContext.getOrCreate(SparkConf().setAppName("WordCountApp"))
ssc = StreamingContext(sc, 1)

host = "localhost"
port = 9999

lines = ssc.socketTextStream(host, port)

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the result
wordCounts.pprint()

ssc.start()
try:
    # Block until the stream terminates or 60 seconds have passed.
    ssc.awaitTermination(timeout=60)
except KeyboardInterrupt:
    # A manual interrupt is an expected way to end the run early; the
    # cleanup below still executes.
    pass
finally:
    # Shut down exactly once, whether the wait timed out, was interrupted or
    # raised. By default stop() also stops the underlying SparkContext.
    ssc.stop()