In [1]:
import pyspark
from pyspark import SparkContext

# configure the spark context
conf = pyspark.SparkConf().setMaster('local[2]').setAppName("Spark_Macbeth")
sc = SparkContext(conf=conf)

In [2]:
# print the spark context info
sc

In [3]:
# Load the Macbeth.txt file into a spark rdd
inputFile = sc.textFile("Macbeth.txt")

# collect the first element in the rdd
inputFile.first()

'ACT I'

In [5]:
# print the type returned when we "collect" an rdd
type(inputFile.collect())

# display the first 4 items from the "collected" rdd
inputFile.collect()[: 4]

['ACT I',
 'SCENE I. A desert place.',
 '',
 'Thunder and lightning. Enter three Witches ']

Get the number of characters in each line

In [6]:
# the "non-lambda" version looks like this:
#
# def get_length(line):
#     return len(line)
# lineLengths = inputFile.map(getLength)

# same as above, but using a lambda
lineLengths = inputFile.map(lambda line: len(line))

In [7]:
# print the sum of all characters in the entire dataset
lineLengths.sum()

97398

In [8]:
# the two rdds have the same size
# i.e. lineLengths is a transform of inputFile
print(lineLengths.count())
print(inputFile.count())

4102
4102


Count the number of individual words in the file

In [10]:
def count_words(line):
    return len(line.split(' '))

count_words("A Test Line just for testing")

6

In [12]:
# transform the inputFile rdd through the count_words function
word_count_rdd = inputFile.map(count_words)

print(word_count_rdd.sum())
print(word_count_rdd.mean())

19632
4.785958069234523
