In [1]:
# `sc` is not created anywhere in this notebook - presumably it is the
# SparkContext pre-defined by the PySpark kernel (TODO confirm). Evaluating it
# first makes a missing context fail here rather than in a later cell.
sc

In [2]:
# Read the content of a log file: textFile builds an RDD of strings,
# one element per line of "log.txt"
inputRDD = sc.textFile("log.txt")

In [3]:
# Only for debug: collect() returns every element of the RDD to the driver as a
# local Python list - acceptable for this tiny sample, not for a real log file
inputRDD.collect()

['66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET error”',
 '66.249.69.97 - - [24/Sep/2014:22:26:44 +0000] "GET http://www.google.com/how.html”',
 '66.249.69.97 - - [24/Sep/2014:22:28:44 +0000] "GET error error”',
 '71.19.157.179 - - [24/Sep/2014:22:30:12 +0000] "GET http://www.google.com/faq.html”',
 '66.249.69.97 - - [24/Sep/2014:31:28:44 +0000] "GET http://dbdmg.polito.it/thesis.html”']

In [4]:
# Select the rows containing the text "error".
# This is a plain substring test (a line containing "errors" would match too),
# so use the `in` operator instead of comparing the index from str.find() to 0.
errorsRDD = inputRDD.filter(lambda line: 'error' in line)

In [5]:
# Only for debug: bring the filtered RDD back to the driver to check that only
# the lines containing "error" were kept
errorsRDD.collect()

['66.249.69.97 - - [24/Sep/2014:22:25:44 +0000] "GET error”',
 '66.249.69.97 - - [24/Sep/2014:22:28:44 +0000] "GET error error”']

In [6]:
# Create an RDD of integers. Load the values 1, 2, 3, 3 in this RDD.
# Note: this rebinds inputRDD, which until now referred to the log-file RDD.
inputList = [1, 2, 3, 3]
inputRDD = sc.parallelize(inputList)
# Select the values greater than 2
greaterRDD = inputRDD.filter(lambda num: num > 2)

In [7]:
# Only for debug: materialize the filtered values on the driver; duplicates
# that pass the predicate are all kept
greaterRDD.collect()

[3, 3]

In [8]:
# Read the content of the input textual file (one username per line)
inputRDD = sc.textFile("usernames.txt")

# Compute the length of each input username.
# len is passed directly: wrapping it in a lambda adds nothing.
# (The variable keeps its original spelling, "lenghts", because a later cell
# refers to it by that name.)
lenghtsRDD = inputRDD.map(len)

In [9]:
# Only for debug: show the usernames read from the file as a local Python list
inputRDD.collect()

['PaoloG', 'LucaV', 'MartinoT']

In [10]:
# Only for debug: show the computed lengths; map emits exactly one output
# element per input element, in the same order
lenghtsRDD.collect()

[6, 5, 8]

In [11]:
# Load document.txt as an RDD holding one string per line
inputRDD = sc.textFile("document.txt")

# Break every line on single spaces; flatMap merges the per-line word lists
# into one flat RDD of words
listOfWordsRDD = inputRDD.flatMap(lambda line: line.split(' '))

In [12]:
# Only for debug: show the raw lines of the document as a local Python list
inputRDD.collect()

['Test flatMap', 'Two lines with two occurrences of Test']

In [13]:
# Only for debug: show the flattened words; repeated words are kept and
# differently-cased forms remain distinct elements
listOfWordsRDD.collect()

['Test', 'flatMap', 'Two', 'lines', 'with', 'two', 'occurrences', 'of', 'Test']

In [14]:
# Create an RDD of integers holding the values 1, 2, 3, 3
# (this rebinds inputRDD, replacing the RDD bound by the previous cells)
inputList = [1, 2, 3, 3]
inputRDD = sc.parallelize(inputList)

In [15]:
# Example of flatMap: each value x expands to the integers x..3, and the
# resulting lists are concatenated into a single flat RDD
resultRDD = inputRDD.flatMap(lambda start: list(range(start, 4)))

In [16]:
# Only for debug: show the input values that feed the flatMap example
inputRDD.collect()

[1, 2, 3, 3]

In [17]:
# Only for debug: show the flattened result - one flat list of integers,
# not a list of lists
resultRDD.collect()

[1, 2, 3, 2, 3, 3, 3]

In [18]:
# Apply the same function through map instead of flatMap: every input
# value yields exactly one output element, namely its whole list
resultMapRDD = inputRDD.map(lambda start: list(range(start, 4)))

In [19]:
# Only for debug
# The result is a list of lists: one inner list per input element, because
# map does not flatten what the function returns
resultMapRDD.collect()

[[1, 2, 3], [2, 3], [3], [3]]

In [20]:
# Load names.txt (one name per line) into an RDD
inputRDD = sc.textFile("names.txt")

# Order the names alphabetically, using each name as its own sort key,
# and keep the result in a separate RDD
sortedNamesRDD = inputRDD.sortBy(lambda entry: entry)

In [21]:
# Only for debug: show the names in their original file order; sortBy
# produced a new RDD and left inputRDD untouched
inputRDD.collect()

['Paolo', 'Luca', 'John', 'Denis', 'William', 'Elizabeth', 'Carlos', 'Ernesto']

In [22]:
# Only for debug: show the names in ascending alphabetical order
sortedNamesRDD.collect()

['Carlos', 'Denis', 'Elizabeth', 'Ernesto', 'John', 'Luca', 'Paolo', 'William']

In [23]:
# Load names.txt (one name per line) into an RDD
inputRDD = sc.textFile("names.txt")

# Order the names by their length, shortest first, and keep the result in a
# separate RDD; the built-in len serves directly as the sort-key function
sortedLenRDD = inputRDD.sortBy(len)

In [24]:
# Only for debug: show the names in their original file order
inputRDD.collect()

['Paolo', 'Luca', 'John', 'Denis', 'William', 'Elizabeth', 'Carlos', 'Ernesto']

In [25]:
# Only for debug: show the names ordered by increasing length; names of equal
# length are not ordered alphabetically by this key
sortedLenRDD.collect()

['Luca', 'John', 'Paolo', 'Denis', 'Carlos', 'William', 'Ernesto', 'Elizabeth']

In [26]:
# Load names.txt (one name per line) into an RDD
inputRDD = sc.textFile("names.txt")

# Sort by length first and alphabetically second: the key is a
# (length, name) tuple, and tuples compare element by element
sortedLen_AlphaRDD = inputRDD.sortBy(lambda entry: (len(entry), entry))

In [27]:
# Only for debug: show the names ordered by length, with ties broken
# alphabetically
sortedLen_AlphaRDD.collect()

['John', 'Luca', 'Denis', 'Paolo', 'Carlos', 'Ernesto', 'William', 'Elizabeth']

In [28]:
# Read the content of the input textual file (one name per line)
namesRDD = sc.textFile("namesWithRepetitions.txt")

# Compute the number of occurrences of each name.
# countByValue is an action: the result is a local dictionary on the driver
# (name -> count), not an RDD.
namesOccurrences = namesRDD.countByValue()

In [29]:
# Only for debug: show the raw names, repetitions included
namesRDD.collect()

['Paolo',
 'Luca',
 'John',
 'Denis',
 'William',
 'Elizabeth',
 'Carlos',
 'Ernesto',
 'Luca',
 'Paolo',
 'Carlos',
 'William',
 'William',
 'Luca',
 'Luca']

In [30]:
# Only for debug: confirm that countByValue returned a local Python
# dictionary type rather than an RDD
type(namesOccurrences)

collections.defaultdict

In [31]:
# Only for debug: show the name -> number of occurrences mapping
namesOccurrences

defaultdict(int,
            {'Paolo': 2,
             'Luca': 4,
             'John': 1,
             'Denis': 1,
             'William': 3,
             'Elizabeth': 1,
             'Carlos': 2,
             'Ernesto': 1})

In [32]:
# Create an RDD of integers. Load the values 1, 5, 4, 4, 2 in this RDD
inputList = [1, 5, 4, 4, 2]
inputRDD = sc.parallelize(inputList)

In [33]:
# Only for debug: show the full content of the RDD before calling take/top
inputRDD.collect()

[1, 5, 4, 4, 2]

In [34]:
# take(2): retrieve the first 2 elements of inputRDD and store them in
# a local python list (no sorting is involved)
takeValues = inputRDD.take(2)

In [35]:
# Only for debug: show the local list returned by take(2)
takeValues

[1, 5]

In [36]:
# Only for debug: the RDD content is unchanged after take(2) - actions return
# values to the driver and do not modify the RDD
inputRDD.collect()

[1, 5, 4, 4, 2]

In [37]:
# Retrieve the top-2 (i.e. the 2 largest) elements of the inputRDD and store
# them in a local python list, largest first
topValues = inputRDD.top(2)

In [38]:
# Only for debug: show the local list returned by top(2)
topValues

[5, 4]

In [39]:
# Build an RDD of strings from the values 'Paolo', 'Giovanni', 'Luca'
inputList = ['Paolo', 'Giovanni', 'Luca']
inputRDD = sc.parallelize(inputList)
# Fetch the 2 longest names into a local python list: top() ranks the
# elements by the key function, here the length of each string
retrievedValues = inputRDD.top(2, key=len)

In [40]:
# Only for debug: show the 2 longest names, longest first
retrievedValues

['Giovanni', 'Paolo']

In [41]:
# Build an RDD of integers from the values 1, 2, 3, 3
inputListReduce = [1, 2, 3, 3]
inputRDDReduce = sc.parallelize(inputListReduce)
# Add all the values together: reduce combines the elements pairwise with
# the given function and returns a single local value
sumValues = inputRDDReduce.reduce(lambda left, right: left + right)

In [42]:
# Only for debug: show the values that were summed (the sum itself is held in
# sumValues and is not displayed by this cell)
inputRDDReduce.collect()

[1, 2, 3, 3]

In [43]:
# Create an RDD of integers. Load the values 1, 2, 3, 3 in this RDD
# (same values as in the sum example; rebuilt here so the maximum-value
# cells that follow can be read on their own)
inputListReduce = [1, 2, 3, 3]
inputRDDReduce = sc.parallelize(inputListReduce)

In [44]:
# Find the largest value: the built-in max, applied to two elements at a
# time, is exactly the pairwise function reduce expects
maxValue = inputRDDReduce.reduce(max)

In [45]:
# Only for debug: reduce returned a single bare value, not a list
maxValue

3

In [46]:
# Compute the maximum value using top(1).
# Note: this rebinds maxValue - it held the bare integer returned by reduce,
# and from here on it holds the one-element list returned by top(1).
maxValue = inputRDDReduce.top(1)

In [47]:
# Pay attention: top(1) returns a list containing one single value, whereas
# reduce returns the bare value itself
maxValue

[3]