# Spark Hands-On

### Parallelize a collection

In [3]:
# A small local collection that will be distributed: [1, 2, 3, 4, 5]
data = list(range(1, 6))

# `sc` is the SparkContext, i.e. the entry point for Spark. It is already available in this
# notebook; in your own notebook or script you have to create it yourself.
dataRDD = sc.parallelize(data)

# Return all the elements of the RDD so that the notebook can show them.
dataRDD.collect()

### Filtering Operations

In [5]:
def filterOperation(number):
  """Return True when ``number`` is strictly less than 4, False otherwise."""
  is_below_four = number < 4
  return is_below_four

# Keep only the elements for which filterOperation returns True, then return them.
(
    dataRDD
    .filter(filterOperation)
    .collect()
)

### Lambda Functions

In [7]:
# Same filter as before, but the predicate is an anonymous function that delegates to filterOperation.
(
    dataRDD
    .filter(lambda number: filterOperation(number))
    .collect()
)

In [8]:
# Same filter again, with the whole condition written inline in the lambda.
(
    dataRDD
    .filter(lambda number: number < 4)
    .collect()
)

### Word Count

In [10]:
# `requests` allows us to download web pages.
import requests
# `add` is a shortcut for 'lambda x, y: x + y'.
from operator import add

# This is just a way of getting a list of lines from a file on the web.
# Remember that Spark cannot process files from the web by itself.
# This is done only as an example: it cannot be done with huge files,
# otherwise the machine may have a hard time processing them.
response = requests.get(
    'https://raw.githubusercontent.com/forons/BigDataExamples/master/files/inferno.txt',
    timeout=30,
)
# Stop right away on an HTTP error instead of counting the words of an error page.
response.raise_for_status()
# `response.text` is the body decoded to str. `iter_lines()` yields bytes by default,
# and in Python 3 bytes cannot be split with a str separator (TypeError).
file_content = response.text.splitlines()
file = sc.parallelize(file_content)

# Each element of the 'file' RDD is a line of the file we read; here we split it into words.
# split() without arguments splits on any run of whitespace and never returns empty strings,
# so blank lines and repeated spaces do not end up being counted as a word.
words = file.flatMap(lambda line: line.split())

# We map each word to a tuple with the word itself and a counter initialized to one
word_pairs = words.map(lambda word: (word, 1))

# We group the elements with the same key (the word in our case) and sum the counters
word_count = word_pairs.reduceByKey(add)

# dbfs is the file system of this notebook.
# If you want to check it, go to the left panel of this notebook and open the "data" label:
# there you can browse the file system.
# saveAsTextFile creates a folder, where each file of the folder is written by a partition that saves
# only its own data, in order to not collect everything into the driver and to save time and resources.
# NOTE(review): saveAsTextFile is expected to fail if the output folder already exists, so re-running
# this cell presumably requires deleting the folder first - confirm.
# NOTE(review): on Databricks the Spark APIs usually take 'dbfs:/...' paths, while '/dbfs/...' is the
# local mount point; verify that this path ends up where you expect.
word_count.saveAsTextFile('/dbfs/FileStore/tables/output')

### Sort the counters in descending order

In [12]:
# Each element is a (key, value) pair/tuple: pair[0] accesses the key and pair[1] accesses the value.
# We sort on the value with ascending=False and return the sorted elements.
(
    word_count
    .sortBy(keyfunc=lambda pair: pair[1], ascending=False)
    .collect()
)