# Spark Example

This is a first tutorial on Apache Spark.

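`sc` is the SparkContext object. In the interactive PySpark shell (and in some hosted notebook environments) `sc` is created for you and is already loaded into memory; in a plain notebook or a standalone script you have to create it yourself, as the first cell below does.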

In [1]:
from pyspark import SparkContext
# Reuse the running SparkContext if there is one (for example when this cell
# is re-run, or when the environment has already created `sc`); otherwise
# start a new one. Calling SparkContext() a second time in the same process
# raises "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# Show the Spark version, the Python version Spark is using, and the master URL.
print(sc.version)
print(sc.pythonVer)
print(sc.master)

3.0.0
3.8
local[*]


In [2]:
# Example text data: a list holding one long line of text.
# Adjacent string literals are joined by Python into a single string.
a = [
    "What Will It Take for BU Commuters to Leave Their Cars for the MBTA? "
    "University boosts T pass subsidies to cover half the cost, "
    "raises parking fees, all part of broader strategy to build a greener BU"
]

In [3]:
# Show the plain Python list (a single-element list of one string).
print(a)

['What Will It Take for BU Commuters to Leave Their Cars for the MBTA? University boosts T pass subsidies to cover half the cost, raises parking fees, all part of broader strategy to build a greener BU']


In [4]:
# Now we can parallelize it, i.e., load the Python list into a distributed
# data structure, an RDD.
rdd = sc.parallelize(a)

# `sc` is the SparkContext. The PySpark shell creates it for you;
# when you write a standalone PySpark script you need to create it yourself.

In [5]:
# In a standalone script the input would typically be read from a file
# instead of an in-memory list, e.g.:
#     lines = sc.textFile(sys.argv[1], 1)
#
# Split every line on single spaces; flatMap flattens the per-line word
# lists into one RDD of words.
words = rdd.flatMap(lambda line: line.split(' '))



In [6]:
# take(10) returns the first 10 elements of the RDD as a Python list.
words.take(10)

['What',
 'Will',
 'It',
 'Take',
 'for',
 'BU',
 'Commuters',
 'to',
 'Leave',
 'Their']

In [7]:
# Classic word count:
#   1. split each line into words,
#   2. pair every word with the count 1,
#   3. sum the counts of identical words.
counts = (
    rdd.flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)

# collect() returns all (word, count) pairs as a Python list.
counts.collect()

[('What', 1),
 ('Will', 1),
 ('It', 1),
 ('Cars', 1),
 ('of', 1),
 ('broader', 1),
 ('greener', 1),
 ('to', 3),
 ('Their', 1),
 ('the', 2),
 ('MBTA?', 1),
 ('T', 1),
 ('subsidies', 1),
 ('cost,', 1),
 ('parking', 1),
 ('fees,', 1),
 ('part', 1),
 ('build', 1),
 ('a', 1),
 ('University', 1),
 ('pass', 1),
 ('Take', 1),
 ('for', 2),
 ('BU', 2),
 ('Commuters', 1),
 ('Leave', 1),
 ('boosts', 1),
 ('cover', 1),
 ('half', 1),
 ('raises', 1),
 ('all', 1),
 ('strategy', 1)]

In [8]:
# Most frequent word: order the (word, count) pairs by their count.
# Without `key`, top() compares the tuples themselves, i.e. by word first,
# and would return the lexicographically largest word rather than the most
# frequent one (for this text both happen to be 'to').
counts.top(1, key=lambda pair: pair[1])

[('to', 3)]

In [9]:
# RDD.top(num, key=None)
#
# Get the top `num` elements from an RDD, in descending order.
# If `key` is given, elements are compared by key(element).

In [10]:
# With no key, top(1) returns the largest element by natural ordering.
sc.parallelize([10, 4, 2, 12, 3]).top(1)

[12]

In [11]:
# The second argument to parallelize (2) is the number of partitions;
# top(2) still returns the two largest elements of the whole RDD, largest first.
sc.parallelize([2, 3, 4, 5, 6], 2).top(2)

[6, 5]

In [12]:
# key=str compares the elements as strings, so '4' > '3' > '2' > '12' > '10'.
# The returned elements are still the original integers.
sc.parallelize([10, 4, 2, 12, 3]).top(3, key=str)

[4, 3, 2]

In [13]:
# Small RDD of (letter, number) pairs used to show how `key` affects top().
rdd1 = sc.parallelize([
    ('a', 10),
    ('c', 4),
    ('z', 2),
    ('d', 12),
])

In [14]:
# Order by the second tuple element (the number), largest first.
rdd1.top(3, key=lambda x: x[1])

[('d', 12), ('a', 10), ('c', 4)]

In [15]:
# Order by the first tuple element (the letter), in descending order.
rdd1.top(3, key=lambda x: x[0])

[('z', 2), ('d', 12), ('c', 4)]

In [16]:
# `key` can also be passed positionally; this is the same call as key=lambda x: x[0].
rdd1.top(3, lambda x: x[0])

[('z', 2), ('d', 12), ('c', 4)]

In [17]:
# With no key, tuples are compared element by element, so the first element
# (the letter) decides the order here.
rdd1.top(3)

[('z', 2), ('d', 12), ('c', 4)]

In [18]:
# key=str compares the string form of each tuple, e.g. "('z', 2)".
# For this data that gives the same order as comparing the tuples directly.
rdd1.top(3, str)

[('z', 2), ('d', 12), ('c', 4)]