# MapReduce

## ... for pedestrians

In [79]:
def square(x):
    return x**2

In [80]:
map(square, range(10))

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [81]:
[x**2 for x in range(10)]

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [82]:
map(lambda x: x**2, range(10))

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [83]:
def my_add(x, y):
    return x + y

In [84]:
reduce(my_add, range(10))

45

## My first MapReduce

In [85]:
reduce(my_add, map(square, range(10)))

285

### but what about the keys?

In [88]:
pairs = map(lambda x: (x % 2 == 0, x**2 + x), range(10))
pairs

[(True, 0),
 (False, 2),
 (True, 6),
 (False, 12),
 (True, 20),
 (False, 30),
 (True, 42),
 (False, 56),
 (True, 72),
 (False, 90)]

In [87]:
from collections import defaultdict
def reduceByKey(reduce_fn, iterable):
    chunks = defaultdict(list)
    for k, v in iterable:
        chunks[k].append(v)
    for k in chunks:
        chunks[k] = reduce(reduce_fn, chunks[k])
    return chunks

In [89]:
reduceByKey(my_add, pairs)

defaultdict(list, {False: 190, True: 140})

## Staring Spark Locally

    PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ~/src/spark-2.1.0-bin-hadoop2.7/bin/pyspark 

## The spark context

In [1]:
sc

<pyspark.context.SparkContext at 0x10d4bc790>

## Baby steps

In [48]:
RDD = sc.parallelize([1,2,3,4,5])

In [49]:
RDD

ParallelCollectionRDD[36] at parallelize at PythonRDD.scala:475

In [50]:
RDD.collect()

[1, 2, 3, 4, 5]

In [51]:
RDD.count()

5

In [52]:
RDD.map(lambda x: x**2).collect()

[1, 4, 9, 16, 25]

## Loading data

In [1]:
wine = sc.textFile('data/winequality-white.csv')

In [2]:
wine.take(5)

[u'"fixed acidity";"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"',
 u'7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6',
 u'6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6',
 u'8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6',
 u'7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6']

In [4]:
import requests
from bs4 import BeautifulSoup

In [5]:
obama_url = 'https://en.wikipedia.org/wiki/Barack_Obama'
obama_soup = BeautifulSoup(requests.get(obama_url).text, 'lxml')
for unwanted in obama_soup(['script', 'style']):
    unwanted.decompose()
obama_text = obama_soup.get_text()

In [6]:
obamaRDD = sc.parallelize(obama_text.split())

In [7]:
obamaRDD.take(5)

[u'Barack', u'Obama', u'-', u'Wikipedia', u'Barack']

## Basic RDD operations

In [14]:
word_lengths = obamaRDD.map(lambda x: len(x))

In [10]:
word_lengths.take(5)

[6, 5, 1, 9, 6]

In [11]:
import re

In [12]:
clean_RDD = obamaRDD.filter(lambda x: re.match('\w+', x))

In [13]:
clean_RDD.take(5)

[u'Barack', u'Obama', u'Wikipedia', u'Barack', u'Obama']

In [15]:
word_lengths = clean_RDD.map(lambda x: len(x))

In [17]:
word_count = word_lengths.count()
word_count

29350

In [18]:
text_length = word_lengths.reduce(lambda x, y: x + y)

In [19]:
text_length

163793

In [20]:
# mean word lenght
text_length / float(word_count)

5.580681431005111

In [21]:
from operator import add

In [22]:
add(1,2)

3

In [23]:
word_lengths.reduce(add)

163793

In [24]:
word_counts = (
    obamaRDD
      .map(lambda x: (x, 1))
      .reduceByKey(add))

In [26]:
word_counts.take(5)

[(u'ivy-covered', 1),
 (u'reform,', 2),
 (u'interference', 1),
 (u'hope".', 1),
 (u'Abbas', 2)]

In [27]:
word_counts.takeOrdered(5, key=lambda (_, v): -v)

[(u'the', 1079),
 (u'of', 656),
 (u'in', 521),
 (u'Retrieved', 491),
 (u'and', 489)]

In [28]:
(word_counts
 .map(lambda (k, v): (k.lower(), v))
 .filter(lambda (k, _): re.match('\w+', k))
 .reduceByKey(add)
 .takeOrdered(5, key=lambda (_, v): -v))

[(u'the', 1373),
 (u'of', 660),
 (u'in', 606),
 (u'and', 495),
 (u'retrieved', 494)]

In [35]:
class DontTryThisAtHome(object):
    def my_add(self, a, b):
        return a + b + 1
    def sketchy_top_words(self, rdd, n):
        return (rdd
                .map(lambda x: (x, 1))
                .reduceByKey(self.my_add)
                .takeOrdered(n, key=lambda (_, v): -v))

In [36]:
DontTryThisAtHome().sketchy_top_words(obamaRDD, 10)

[(u'the', 2157),
 (u'of', 1311),
 (u'in', 1041),
 (u'Retrieved', 981),
 (u'and', 977),
 (u'to', 971),
 (u'Obama', 929),
 (u'^', 913),
 (u'on', 729),
 (u'a', 691)]

In [37]:
counter = 0
def my_count(_):
    global counter
    counter += 1

In [38]:
obamaRDD.foreach(my_count)

In [39]:
counter

0

In [43]:
acc = sc.accumulator(0)

In [44]:
obamaRDD.foreach(lambda _: acc.add(1))

In [45]:
acc.value

31474

## Monitor Performance

http://localhost:4040/jobs/

## Set-like Operations

In [53]:
a = sc.parallelize([1,2,3,4,4])
b = sc.parallelize([3,4,5])

In [55]:
a.distinct().collect()

[4, 1, 2, 3]

In [58]:
a.union(b).collect()

[1, 2, 3, 4, 4, 3, 4, 5]

In [60]:
a.intersection(b).collect()

[3, 4]

In [61]:
a.cartesian(b).collect()

[(1, 3),
 (1, 4),
 (1, 5),
 (2, 3),
 (2, 4),
 (2, 5),
 (3, 3),
 (3, 4),
 (3, 5),
 (4, 3),
 (4, 3),
 (4, 4),
 (4, 4),
 (4, 5),
 (4, 5)]

## Other Operations of note

In [92]:
sc.parallelize(range(100)).sample(True, 0.1).collect()

[5, 29, 36, 58, 79, 85, 97]

In [63]:
a = sc.parallelize([('foo', 1), ('bar', 1)])
b = sc.parallelize([('bar', 2), ('baz', 2)])
a.join(b).collect()

[('bar', (1, 2))]

In [65]:
r = sc.parallelize([(1, 1), (1, 2), (2, 1)]).groupByKey().collect()
r

[(1, <pyspark.resultiterable.ResultIterable at 0x7f5e45923c90>),
 (2, <pyspark.resultiterable.ResultIterable at 0x7f5e47de8390>)]

In [66]:
for i, j in r:
    print i, list(j)

1 [1, 2]
2 [1]


## Aggregate

In [69]:
import numpy as np

In [70]:
def seqOp((N, partial_sum), value):
    return N + 1, partial_sum + value
def combOp((N1, partial_sum1), (N2, partial_sum2)):
    return N1 + N2, partial_sum1 + partial_sum2

In [72]:
values = np.random.standard_normal(100)
N, total_sum = sc.parallelize(values).aggregate((0, 0), seqOp, combOp)

In [75]:
mean = total_sum/N
print mean, values.mean()

-0.138267641169 -0.138267641169


# Application: Search using tf-idf

In [101]:
def tf(term, RDD):
    tlower = term.lower()
    def seqOp((N, term_count), term):
        if term.lower() == tlower:
            return (N+1, term_count+1)
        else:
            return (N+1, term_count)
    def combOp((N1, c1), (N2, c2)):
        return (N1+N2, c1+c2)
    wordsOnly = RDD.filter(lambda x: re.match('\w+', x))
    N, term_count = wordsOnly.aggregate((0, 0), seqOp, combOp)
    return float(term_count) / N

In [102]:
tf('obama', obamaRDD)

0.015843270868824533

In [97]:
def makeRDD(url):
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    for unwanted in soup(['script', 'style']):
        unwanted.decompose()
    text = soup.get_text()
    return sc.parallelize(text.split())

In [98]:
BushRDD = makeRDD('https://en.wikipedia.org/wiki/George_W._Bush')
TrumpRDD = makeRDD('https://en.wikipedia.org/wiki/Donald_Trump')

In [103]:
tf('obama', BushRDD)

0.0005213220727765613

In [104]:
tf('obama', TrumpRDD)

0.0007051753740495463

In [105]:
RDDS = [BushRDD, TrumpRDD, obamaRDD]
names = 'Bush Trump Obama'.split()

In [108]:
for name, RDD in zip(names, RDDS):
    print name, tf('war', RDD)

Bush 0.00163347582803
Trump 0.000275938189845
Obama 0.000783645655877


In [49]:
for RDD in RDDS:
    print tf('hawaii', RDD)

3.47548048518e-05
6.12444879961e-05
0.000442930153322


In [54]:
for RDD in RDDS:
    print tf('hotels', RDD)

0.0
0.000244977951984
0.0


In [64]:
from math import log

In [111]:
def idf(term, corpus):
    N = float(len(corpus))
    return log(1 + N / sum(1 for RDD in corpus if tf(term, RDD) > 0))

In [119]:
def search(term):
    i = idf(term, RDDS)
    for name, RDD in zip(names, RDDS):
        t = tf(term, RDD)
        print name, t*i*100

In [120]:
search('hotels')

Bush 0.0
Trump 0.0339611553435
Obama 0.0


In [121]:
search('hawaii')

Bush 0.00240901949939
Trump 0.00424514441793
Obama 0.030701578696


In [122]:
search('war')

Bush 0.113223916471
Trump 0.0191031498807
Obama 0.0543181776929


In [123]:
search('a')

Bush 0.879292117278
Trump 0.891480327766
Obama 0.84547424409


In [124]:
search('the')

Bush 3.960428057
Trump 3.0055622479
Obama 3.24255904228


In [125]:
search('trump')

Bush 0.0192721559951
Trump 1.93154071016
Obama 0.0165316192979


In [126]:
search('obama')

Bush 0.0361352924909
Trump 0.0488191608062
Obama 1.09817185336


## DataFrames

In [11]:
with open('data/winequality-white.csv') as wine_file:
    header = wine_file.readline()
    wines = sc.parallelize(wine_file.readlines())

In [3]:
wines.take(5)

['7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6\n',
 '6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6\n',
 '8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6\n',
 '7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6\n',
 '7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6\n']

In [12]:
header = [i.strip().replace('"','') for i in header.split(';')]

In [13]:
header

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [14]:
from pyspark.sql import Row

In [16]:
Row(name='Jack Sparrow', rank='Captain')

Row(name='Jack Sparrow', rank='Captain')

In [19]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("STK-INF4000 White Wine Winners") \
    .getOrCreate()

In [20]:
(wines
 .map(lambda x: [float(i.strip()) for i in x.split(';')])
 .map(lambda x: Row(**{label: value for label, value in zip(header, x)}))
 .take(2))

[Row(alcohol=8.8, chlorides=0.045, citric acid=0.36, density=1.001, fixed acidity=7.0, free sulfur dioxide=45.0, pH=3.0, quality=6.0, residual sugar=20.7, sulphates=0.45, total sulfur dioxide=170.0, volatile acidity=0.27),
 Row(alcohol=9.5, chlorides=0.049, citric acid=0.34, density=0.994, fixed acidity=6.3, free sulfur dioxide=14.0, pH=3.3, quality=6.0, residual sugar=1.6, sulphates=0.49, total sulfur dioxide=132.0, volatile acidity=0.3)]

In [21]:
wineDF = spark.createDataFrame(
wines
 .map(lambda x: [float(i.strip()) for i in x.split(';')])
 .map(lambda x: Row(**{label: value for label, value in zip(header, x)}))
)

In [32]:
wineDF['alcohol', 'quality', 'residual sugar'].show(5)

+-------+-------+--------------+
|alcohol|quality|residual sugar|
+-------+-------+--------------+
|    8.8|    6.0|          20.7|
|    9.5|    6.0|           1.6|
|   10.1|    6.0|           6.9|
|    9.9|    6.0|           8.5|
|    9.9|    6.0|           8.5|
+-------+-------+--------------+
only showing top 5 rows



In [31]:
wineDF.select(wineDF['alcohol'] + 1).show(5)

+-------------+
|(alcohol + 1)|
+-------------+
|          9.8|
|         10.5|
|         11.1|
|         10.9|
|         10.9|
+-------------+
only showing top 5 rows



In [33]:
wineDF.filter(wineDF['alcohol'] > 11).select('quality').show(5)

+-------+
|quality|
+-------+
|    5.0|
|    7.0|
|    7.0|
|    8.0|
|    6.0|
+-------+
only showing top 5 rows



In [34]:
from pyspark.sql import functions as func

In [42]:
wineDF.groupBy(wineDF['quality']).agg(func.mean('alcohol'), func.count('quality')).orderBy('quality').show()

+-------+------------------+--------------+
|quality|      avg(alcohol)|count(quality)|
+-------+------------------+--------------+
|    3.0|            10.345|            20|
|    4.0| 10.15245398773006|           163|
|    5.0| 9.808840082361021|          1457|
|    6.0|10.575371549893836|          2198|
|    7.0|11.367935606060604|           880|
|    8.0|11.635999999999994|           175|
|    9.0|             12.18|             5|
+-------+------------------+--------------+

