# MapReduce

## ... for pedestrians

In [None]:
def square(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

In [None]:
# list() forces evaluation: on Python 3 map() returns a lazy iterator, so the
# bare call would display a map object instead of the squared values.
list(map(square, range(10)))

In [None]:
[x**2 for x in range(10)]

In [None]:
def add_numbers(x, y):
    """Return the result of ``x + y``; serves as the binary step for reduce."""
    total = x + y
    return total

In [None]:
# reduce() is a builtin only on Python 2; importing it from functools works
# on Python 2.6+ and Python 3 alike.
from functools import reduce

reduce(add_numbers, range(10))

In [None]:
sum(range(10))

In [None]:
add_numbers(0, add_numbers(1, add_numbers(2, add_numbers(3, 4))))

In [None]:
reduce(add_numbers, range(5))

## My first MapReduce

In [None]:
reduce(add_numbers, map(square, range(10)))

### but what about the keys?

In [None]:
# (key, value) pairs: the key tells whether x is even, the value is x**2 + x.
# Built as a real list so the pairs can be displayed and iterated more than
# once (a Python 3 map object would be exhausted after the first pass).
pairs = [(x % 2 == 0, x**2 + x) for x in range(10)]

In [None]:
pairs

In [None]:
from collections import defaultdict

In [None]:
def reduceByKey(reduce_fn, iterable):
    """Group ``(key, value)`` pairs by key and fold each group with ``reduce_fn``.

    Args:
        reduce_fn: Binary function combining two values into one; applied
            left to right over the values of each key.
        iterable: Iterable of 2-item ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(list)`` mapping every key seen to its reduced value.
        Because of the ``list`` default factory, looking up a key that was
        never seen yields an empty list rather than raising ``KeyError``.
    """
    # reduce() is a builtin only on Python 2; functools provides it on both
    # Python 2.6+ and Python 3.
    from functools import reduce

    chunks = defaultdict(list)
    for key, value in iterable:
        chunks[key].append(value)
    # Only values are replaced (no keys added or removed), so updating the
    # mapping while walking its items is safe.
    for key, values in chunks.items():
        chunks[key] = reduce(reduce_fn, values)
    return chunks

In [None]:
reduceByKey(add_numbers, pairs)

## Starting Spark Locally

    PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ~/src/spark-2.1.0-bin-hadoop2.7/bin/pyspark 

## The spark context

In [None]:
sc

## Baby steps

In [None]:
RDD = sc.parallelize(range(10))

In [None]:
RDD

In [None]:
RDD.collect()

In [None]:
RDD.count()

In [None]:
RDD.map(lambda x: x**2).collect()

## Loading data

In [None]:
wine = sc.textFile('data/winequality-white.csv')

In [None]:
wine.take(5)

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Download the Wikipedia article and parse its HTML with the 'lxml' parser.
obama_url = 'https://en.wikipedia.org/wiki/Barack_Obama'
obama_soup = BeautifulSoup(requests.get(obama_url).text, 'lxml')

# Drop every <script> and <style> element before extracting the text.
for unwanted in  obama_soup(['script', 'style']):
    unwanted.decompose()
obama_text = obama_soup.get_text()

# Split the text on whitespace and hand the resulting tokens to Spark.
obamaRDD = sc.parallelize(obama_text.split())

## Basic RDD operations

In [None]:
obamaRDD.take(10)

In [None]:
word_lengths = obamaRDD.map(lambda x: len(x))

In [None]:
word_lengths.take(5)

In [None]:
import re

In [None]:
# Raw string keeps \w a regex escape (not an invalid string escape);
# identity comparison is the idiomatic None check.
re.match(r'\w+', '') is None

In [None]:
# Raw string keeps \w a regex escape (not an invalid string escape);
# identity comparison is the idiomatic None check.
re.match(r'\w+', '-') is None

In [None]:
# Raw string keeps \w a regex escape (not an invalid string escape);
# identity comparison is the idiomatic None check.
re.match(r'\w+', 'Obama') is None

In [None]:
# Keep only tokens that start with at least one word character (re.match
# anchors at the beginning of the string). The raw string avoids the
# invalid '\w' string escape.
clean_RDD = obamaRDD.filter(lambda x: re.match(r'\w+', x))

In [None]:
clean_RDD.take(5)

In [None]:
word_lengths = clean_RDD.map(lambda x: len(x))

In [None]:
number_of_words = clean_RDD.count()

In [None]:
mean_wl = word_lengths.reduce(lambda x, y: x + y) / float(number_of_words)

In [None]:
mean_wl

In [None]:
from operator import add

In [None]:
word_lengths.reduce(add) / float(number_of_words)

In [None]:
word_lengths.sum()

In [None]:
word_counts = (
    clean_RDD
      .map(lambda x: (x, 1))
      .reduceByKey(add))

In [None]:
word_counts.take(5)

In [None]:
word_counts = (
    clean_RDD
      .map(lambda x: x.lower())
      .map(lambda x: (x, 1))
      .reduceByKey(add))

In [None]:
word_counts.take(5)

In [None]:
# Ten (word, count) pairs with the largest counts: ordering by the negated
# count puts the biggest first. Indexing replaces the tuple-unpacking lambda,
# which is a syntax error on Python 3.
word_counts.takeOrdered(10, key=lambda kv: -kv[1])

In [None]:
class DonTryThisAtHome(object):
    """Example of a pattern to avoid: using a bound method as a Spark reducer."""

    def my_add(self, a, b):
        """Return ``a + b + 1`` (note the extra 1 added on every merge)."""
        return a + b + 1

    def sketchy_top_words(self, rdd, n):
        """Return the ``n`` (word, value) pairs with the largest reduced value.

        Words are lower-cased and paired with 1, then merged per word using
        ``my_add``; because ``my_add`` adds an extra 1 on every merge, the
        values are not plain occurrence counts.

        Args:
            rdd: RDD of word strings.
            n: Number of pairs to return.
        """
        return (rdd
               .map(lambda x: x.lower())
               .map(lambda x: (x, 1))
               # NOTE(review): presumably the "sketchy" part -- a bound method
               # drags ``self`` into the closure that Spark has to ship to the
               # workers; confirm against the Spark programming guide.
               .reduceByKey(self.my_add)
               # Honor ``n`` (previously hard-coded to 5) and index the pair
               # instead of using a Python 2-only tuple-unpacking lambda.
               .takeOrdered(n, key=lambda kv: -kv[1]))

In [None]:
DonTryThisAtHome().sketchy_top_words(clean_RDD, 5)

In [None]:
counter = 0


def my_count(_):
    """Bump the module-level ``counter`` by one; the argument is ignored."""
    global counter
    counter = counter + 1

In [None]:
my_count(123122)

In [None]:
counter

In [None]:
obamaRDD.foreach(my_count)

In [None]:
counter

In [None]:
sc.parallelize(['a', 1.2, []]).collect()

In [None]:
acc = sc.accumulator(0)

In [None]:
clean_RDD.foreach(lambda _: acc.add(1))

In [None]:
acc.value

## Monitor Performance

http://localhost:4040/jobs/

## Set-like Operations

In [None]:
a = sc.parallelize([1,2,3,4,4])
b = sc.parallelize([3,4,5])

In [None]:
a.distinct().collect()

In [None]:
a.union(b).collect()

In [None]:
a.intersection(b).collect()

In [None]:
a.cartesian(b).take(10)

In [None]:
a.mean(), a.stdev(), a.sum()

## Other Operations of note

In [None]:
sc.parallelize(range(100)).sample(True, 0.1).collect()

In [None]:
a = sc.parallelize([('a', 1), ('b', 2)])
b = sc.parallelize([('c', 3), ('b', 4)])
a.join(b).collect()

In [None]:
r = sc.parallelize([(1, 1), (1, 2), (2, 1)]).groupByKey().collect()

In [None]:
r

In [None]:
[(k, list(v)) for k,v in r]

## Aggregate

In [None]:
import numpy as np

In [None]:
def seqOp(acc, value):
    """Fold one value into a running ``(count, sum)`` accumulator.

    Args:
        acc: ``(N, partial_sum)`` pair accumulated so far.
        value: Next element to include.

    Returns:
        ``(N + 1, partial_sum + value)``.
    """
    # Unpack in the body: tuple parameters are a syntax error on Python 3.
    N, partial_sum = acc
    return N + 1, partial_sum + value


def combOp(left, right):
    """Merge two ``(count, sum)`` accumulators by adding them element-wise."""
    N1, partial_sum_1 = left
    N2, partial_sum_2 = right
    return N1 + N2, partial_sum_1 + partial_sum_2
values = np.random.standard_normal(1000)
N, value_sum = sc.parallelize(values).aggregate((0, 0), seqOp, combOp)

In [None]:
mean = value_sum / float(N)

In [None]:
mean

In [None]:
values.mean()

# Application: Search using tf-idf

In [None]:
def tf(term, RDD):
    """Return the fraction of elements in ``RDD`` equal to ``term``, ignoring case.

    Args:
        term: Word to look for; compared after lower-casing.
        RDD: Object exposing ``aggregate(zero, seqOp, combOp)`` over string
            elements (a Spark RDD of words in this notebook).

    Returns:
        ``matches / total`` as a float, or ``0.0`` when ``RDD`` has no
        elements (instead of raising ``ZeroDivisionError``).
    """
    tlower = term.lower()

    def seqOp(acc, value):
        # acc is (elements seen so far, matches so far). It is unpacked here
        # because tuple parameters are a syntax error on Python 3.
        N, term_count = acc
        if value.lower() == tlower:
            return N + 1, term_count + 1
        return N + 1, term_count

    def combOp(left, right):
        # Merge two partial (elements, matches) accumulators.
        return left[0] + right[0], left[1] + right[1]

    N, term_count = RDD.aggregate((0, 0), seqOp, combOp)
    if N == 0:
        return 0.0
    return float(term_count) / N

In [None]:
tf('and', clean_RDD)

In [None]:
tf('a', clean_RDD)

In [None]:
tf('obama', clean_RDD)

In [None]:
def makeRDD(url):
    """Download ``url`` and return an RDD of its word-like text tokens.

    The page is parsed with the 'lxml' parser, ``<script>`` and ``<style>``
    elements are removed, and the remaining text is split on whitespace.
    Only tokens that start with a word character are kept.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        The filtered RDD created through the notebook's ``sc`` context.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently indexing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    for unwanted in soup(['script', 'style']):
        unwanted.decompose()
    text = soup.get_text()
    # Raw string: '\w' is a regex escape, not a valid string escape.
    return sc.parallelize(text.split()).filter(lambda x: re.match(r'\w+', x))

In [None]:
bushRDD = makeRDD('https://en.wikipedia.org/wiki/George_W._Bush')

In [None]:
tf('obama', bushRDD)

In [None]:
trumpRDD = makeRDD('https://en.wikipedia.org/wiki/Donald_Trump')

In [None]:
def search(term):
    """Print the term frequency of ``term`` for the Obama, Bush and Trump RDDs.

    One line per document is printed in the form ``<name> <frequency>``.
    """
    for RDD, name in [(clean_RDD, 'obama'), (bushRDD, 'bush'), (trumpRDD, 'trump')]:
        # A single pre-formatted argument prints the same "name value" line
        # under both the Python 2 print statement and the Python 3 function.
        print('%s %s' % (name, tf(term, RDD)))

In [None]:
search('hotels')

In [None]:
search('war')

In [None]:
search('hawaii')