# MapReduce

## ... for pedestrians

In [None]:
def square(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

In [None]:
map(square, range(10))

In [None]:
[x**2 for x in range(10)]

In [None]:
map(lambda x: x**2, range(10))

In [None]:
def my_add(x, y):
    """Return ``x + y``; serves as the binary operator handed to ``reduce``."""
    total = x + y
    return total

In [None]:
# ``reduce`` is a built-in only on Python 2; ``functools.reduce`` is available
# on Python 2.6+ and 3.x alike, so import it explicitly to stay portable.
from functools import reduce
reduce(my_add, range(10))

## My first MapReduce

In [None]:
reduce(my_add, map(square, range(10)))

### but what about the keys?

In [None]:
pairs = map(lambda x: (x % 2 == 0, x**2 + x), range(10))
pairs

In [None]:
from collections import defaultdict
def reduceByKey(reduce_fn, iterable):
    """Group ``(key, value)`` pairs by key and fold each group with ``reduce_fn``.

    Args:
        reduce_fn: Binary function ``f(a, b)`` that combines two values.
        iterable: Iterable of ``(key, value)`` pairs; it is consumed once.

    Returns:
        A ``defaultdict`` mapping each key to its reduced value.  A key whose
        group holds a single value maps to that value unchanged.
    """
    # ``reduce`` is a built-in only on Python 2; ``functools.reduce`` exists on
    # Python 2.6+ and 3.x, so import it locally to keep this cell portable.
    from functools import reduce

    chunks = defaultdict(list)
    for k, v in iterable:
        chunks[k].append(v)
    for k in chunks:
        # Rebinding the value of an existing key does not resize the dict,
        # so doing it while iterating over the keys is safe.
        chunks[k] = reduce(reduce_fn, chunks[k])
    return chunks

In [None]:
reduceByKey(my_add, pairs)

## Starting Spark Locally

    PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ~/src/spark-2.1.0-bin-hadoop2.7/bin/pyspark 

## The spark context

In [None]:
sc

## Baby steps

In [None]:
RDD = sc.parallelize([1,2,3,4,5])

In [None]:
RDD

In [None]:
RDD.collect()

In [None]:
RDD.count()

In [None]:
RDD.map(lambda x: x**2).collect()

## Loading data

In [None]:
wine = sc.textFile('data/winequality-white.csv')

In [None]:
wine.take(5)

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
obama_url = 'https://en.wikipedia.org/wiki/Barack_Obama'
obama_soup = BeautifulSoup(requests.get(obama_url).text, 'lxml')
for unwanted in obama_soup(['script', 'style']):
    unwanted.decompose()
obama_text = obama_soup.get_text()

In [None]:
obamaRDD = sc.parallelize(obama_text.split())

In [None]:
obamaRDD.take(5)

## Basic RDD operations

In [None]:
word_lengths = obamaRDD.map(lambda x: len(x))

In [None]:
word_lengths.take(5)

In [None]:
import re

In [None]:
# Keep only tokens that start with a word character.  The pattern is a raw
# string so that ``\w`` is not treated as an (invalid) string escape.
clean_RDD = obamaRDD.filter(lambda x: re.match(r'\w+', x))

In [None]:
clean_RDD.take(5)

In [None]:
word_lengths = clean_RDD.map(lambda x: len(x))

In [None]:
word_count = word_lengths.count()
word_count

In [None]:
text_length = word_lengths.reduce(lambda x, y: x + y)

In [None]:
text_length

In [None]:
# mean word length
text_length / float(word_count)

In [None]:
from operator import add

In [None]:
add(1,2)

In [None]:
word_lengths.reduce(add)

In [None]:
word_counts = (
    obamaRDD
      .map(lambda x: (x, 1))
      .reduceByKey(add))

In [None]:
word_counts.take(5)

In [None]:
# Five most frequent words: order by descending count.  The key indexes into
# the (word, count) pair because tuple-unpacking lambda parameters are
# Python 2-only syntax.
word_counts.takeOrdered(5, key=lambda kv: -kv[1])

In [None]:
# Case-insensitive top five: lower-case each word, drop tokens that do not
# start with a word character, merge the counts of words that now coincide,
# then order by descending count.  Pairs are indexed (kv[0] = word,
# kv[1] = count) because tuple-unpacking lambda parameters are Python 2-only.
(word_counts
 .map(lambda kv: (kv[0].lower(), kv[1]))
 .filter(lambda kv: re.match(r'\w+', kv[0]))
 .reduceByKey(add)
 .takeOrdered(5, key=lambda kv: -kv[1]))

In [None]:
class DontTryThisAtHome(object):
    """Demo class that counts words by passing a bound method to Spark.

    As the name says, this is shown as something to avoid, not to copy.
    NOTE(review): handing ``self.my_add`` to ``reduceByKey`` presumably makes
    Spark serialize the whole instance along with the method -- confirm
    against the Spark programming guide.
    """

    def my_add(self, a, b):
        """Return ``a + b + 1``.

        NOTE(review): the extra ``+ 1`` means merged "counts" are larger than
        a plain sum would give; this looks deliberate for the demo -- confirm.
        """
        return a + b + 1

    def sketchy_top_words(self, rdd, n):
        """Return the ``n`` pairs with the largest value after reducing by key.

        Args:
            rdd: Object exposing ``map``/``reduceByKey``/``takeOrdered``
                (an RDD of words in this notebook).
            n: Number of ``(word, value)`` pairs to return.
        """
        return (rdd
                .map(lambda x: (x, 1))
                .reduceByKey(self.my_add)
                # Index into the pair: tuple-unpacking lambda parameters
                # are Python 2-only syntax.
                .takeOrdered(n, key=lambda kv: -kv[1]))

In [None]:
DontTryThisAtHome().sketchy_top_words(obamaRDD, 10)

In [None]:
counter = 0
def my_count(_):
    """Increment the module-level ``counter`` by one, ignoring the argument.

    NOTE(review): the notebook passes this to ``RDD.foreach``; under Spark's
    closure semantics the increments presumably happen on executor-side
    copies rather than on the driver's ``counter`` -- confirm.
    """
    global counter
    counter = counter + 1

In [None]:
obamaRDD.foreach(my_count)

In [None]:
counter

In [None]:
acc = sc.accumulator(0)

In [None]:
obamaRDD.foreach(lambda _: acc.add(1))

In [None]:
acc.value

## Monitor Performance

http://localhost:4040/jobs/

## Set-like Operations

In [None]:
a = sc.parallelize([1,2,3,4,4])
b = sc.parallelize([3,4,5])

In [None]:
a.distinct().collect()

In [None]:
a.union(b).collect()

In [None]:
a.intersection(b).collect()

In [None]:
a.cartesian(b).collect()

## Other Operations of note

In [None]:
sc.parallelize(range(100)).sample(True, 0.1).collect()

In [None]:
a = sc.parallelize([('foo', 1), ('bar', 1)])
b = sc.parallelize([('bar', 2), ('baz', 2)])
a.join(b).collect()

In [None]:
r = sc.parallelize([(1, 1), (1, 2), (2, 1)]).groupByKey().collect()
r

In [None]:
# groupByKey yields (key, iterable-of-values) pairs; materialise each group
# as a list so its contents are printed.  The %-formatted call prints the
# same text on Python 2 and Python 3.
for i, j in r:
    print('%s %s' % (i, list(j)))

## Aggregate

In [None]:
import numpy as np

In [None]:
def seqOp(acc, value):
    """Fold one ``value`` into a running ``(count, sum)`` accumulator.

    Args:
        acc: Pair ``(N, partial_sum)`` accumulated so far.
        value: Next element to include.

    Returns:
        The updated pair ``(N + 1, partial_sum + value)``.
    """
    # Unpack in the body: tuple parameters in a ``def`` signature are
    # Python 2-only syntax.
    N, partial_sum = acc
    return N + 1, partial_sum + value
def combOp(left, right):
    """Merge two ``(count, sum)`` accumulators into one.

    Args:
        left: Pair ``(N1, partial_sum1)``.
        right: Pair ``(N2, partial_sum2)``.

    Returns:
        The pair ``(N1 + N2, partial_sum1 + partial_sum2)``.
    """
    N1, partial_sum1 = left
    N2, partial_sum2 = right
    return N1 + N2, partial_sum1 + partial_sum2

In [None]:
values = np.random.standard_normal(100)
N, total_sum = sc.parallelize(values).aggregate((0, 0), seqOp, combOp)

In [None]:
# Compare the mean from the Spark aggregate with NumPy's own result.
# The %-formatted call prints the same text on Python 2 and Python 3.
mean = total_sum/N
print('%s %s' % (mean, values.mean()))

# Application: Search using tf-idf

In [None]:
def tf(term, RDD):
    """Return the relative frequency of ``term`` among the word tokens of ``RDD``.

    Only tokens that start with a word character are counted, and the
    comparison with ``term`` is case-insensitive.

    Args:
        term: The word to look for.
        RDD: Object exposing ``filter`` and ``aggregate`` (an RDD of token
            strings in this notebook).

    Returns:
        ``occurrences / number_of_word_tokens`` as a float, or ``0.0`` when
        no word tokens are left after filtering.
    """
    tlower = term.lower()

    def seqOp(acc, token):
        # Unpack in the body: tuple parameters are Python 2-only syntax.
        N, term_count = acc
        if token.lower() == tlower:
            return (N + 1, term_count + 1)
        else:
            return (N + 1, term_count)

    def combOp(left, right):
        N1, c1 = left
        N2, c2 = right
        return (N1 + N2, c1 + c2)

    # Raw string so that ``\w`` is not read as an (invalid) string escape.
    wordsOnly = RDD.filter(lambda x: re.match(r'\w+', x))
    N, term_count = wordsOnly.aggregate((0, 0), seqOp, combOp)
    if N == 0:
        # Nothing to count: avoid dividing by zero.
        return 0.0
    return float(term_count) / N

In [None]:
tf('obama', obamaRDD)

In [None]:
def makeRDD(url, timeout=30):
    """Download a web page and return its visible text as an RDD of tokens.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The result of ``sc.parallelize`` on the whitespace-separated tokens
        of the page text, with ``<script>`` and ``<style>`` content removed.

    Raises:
        requests.exceptions.RequestException: On timeout, connection failure
            or an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than tokenising an error page as if it
    # were the requested document.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    for unwanted in soup(['script', 'style']):
        unwanted.decompose()
    text = soup.get_text()
    return sc.parallelize(text.split())

In [None]:
BushRDD = makeRDD('https://en.wikipedia.org/wiki/George_W._Bush')
TrumpRDD = makeRDD('https://en.wikipedia.org/wiki/Donald_Trump')

In [None]:
tf('obama', BushRDD)

In [None]:
tf('obama', TrumpRDD)

In [None]:
RDDS = [BushRDD, TrumpRDD, obamaRDD]
names = 'Bush Trump Obama'.split()

In [None]:
# Term frequency of "war" per document.  The %-formatted call prints the
# same text on Python 2 and Python 3.
for name, RDD in zip(names, RDDS):
    print('%s %s' % (name, tf('war', RDD)))

In [None]:
# Term frequency of "hawaii" per document; a single-argument print call
# behaves identically on Python 2 and Python 3.
for RDD in RDDS:
    print(tf('hawaii', RDD))

In [None]:
# Term frequency of "hotels" per document; a single-argument print call
# behaves identically on Python 2 and Python 3.
for RDD in RDDS:
    print(tf('hotels', RDD))

In [None]:
from math import log

In [None]:
def idf(term, corpus):
    """Return the inverse document frequency of ``term`` over ``corpus``.

    Computed as ``log(1 + N / df)``, where ``N`` is the number of documents
    and ``df`` the number of documents whose ``tf(term, doc)`` is positive.

    Args:
        term: The word to score.
        corpus: Sequence of documents accepted by ``tf``.

    Returns:
        The idf value as a float, or ``0.0`` when no document contains the
        term (which includes the empty corpus).
    """
    N = float(len(corpus))
    doc_freq = sum(1 for RDD in corpus if tf(term, RDD) > 0)
    if doc_freq == 0:
        # The term occurs nowhere, so every tf is 0 and any tf*idf product is
        # 0 as well; return 0.0 instead of dividing by zero.
        return 0.0
    return log(1 + N / doc_freq)

In [None]:
def search(term):
    """Print each document's name with its tf-idf score for ``term``.

    Iterates over the module-level ``names`` and ``RDDS`` in lockstep and
    prints ``tf * idf * 100`` for each document.

    Args:
        term: The word to score against every document.
    """
    i = idf(term, RDDS)
    for name, RDD in zip(names, RDDS):
        t = tf(term, RDD)
        # %-formatted call: prints the same text on Python 2 and Python 3.
        print('%s %s' % (name, t*i*100))

In [None]:
search('hotels')

In [None]:
search('hawaii')

In [None]:
search('war')

In [None]:
search('a')

In [None]:
search('the')

In [None]:
search('trump')

In [None]:
search('obama')

## DataFrames

In [None]:
with open('data/winequality-white.csv') as wine_file:
    header = wine_file.readline()
    wines = sc.parallelize(wine_file.readlines())

In [None]:
wines.take(5)

In [None]:
header = [i.strip().replace('"','') for i in header.split(';')]

In [None]:
header

In [None]:
from pyspark.sql import Row

In [None]:
Row(name='Jack Sparrow', rank='Captain')

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("STK-INF4000 White Wine Winners") \
    .getOrCreate()

In [None]:
(wines
 .map(lambda x: [float(i.strip()) for i in x.split(';')])
 .map(lambda x: Row(**{label: value for label, value in zip(header, x)}))
 .take(2))

In [None]:
wineDF = spark.createDataFrame(
wines
 .map(lambda x: [float(i.strip()) for i in x.split(';')])
 .map(lambda x: Row(**{label: value for label, value in zip(header, x)}))
)

In [None]:
wineDF['alcohol', 'quality', 'residual sugar'].show(5)

In [None]:
wineDF.select(wineDF['alcohol'] + 1).show(5)

In [None]:
wineDF.filter(wineDF['alcohol'] > 11).select('quality').show(5)

In [None]:
from pyspark.sql import functions as func

In [None]:
wineDF.groupBy(wineDF['quality']).agg(func.mean('alcohol'), func.count('quality')).orderBy('quality').show()