## TASK

Create MapReduce Job which will calculate median and standard deviation of length of the stackoverflow badge name per day. If possible try to leverage the combiner optimization technique. 

NOTE: Before you try working on Map Reduce Job try to achieve the same using pure python and working only on a single `0.xml` (available in the current directory).

In [19]:
from glob import glob
from datetime import datetime
from uuid import uuid4
import statistics

from delorean import parse
import requests
import xmltodict

from job import Job


get_stackoverflow_badges_uri = (
    'https://s3.eu-central-1.amazonaws.com/learning.big.data/stackoverflow-badges/{}'.format)

## Records Reader

In [4]:
def record_reader(line):
    record = dict(xmltodict.parse(line.decode('utf-8'))['row'])        
    yield (
        record['@Id'],
        {k.replace('@', '').lower(): v for k, v in record.items()},
    )            

In [5]:
# -- test `record_reader`
response = requests.get(get_stackoverflow_badges_uri('0.xml'), stream=True)

records = []
for line in response.iter_lines():
    if line:
        records.append(next(record_reader(line)))
        
print(records[:2])

[('26066242', {'id': '26066242', 'userid': '8125167', 'name': 'Supporter', 'date': '2017-11-28T19:34:25.047', 'class': '3', 'tagbased': 'False'}), ('26066243', {'id': '26066243', 'userid': '9006638', 'name': 'Supporter', 'date': '2017-11-28T19:34:25.047', 'class': '3', 'tagbased': 'False'})]


## Mapper

In [14]:
def mapper(key, value):
    yield (
        parse(value['date']).date.strftime('%Y-%m-%d'), 
        len(value['name']),
    )

In [15]:
# -- test mapper
[next(mapper(key, value)) for key, value in records[:2]]

[('2017-11-28', 9), ('2017-11-28', 9)]

## Reducer

In [23]:
def reducer(key, values):
    median = statistics.median(values)
    
    stddev = None
    if len(values) > 1:
        stddev = statistics.stdev(values)
    
    yield (key, {'median': median, 'stddev': stddev})

In [24]:
# -- test reducer
next(reducer(
    '2017-11-28', 
    [11, 3, 5]))

('2017-11-28', {'median': 5, 'stddev': 4.163331998932266})

## Job

In [29]:
Job(
    input_uris=[
        get_stackoverflow_badges_uri('0.xml'),
        get_stackoverflow_badges_uri('1.xml'),
        get_stackoverflow_badges_uri('2.xml'),   
        get_stackoverflow_badges_uri('3.xml'),           
    ], 
    record_reader=record_reader,
    mapper=mapper,  
    reducer=reducer,
).run()




JOB ID: 582faa8e-d427-4ed2-a42d-a5d402fa10bc

INPUT SIZE: 46239976

OUTPUT PATH: /home/jovyan/work/map_reduce/.outputs/582faa8e-d427-4ed2-a42d-a5d402fa10bc

EXECUTION TIME: 35.826

MAX SHUFFLE SIZE: 1366064

FILES:
+----------------------------+--------------+
|          filename          | size (bytes) |
| mapper_0__partition_0.json | 54252        |
+----------------------------+--------------+
| mapper_0__partition_1.json | 91398        |
+----------------------------+--------------+
| mapper_0__partition_2.json | 177546       |
+----------------------------+--------------+
| mapper_0__partition_3.json | 25636        |
+----------------------------+--------------+
| mapper_1__partition_0.json | 95311        |
+----------------------------+--------------+
| mapper_1__partition_1.json | 72618        |
+----------------------------+--------------+
| mapper_1__partition_2.json | 89980        |
+----------------------------+--------------+
| mapper_1__partition_3.json | 88924        |


In [30]:
!head -n 100 ./.outputs/582faa8e-d427-4ed2-a42d-a5d402fa10bc/reducer_2.json

{"key": "2017-11-29", "value": {"median": 9, "stddev": 3.6499380305093303}}
{"key": "2017-06-07", "value": {"median": 14, "stddev": null}}
{"key": "2017-03-12", "value": {"median": 9, "stddev": 3.5674056089547146}}
{"key": "2017-04-12", "value": {"median": 10.0, "stddev": 3.751857956304789}}
{"key": "2017-05-12", "value": {"median": 10, "stddev": 3.634443025668692}}
{"key": "2016-09-27", "value": {"median": 9, "stddev": 3.5544175299423855}}
{"key": "2016-07-29", "value": {"median": 9.0, "stddev": 3.5938836835244454}}
{"key": "2016-07-09", "value": {"median": 11, "stddev": null}}
{"key": "2016-07-28", "value": {"median": 9, "stddev": 3.650370422739903}}
{"key": "2016-03-10", "value": {"median": 9, "stddev": 3.6000884968531026}}
{"key": "2016-07-30", "value": {"median": 9, "stddev": 3.389473575449482}}
{"key": "2016-01-08", "value": {"median": 9.0, "stddev": 3.6520613406462354}}
{"key": "2015-11-19", "value": {"median": 8.0, "stddev": 3.61529728895176}}
