## TASK

Create MapReduce Job which will calculate average length of the stackoverflow badge name per day. If possible try to leverage the combiner optimization technique. 

NOTE: Before you try working on Map Reduce Job try to achieve the same using pure python and working only on a single `0.xml` (available in the current directory).

In [1]:
from glob import glob
from datetime import datetime
from uuid import uuid4
from delorean import parse

import requests
import xmltodict

from job import Job


get_stackoverflow_badges_uri = (
    'https://s3.eu-central-1.amazonaws.com/learning.big.data/stackoverflow-badges/{}'.format)

## Records Reader

In [2]:
def record_reader(line):
    record = dict(xmltodict.parse(line.decode('utf-8'))['row'])        
    yield (
        record['@Id'],
        {k.replace('@', '').lower(): v for k, v in record.items()},
    )            

In [3]:
# -- test `record_reader`
response = requests.get(get_stackoverflow_badges_uri('0.xml'), stream=True)

records = []
for line in response.iter_lines():
    if line:
        records.append(next(record_reader(line)))
        
print(records[:2])

[('26066242', {'id': '26066242', 'userid': '8125167', 'name': 'Supporter', 'date': '2017-11-28T19:34:25.047', 'class': '3', 'tagbased': 'False'}), ('26066243', {'id': '26066243', 'userid': '9006638', 'name': 'Supporter', 'date': '2017-11-28T19:34:25.047', 'class': '3', 'tagbased': 'False'})]


## Mapper

In [10]:
def mapper(key, value):
    yield (
        parse(value['date']).date.strftime('%Y-%m-%d'), 
        {'avg': len(value['name']), 'count': 1},
    )

In [11]:
# -- test mapper
[next(mapper(key, value)) for key, value in records[:10]]

[('2017-11-28', {'avg': 9, 'count': 1}),
 ('2017-11-28', {'avg': 9, 'count': 1}),
 ('2017-11-28', {'avg': 9, 'count': 1}),
 ('2017-11-28', {'avg': 9, 'count': 1}),
 ('2017-11-28', {'avg': 10, 'count': 1}),
 ('2017-11-28', {'avg': 7, 'count': 1}),
 ('2017-11-28', {'avg': 7, 'count': 1}),
 ('2017-11-28', {'avg': 7, 'count': 1}),
 ('2017-11-28', {'avg': 7, 'count': 1}),
 ('2017-11-28', {'avg': 8, 'count': 1})]

## Reducer

In [16]:
def reducer(key, values):
    
    numerator = 0
    count = 0
    for value in values:
        numerator += value['avg'] * value['count'] 
        count += value['count']
        
    yield (key, {'avg': numerator / count, 'count': count})

In [17]:
# -- test reducer
next(reducer(
    '2017-11-28', 
    [
        {'avg': 11, 'count': 1},
        {'avg': 12, 'count': 1},
        {'avg': 15, 'count': 2},        
    ]))

('2017-11-28', {'avg': 13.25, 'count': 4})

## Job

In [21]:
Job(
    input_uris=[
        get_stackoverflow_badges_uri('0.xml'),
        get_stackoverflow_badges_uri('1.xml'),
        get_stackoverflow_badges_uri('2.xml'),   
        get_stackoverflow_badges_uri('3.xml'),           
    ], 
    record_reader=record_reader,
    mapper=mapper,  
    combiner=reducer,
    reducer=reducer,
).run()




JOB ID: 723a95b8-d601-4fb8-8c57-c561bedcd3fe

INPUT SIZE: 46239976

OUTPUT PATH: /home/jovyan/work/map_reduce/.outputs/723a95b8-d601-4fb8-8c57-c561bedcd3fe

EXECUTION TIME: 39.0

MAX SHUFFLE SIZE: 3664

FILES:
+----------------------------+--------------+
|          filename          | size (bytes) |
| mapper_0__partition_0.json | 157          |
+----------------------------+--------------+
| mapper_0__partition_1.json | 340          |
+----------------------------+--------------+
| mapper_0__partition_2.json | 375          |
+----------------------------+--------------+
| mapper_0__partition_3.json | 78           |
+----------------------------+--------------+
| mapper_1__partition_0.json | 281          |
+----------------------------+--------------+
| mapper_1__partition_1.json | 156          |
+----------------------------+--------------+
| mapper_1__partition_2.json | 341          |
+----------------------------+--------------+
| mapper_1__partition_3.json | 600          |
+----

In [25]:
!head -n 100 ./.outputs/`ls -t .outputs | head -1`/reducer_3.json

{"key": "2015-11-20", "value": {"avg": 9.456561922365989, "count": 1082}}
{"key": "2017-02-12", "value": {"avg": 10.139097744360901, "count": 7448}}
{"key": "2016-09-23", "value": {"avg": 11.0, "count": 1}}
{"key": "2016-09-28", "value": {"avg": 10.316856950973808, "count": 14890}}
{"key": "2012-08-31", "value": {"avg": 14.0, "count": 1}}
{"key": "2016-02-10", "value": {"avg": 9.873970473970473, "count": 6435}}
{"key": "2016-07-25", "value": {"avg": 11.0, "count": 3}}
{"key": "2016-07-27", "value": {"avg": 10.340616288706396, "count": 13273}}
{"key": "2016-07-26", "value": {"avg": 11.0, "count": 1}}
{"key": "2014-05-23", "value": {"avg": 11.0, "count": 1}}
{"key": "2016-05-10", "value": {"avg": 10.283031829631327, "count": 4367}}
