In [1]:
from glob import glob
from uuid import uuid4

import requests
import xmltodict

import engine


# URI template for the badge files; `{}` is filled with a file name such as '0.xml'.
_STACKOVERFLOW_BADGES_URI_TEMPLATE = (
    'https://s3.eu-central-1.amazonaws.com/learning.big.data/stackoverflow-badges/{}')

# Bound `str.format` of the template: call it with a file name to get the full URI.
get_stackoverflow_badges_uri = _STACKOVERFLOW_BADGES_URI_TEMPLATE.format

## Record Reader

In [2]:
def record_reader(response):
    """Convert a line-oriented XML response into a list of ``(id, fields)`` pairs.

    Each non-empty line produced by ``response.iter_lines()`` is decoded as
    UTF-8 and parsed with ``xmltodict``; the parsed document's ``'row'`` entry
    supplies the attributes of one record.

    Args:
        response: Object exposing ``iter_lines()`` that yields ``bytes`` lines
            (the notebook passes a streamed ``requests`` response).

    Returns:
        list: One ``(row_id, fields)`` tuple per non-empty line. ``row_id`` is
        the row's ``'@Id'`` value; ``fields`` maps every attribute name, with
        ``'@'`` removed and lower-cased, to its value.

    Raises:
        KeyError: If a parsed line has no ``'row'`` entry or the row has no
            ``'@Id'`` attribute.
    """
    def parse_row(raw_line):
        # One line holds one record; its attributes arrive keyed as '@Name'.
        attributes = dict(xmltodict.parse(raw_line.decode('utf-8'))['row'])
        return (
            attributes['@Id'],
            {name.replace('@', '').lower(): content
             for name, content in attributes.items()},
        )

    # Blank (keep-alive / empty) lines are skipped.
    return [parse_row(raw_line) for raw_line in response.iter_lines() if raw_line]

In [3]:
# -- test `record_reader`
# Stream the first badge file. The `with` block releases the connection once the
# records have been read, and `raise_for_status` fails fast on an HTTP error
# instead of handing an error body to the XML parser.
with requests.get(get_stackoverflow_badges_uri('0.xml'), stream=True) as response:
    response.raise_for_status()
    records = record_reader(response)

# Show only the first two records to keep the cell output short.
records[:2]

[('26066242',
  {'id': '26066242',
   'userid': '8125167',
   'name': 'Supporter',
   'date': '2017-11-28T19:34:25.047',
   'class': '3',
   'tagbased': 'False'}),
 ('26066243',
  {'id': '26066243',
   'userid': '9006638',
   'name': 'Supporter',
   'date': '2017-11-28T19:34:25.047',
   'class': '3',
   'tagbased': 'False'})]

## Mapper

In [4]:
def mapper(key, value, context):
    """Emit one ``(userid, name)`` pair for a single record.

    Args:
        key: Record key; not used by this mapper.
        value: Mapping with at least the ``'userid'`` and ``'name'`` entries.
        context: Object whose ``write(key, value)`` method receives the pair.
    """
    user_id, badge_name = value['userid'], value['name']
    context.write(user_id, badge_name)

In [5]:
# -- test mapper
# Run the mapper over the first ten sample records and inspect what it emitted.
context = engine.Context()
for record in records[:10]:
    record_key, record_value = record
    mapper(record_key, record_value, context=context)

context.events

[['8125167', 'Supporter'],
 ['9006638', 'Supporter'],
 ['4892968', 'Supporter'],
 ['3204673', 'Supporter'],
 ['1108484', 'Taxonomist'],
 ['3203282', 'Teacher'],
 ['3926187', 'Teacher'],
 ['4134228', 'Teacher'],
 ['8474041', 'Teacher'],
 ['9019981', 'Informed']]

## Reducer

In [10]:
def reducer(key, values, context):
    """Count how often each value occurs for ``key`` and emit the tally.

    Args:
        key: Key shared by all ``values``; written back unchanged.
        values: Iterable of hashable values to count.
        context: Object whose ``write(key, value)`` method receives ``key``
            and a dict mapping each distinct value to its occurrence count
            (in first-seen order).
    """
    tally = {}
    for item in values:
        tally[item] = tally.get(item, 0) + 1

    context.write(key, tally)

In [12]:
# -- test reducer
# One user with a repeated badge: 'Teacher' should be counted twice.
context = engine.Context()
reducer(
    key='9019981',
    values=['Teacher', 'Informed', 'Teacher'],
    context=context,
)

context.events

[['9019981', {'Teacher': 2, 'Informed': 1}]]

In [13]:
# Run the full job over three badge files; the cell's value is whatever
# `run()` returns (shown as the cell output).
engine.Job(
    input_uris=[
        get_stackoverflow_badges_uri(file_name)
        for file_name in ('10.xml', '11.xml', '12.xml')
    ],
    record_reader=record_reader,
    mapper=mapper,
    reducer=reducer,
).run()

'./.outputs/cefd6403-26ec-4adf-a4d3-2d6c9793ac29.txt'

In [14]:
# Peek at the first 10 lines of the job output. The path below is the one the
# `engine.Job(...).run()` cell above displayed for that particular run; if the
# job is re-run and reports a different path, update it here.
!head -n 10 ./.outputs/cefd6403-26ec-4adf-a4d3-2d6c9793ac29.txt

486228,{'Nice Answer': 1, 'Yearling': 1, 'Revival': 1, 'Enthusiast': 1}
572,{'Nice Question': 5, 'Popular Question': 1, 'Notable Question': 3, 'Favorite Question': 1, 'Great Question': 1}
468746,{'Popular Question': 1, 'Nice Question': 1, 'Peer Pressure': 1}
915824,{'Popular Question': 1}
666891,{'Popular Question': 2}
758116,{'Popular Question': 1}
804626,{'Student': 1}
778388,{'Supporter': 1, 'Editor': 1, 'Student': 1}
1867931,{'Teacher': 1}
594572,{'Teacher': 1}
