In [28]:
import os
from glob import glob
from uuid import uuid4

from faker import Faker

# Seed Faker's shared random generator so the generated fake data (and hence
# the word counts computed from it) is the same after every kernel restart.
SEED = 42
Faker.seed(SEED)

faker = Faker()

## Prepare fake data

In [45]:
# Make sure the target directory exists; open(..., 'w') does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
os.makedirs('./wordcount', exist_ok=True)

# Write 100 input files (0.txt .. 99.txt), each holding the text returned by
# one `faker.paragraph(100)` call.
for i in range(100):
    with open(f'./wordcount/{i}.txt', 'w') as f:
        f.write(faker.paragraph(100))

In [60]:
# List the contents of the ./wordcount/ input directory.
!ls ./wordcount/

0.txt	19.txt	28.txt	37.txt	46.txt	55.txt	64.txt	73.txt	82.txt	91.txt
10.txt	1.txt	29.txt	38.txt	47.txt	56.txt	65.txt	74.txt	83.txt	92.txt
11.txt	20.txt	2.txt	39.txt	48.txt	57.txt	66.txt	75.txt	84.txt	93.txt
12.txt	21.txt	30.txt	3.txt	49.txt	58.txt	67.txt	76.txt	85.txt	94.txt
13.txt	22.txt	31.txt	40.txt	4.txt	59.txt	68.txt	77.txt	86.txt	95.txt
14.txt	23.txt	32.txt	41.txt	50.txt	5.txt	69.txt	78.txt	87.txt	96.txt
15.txt	24.txt	33.txt	42.txt	51.txt	60.txt	6.txt	79.txt	88.txt	97.txt
16.txt	25.txt	34.txt	43.txt	52.txt	61.txt	70.txt	7.txt	89.txt	98.txt
17.txt	26.txt	35.txt	44.txt	53.txt	62.txt	71.txt	80.txt	8.txt	99.txt
18.txt	27.txt	36.txt	45.txt	54.txt	63.txt	72.txt	81.txt	90.txt	9.txt


In [52]:
# Peek at one sample input file: print only the first 300 characters of each line.
!cut -c-300 ./wordcount/12.txt

Repellendus voluptas molestias soluta quod aliquam. Delectus in cum architecto sint ullam. Ipsa dolor expedita ipsum reiciendis placeat. Magni enim voluptas voluptates doloremque eaque pariatur nam. Explicabo assumenda labore corrupti maiores explicabo. Ut inventore praesentium eum deleniti. Quo ea 


## Define Map Reduce Job runner

In [53]:
class Context:
    """Accumulates the key/value pairs emitted during a map or reduce phase."""

    def __init__(self):
        # Emitted pairs, kept in emission order as two-item lists.
        self.events = []

    def write(self, key, value):
        """Record one emitted ``[key, value]`` pair."""
        pair = [key, value]
        self.events.append(pair)

    def collect_to_dict(self):
        """Group all recorded values by their key.

        Returns:
            dict: maps each key to the list of values written for it, with
            values in the order they were written.
        """
        grouped = {}
        for key, value in self.events:
            grouped.setdefault(key, []).append(value)
        return grouped
    
    
class Job:
    """Minimal single-process MapReduce runner over local text files.

    Args:
        input_path: glob pattern selecting the input files.
        mapper: callable invoked as ``mapper(key=filepath, value=contents,
            context=ctx)``; it emits pairs through ``ctx.write(key, value)``.
        reducer: callable invoked as ``reducer(key, values, ctx)`` once per
            distinct key emitted by the mapper; it emits results through
            ``ctx.write(key, value)``.
        output_dir: directory the result file is written to. It is created
            on demand. Defaults to ``'./outputs'``.
    """

    def __init__(self, input_path, mapper, reducer, output_dir='./outputs'):
        self.input_path = input_path
        self.mapper = mapper
        self.reducer = reducer
        self.output_dir = output_dir

    def run(self):
        """Run the map, reduce and output phases.

        Returns:
            str: path of the written output file. The file holds one
            ``key,value`` line per pair emitted by the reducer.
        """
        # -- map
        mapper_context = Context()
        # glob() returns matches in arbitrary order; sorting makes the order
        # of emitted keys (and therefore of the output lines) reproducible.
        for filepath in sorted(glob(self.input_path)):
            with open(filepath, 'r') as f:
                self.mapper(
                    key=filepath,
                    value=f.read(),
                    context=mapper_context)

        # -- reduce
        reducer_context = Context()
        for key, values in mapper_context.collect_to_dict().items():
            self.reducer(key, values, reducer_context)

        # -- write output
        # open(..., 'w') does not create missing directories, so make sure
        # the output directory exists before writing.
        os.makedirs(self.output_dir, exist_ok=True)
        # A random UUID file name keeps successive runs from overwriting
        # each other's results.
        output_path = f'{self.output_dir}/{uuid4()}.txt'
        with open(output_path, 'w') as f:
            output = '\n'.join(
                '{0},{1}'.format(*event)
                for event in reducer_context.events
            )
            f.write(output)

        return output_path

## Define Mapper and Reducer

In [56]:
def mapper(key, value, context):
    """Emit ``(word, 1)`` for every word found in ``value``.

    The text is lower-cased, periods are removed, and the result is split on
    whitespace. ``key`` is accepted but not used.
    """
    normalized = value.lower().replace('.', '')
    for token in normalized.split():
        context.write(token, 1)

        
def reducer(key, values, context):
    """Emit ``key`` together with the sum of all ``values`` collected for it."""
    total = sum(values)
    context.write(key, total)

In [57]:
# Keep the returned path in a variable so later cells can refer to this run's
# output instead of a path copied from an earlier execution.
output_path = Job('./wordcount/*.txt', mapper, reducer).run()
output_path

'./outputs/7d45da75-82d4-4e3a-ad7c-ba521d9088f9.txt'

In [58]:
# Preview the first lines of the file written by the job run above.
!head -n 10 {output_path}

culpa,330
ipsum,326
tenetur,350
quas,363
distinctio,355
sit,346
sed,348
cumque,372
repellendus,321
deleniti,372


## Variations --> Homework

1. Instead of using the local file system, perform all I/O through S3 (or Azure Blob Storage) --> FIXME: prepare sample files!
1. Try to parallelize the mapper and reducer (save context to files)
1. Think about this job running on multiple machines 