## Most used vowels in a text

### Configuration

In [None]:
# Working directory that holds the generated job script, its output and its log.
mydir = "mymrjob"
%env mydir = $mydir
# Input text file passed to the job.
myinput = "../data/txt/2261.txt.utf-8"
%env myinput $myinput
# Path of the job script (the target of the %%writefile cell).
myscript = mydir + "/most_used_vowels.py"
%env myscript $myscript

# Create the working directory, then export the output and log file paths.
%system mkdir -p $mydir
%env myoutput $mydir/out.txt
%env mylog $mydir/out.log

In [None]:
%%writefile $myscript

from mrjob.job import MRJob
from mrjob.step import MRStep
import re


class MRVowelsCount(MRJob):
    """Two-step MapReduce job that finds the most frequent vowel in a text.

    Step 1 counts how often each of the vowels ``a``, ``e``, ``i``, ``o``
    and ``u`` occurs (case-insensitively). Step 2 gathers every
    ``(count, vowel)`` pair under a single key and emits the largest one.
    """

    # Not used by any step of this job; kept so the class's public
    # attributes stay unchanged.
    WORD_RE = re.compile(r"[\w']+")

    def mapper_get_vowels(self, _, line):
        """Yield ``(vowel, 1)`` for every vowel found in ``line``.

        The input key is ignored. The line is stripped and lower-cased
        first, so upper-case vowels are counted together with their
        lower-case form.
        """
        for char in line.strip().lower():
            if char in 'aeiou':
                yield char, 1

    def combiner_count_vowels(self, char, counts):
        """Yield ``(char, partial_total)`` by summing the counts seen so far."""
        # Optimization: pre-aggregate the per-vowel counts before the reducer.
        yield (char, sum(counts))

    def reducer_count_vowels(self, char, counts):
        """Yield ``(None, (total, char))`` for one vowel.

        Every vowel is emitted under the same key (``None``) so that all
        ``(total, char)`` pairs reach the same reducer in the next step.
        The total comes first so that ``max()`` compares by count.
        """
        yield None, (sum(counts), char)

    def reducer_find_max_vowels(self, _, char_count_pairs):
        """Yield the largest ``(count, char)`` pair as key=count, value=char.

        The incoming key is discarded; it is just ``None``. Each item of
        ``char_count_pairs`` is a ``(count, char)`` pair, so equal counts are
        resolved in favour of the alphabetically later vowel.
        """
        yield max(char_count_pairs)

    def steps(self):
        """Return the two steps of the job, in execution order."""
        # MRStep (already imported by this file) replaces the legacy
        # self.mr() helper, which current mrjob releases no longer provide.
        return [
            MRStep(mapper=self.mapper_get_vowels,
                   combiner=self.combiner_count_vowels,
                   reducer=self.reducer_count_vowels),
            MRStep(reducer=self.reducer_find_max_vowels),
        ]


# Launch the job only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    MRVowelsCount.run()

### Execute the code

In [None]:
# Run the job script on the input file; stdout is redirected to $myoutput
# and stderr to $mylog.
! python $myscript $myinput 1> $myoutput 2> $mylog

### Print Output

In [None]:
# Display the contents of the job's output file.
%cat $myoutput