# Word Count Sorting

In [None]:
%%writefile wc.py

#!/usr/bin/python3
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class MyJob(MRJob):
    """Two-step word count whose second step re-keys the result by count.

    Step 1 counts how often each normalised word occurs (with a combiner
    to pre-aggregate). Step 2 swaps every ``(word, count)`` pair to
    ``(count, word)`` and configures a key comparator so the records can
    be ordered by count.
    """

    # Compiled once: matches every character that is neither a word
    # character nor whitespace, i.e. what gets stripped from a token.
    _NON_WORD_CHARS = re.compile(r'[^\w\s]')

    def steps(self):
        """Return the counting step followed by the sorting step."""
        # Key comparator settings for step 2 only; '-nr' requests a
        # numeric, reversed comparison of the keys (the counts).
        # NOTE(review): a single globally ordered output presumably also
        # needs step 2 to run with one reducer -- confirm cluster default.
        JOBCONF_STEP2 = {
            'mapred.output.key.comparator.class':'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options':'-nr',
        }
        return [
            MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer=self.reducer),
            MRStep(jobconf=JOBCONF_STEP2, mapper=self.mapper_sort, reducer=self.reducer_sort)]

    def mapper(self, _, line):
        """Yield ``(word, 1)`` for every non-empty normalised word in ``line``.

        Tokens are split on whitespace, lower-cased, and stripped of
        punctuation. The input key is ignored.
        """
        # split() with no arguments already discards leading and trailing
        # whitespace, so no separate strip() is needed.
        for token in line.split():
            word = self._NON_WORD_CHARS.sub('', token.lower())
            # A token made only of punctuation (e.g. "--") is empty after
            # stripping; it is not a word and must not be counted.
            if word:
                yield word, 1

    def combiner(self, word, counts):
        """Pre-aggregate the partial counts of ``word``."""
        yield word, sum(counts)

    def reducer(self, word, values):
        """Yield ``word`` with the sum of all its partial counts."""
        yield word, sum(values)

    def mapper_sort(self, word, count):
        """Swap key and value so that the count becomes the key."""
        yield int(count), word

    def reducer_sort(self, counts, words):
        """Re-emit one ``(count, word)`` pair per word sharing that count."""
        for word in words:
            yield counts, word
        
# Start the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    MyJob.run()

In [None]:
!python wc.py /data/dataset/text/small.txt

In [None]:
!python wc.py -r hadoop hdfs:///dataset/text/small.txt

In [None]:
!python wc.py -r hadoop hdfs:///dataset/text/small.txt --output-dir hdfs:///results/wordcount/sorted/small --no-output

In [None]:
!python wc.py -r hadoop hdfs:///dataset/text/holmes.txt --output-dir hdfs:///results/wordcount/sorted/holmes --no-output

In [None]:
# Note: this run over the full Gutenberg dataset took 34 minutes.
!python wc.py -r hadoop hdfs:///dataset/text/gutenberg_all.txt --output-dir hdfs:///results/wordcount/sorted/gutenberg --no-output

## Another Solution (no combiner, number of reduce tasks in the first step set to 10)

In [None]:
%%writefile wcTuned.py

#!/usr/bin/python3
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class MyJob(MRJob):
    """Two-step word count without a combiner, re-keyed by count in step 2.

    Step 1 counts how often each normalised word occurs and sets
    ``mapreduce.job.reduces`` to 10. Step 2 swaps every ``(word, count)``
    pair to ``(count, word)`` and configures a key comparator so the
    records can be ordered by count.
    """

    # Compiled once: matches every character that is neither a word
    # character nor whitespace, i.e. what gets stripped from a token.
    _NON_WORD_CHARS = re.compile(r'[^\w\s]')

    def steps(self):
        """Return the counting step followed by the sorting step."""
        # Applies to the counting step only.
        JOBCONF_STEP1 = {
            'mapreduce.job.reduces': 10
        }
        # Key comparator settings for step 2 only; '-nr' requests a
        # numeric, reversed comparison of the keys (the counts).
        # NOTE(review): a single globally ordered output presumably also
        # needs step 2 to run with one reducer -- confirm cluster default.
        JOBCONF_STEP2 = {
            'mapred.output.key.comparator.class':'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options':'-nr',
        }
        return [
            MRStep(jobconf=JOBCONF_STEP1, mapper=self.mapper,      reducer=self.reducer),
            MRStep(jobconf=JOBCONF_STEP2, mapper=self.mapper_sort, reducer=self.reducer_sort)]

    def mapper(self, _, line):
        """Yield ``(word, 1)`` for every non-empty normalised word in ``line``.

        Tokens are split on whitespace, lower-cased, and stripped of
        punctuation. The input key is ignored.
        """
        # split() with no arguments already discards leading and trailing
        # whitespace, so no separate strip() is needed.
        for token in line.split():
            word = self._NON_WORD_CHARS.sub('', token.lower())
            # A token made only of punctuation (e.g. "--") is empty after
            # stripping; it is not a word and must not be counted.
            if word:
                yield word, 1

    def reducer(self, word, values):
        """Yield ``word`` with the sum of all its counts."""
        yield word, sum(values)

    def mapper_sort(self, word, count):
        """Swap key and value so that the count becomes the key."""
        yield int(count), word

    def reducer_sort(self, counts, words):
        """Re-emit one ``(count, word)`` pair per word sharing that count."""
        for word in words:
            yield counts, word
        
# Start the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    MyJob.run()

In [None]:
!python wcTuned.py -r hadoop hdfs:///dataset/text/gutenberg_all.txt --output-dir hdfs:///results/wordcount/sorted/gutenberg --no-output