__`Most of the time, you’ll need more than one step in your job. To define multiple steps, override steps() and return a list of mrjob.step.MRStep objects.`__

__`Here’s a job that finds the most commonly used word in the input:`__

In [None]:
# %load most_used.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

# Matches a run of one or more word characters (\w) and/or apostrophes,
# so a contraction such as "don't" is kept as a single token.
WORD_RE = re.compile(r"[\w']+")


class MRMostUsedWord(MRJob):
    """Two-step job that reports the most frequently used word in the input.

    Step 1 counts occurrences of every (lower-cased) word; step 2 picks the
    word with the highest count.
    """

    def steps(self):
        """Return the list of steps that make up this job, in run order."""
        count_step = MRStep(
            mapper=self.mapper_get_words,
            combiner=self.combiner_count_words,
            reducer=self.reducer_count_words,
        )
        pick_max_step = MRStep(reducer=self.reducer_find_max_word)
        return [count_step, pick_max_step]

    def mapper_get_words(self, _, line):
        """Yield ``(word, 1)`` for every WORD_RE match in ``line``.

        The incoming key is ignored; words are lower-cased before being
        emitted.
        """
        for match in WORD_RE.finditer(line):
            yield (match.group(0).lower(), 1)

    def combiner_count_words(self, word, counts):
        """Yield ``(word, partial_total)`` for the counts seen so far.

        This is only an optimization: it pre-sums counts before they reach
        the reducer.
        """
        partial_total = sum(counts)
        yield (word, partial_total)

    def reducer_count_words(self, word, counts):
        """Yield ``(None, (total, word))`` for one word.

        Every pair is emitted under the same key (None) so that they all
        arrive at the same reducer in the next step. The count is placed
        first so the pairs can be compared directly with Python's max().
        """
        total = sum(counts)
        yield None, (total, word)

    def reducer_find_max_word(self, _, word_count_pairs):
        """Yield the largest ``(count, word)`` pair.

        The incoming key is discarded; it is just None. Each item of
        ``word_count_pairs`` is (count, word), so yielding one of them
        results in key=count, value=word.
        """
        most_used = max(word_count_pairs)
        yield most_used


if __name__ == '__main__':
    # Launch the job only when this file is executed as a script.
    MRMostUsedWord.run()

In [3]:
!python most_used.py -r hadoop  hdfs:///user/ashu/pg4300.txt

no configs found; falling back on auto-configuration
no configs found; falling back on auto-configuration
creating tmp directory /tmp/most_used.root.20160322.165841.848541
writing wrapper script to /tmp/most_used.root.20160322.165841.848541/setup-wrapper.sh
Using Hadoop version 2.7.2
Copying local files into hdfs:///user/root/tmp/mrjob/most_used.root.20160322.165841.848541/files/

PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols

HADOOP: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
HADOOP: packageJobJar: [/tmp/hadoop-unjar2443486007967025815/] [] /tmp/streamjob1360210150763941474.jar tmpDir=null
HADOOP: Connecting to ResourceManager at /10.211.55.101:8032
HADOOP: Connecting to ResourceManager at /10.211.55.101:8032
HADOOP: Total input pat