#DATASCI W261: Machine Learning at Scale

# Write some words to a file

In [13]:
!echo "foo foo quux labs foo bar jimi quux jimi jimi" > WordCount.txt
!echo "foo  jimi jimi" >> WordCount.txt
!echo "data mining is data science" >> WordCount.txt


# MrJob class for wordcount

In [14]:
%%writefile WordCount.py
from mrjob.job import MRJob
# from mrjob.step import MRStep
import re
 
WORD_RE = re.compile(r"[\w']+")
 
class MRWordFreqCount(MRJob):
    def mapper(self, _, line):
        for word in WORD_RE.findall(line):
            yield word.lower(), 1
     
    def combiner(self, word, counts):
        yield word, sum(counts)

    #hello, (1,1,1,1,1,1): using a combiner? NO and YEs
    def reducer(self, word, counts):
        yield word, sum(counts)

if __name__ == '__main__':
    MRWordFreqCount.run()

Overwriting WordCount.py


In [15]:
!python WordCount.py  WordCount.txt

No configs found; falling back on auto-configuration
Creating temp directory /var/folders/mq/yly0yqf16wggskk8bwchkhjmkztgr3/T/WordCount.z001gyq.20160614.180851.220183
Running step 1 of 1...
Streaming final output from /var/folders/mq/yly0yqf16wggskk8bwchkhjmkztgr3/T/WordCount.z001gyq.20160614.180851.220183/output...
"bar"	1
"data"	2
"foo"	4
"is"	1
"jimi"	5
"labs"	1
"mining"	1
"quux"	2
"science"	1
Removing temp directory /var/folders/mq/yly0yqf16wggskk8bwchkhjmkztgr3/T/WordCount.z001gyq.20160614.180851.220183...


In [16]:
!python WordCount.py  WordCount.txt --output-dir mrJobOutput

No configs found; falling back on auto-configuration
Running step 1 of 1...
Creating temp directory /var/folders/mq/yly0yqf16wggskk8bwchkhjmkztgr3/T/WordCount.z001gyq.20160614.180853.764892
Streaming final output from mrJobOutput...
"bar"	1
"data"	2
"foo"	4
"is"	1
"jimi"	5
"labs"	1
"mining"	1
"quux"	2
"science"	1
Removing temp directory /var/folders/mq/yly0yqf16wggskk8bwchkhjmkztgr3/T/WordCount.z001gyq.20160614.180853.764892...


In [17]:
!ls -l mrJobOutput
!cat mrJobOutput/part-00000

total 16
-rw-r--r--  1 z001gyq  staff  41 Jun 14 13:08 part-00000
-rw-r--r--  1 z001gyq  staff  41 Jun 14 13:08 part-00001
"bar"	1
"data"	2
"foo"	4
"is"	1
"jimi"	5


In [20]:
%%writefile WordCount.py
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
 
WORD_RE = re.compile(r"[\w']+")
 
class MRWordFreqCount(MRJob):
    SORT_VALUES = True
    def mapper(self, _, line):
        for word in WORD_RE.findall(line):
            yield word.lower(), 1
            
    def jobconfqqqq(self):  #assume we had second job to sort the word counts in decreasing order of counts
        orig_jobconf = super(MRWordFreqCount, self).jobconf()        
        custom_jobconf = {  #key value pairs
            'mapred.output.key.comparator.class': 'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-k2,2nr',
            'mapred.reduce.tasks': '1',
        }
        combined_jobconf = orig_jobconf
        combined_jobconf.update(custom_jobconf)
        self.jobconf = combined_jobconf
        return combined_jobconf


    def combiner(self, word, counts):
        yield word, sum(counts)

    def reducer(self, word, counts):
        yield word, sum(counts)

    def steps(self):
        return [MRStep(
                mapper = self.mapper, 
#                #combiner = self.combiner,
                reducer = self.reducer,
                #,
#                jobconf = self.jobconfqqqq
 
#            jobconf = {'mapred.output.key.comparator.class': 'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
#                       'mapred.text.key.comparator.options':'-k1r',
#                       'mapred.reduce.tasks' : 1}   
       
        
            )]
     


if __name__ == '__main__':
    MRWordFreqCount.run()

Overwriting WordCount.py


#Run the code in command line locally

In [21]:
!python WordCount.py --jobconf -numReduceTasks=3 WordCount.txt --output-dir mrJobOutput


# mr_your_job.py --jobconf mapred.map.tasks=23 --jobconf 
#> mapred.reduce.tasks=42 

No configs found; falling back on auto-configuration
ignoring partitioner keyword arg (requires real Hadoop): 'org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'
Running step 1 of 1...
Creating temp directory /var/folders/mq/yly0yqf16wggskk8bwchkhjmkztgr3/T/WordCount.z001gyq.20160614.180913.070111
Streaming final output from mrJobOutput...
"bar"	1
"data"	2
"foo"	4
"is"	1
"jimi"	5
"labs"	1
"mining"	1
"quux"	2
"science"	1
Removing temp directory /var/folders/mq/yly0yqf16wggskk8bwchkhjmkztgr3/T/WordCount.z001gyq.20160614.180913.070111...


In [22]:
!ls mrJobOutput/*
!cat mrJobOutput/part-00000

mrJobOutput/part-00000 mrJobOutput/part-00001
"bar"	1
"data"	2
"foo"	4
"is"	1
"jimi"	5


The code above is straightforward. Mapper outputs (word, 1) key value pairs, and then combiner combines the sum locally. Lastly, Reducer sums them up. 

# Run the code through python driver locally

####  Reminder: You cannot use the programmatic runner functionality in the same file as your job class. That is because the file with the job class is sent to Hadoop to be run. Therefore, the job file cannot attempt to start the Hadoop job, or you would be recursively creating Hadoop jobs!

Use make_runner() to run an MRJob
1. seperate driver from mapreduce jobs
2. now we can run it within python notebook 
3. In python, typically one class is in each file. Each mrjob job is a seperate class, should be in a seperate file

In [23]:
%reload_ext autoreload
%autoreload 2
from WordCount import MRWordFreqCount
mr_job = MRWordFreqCount(args=['WordCount.txt'])
with mr_job.make_runner() as runner: 
    runner.run()
    # stream_output: get access of the output 
    for line in runner.stream_output():
        print mr_job.parse_output_line(line)

('bar', 1)
('data', 2)
('foo', 4)
('is', 1)
('jimi', 5)
('labs', 1)
('mining', 1)
('quux', 2)
('science', 1)


# Run the code in command line in AWS
- Check you configration file path
- Create .mrjob.conf
- Put your access key info in configuration  file


In [24]:
#.mrjob.conf is on my Dropbox Slides/AWS
from mrjob import conf 
conf.find_mrjob_conf()

##Create or replace .mrjob.conf file

# Run the code in command line in AWS

In [25]:
!python WordCount.py WordCount.txt -r emr

No configs found; falling back on auto-configuration
Traceback (most recent call last):
  File "WordCount.py", line 51, in <module>
    MRWordFreqCount.run()
  File "/Users/z001gyq/anaconda/envs/py27/lib/python2.7/site-packages/mrjob/job.py", line 430, in run
    mr_job.execute()
  File "/Users/z001gyq/anaconda/envs/py27/lib/python2.7/site-packages/mrjob/job.py", line 448, in execute
    super(MRJob, self).execute()
  File "/Users/z001gyq/anaconda/envs/py27/lib/python2.7/site-packages/mrjob/launch.py", line 160, in execute
    self.run_job()
  File "/Users/z001gyq/anaconda/envs/py27/lib/python2.7/site-packages/mrjob/launch.py", line 228, in run_job
    with self.make_runner() as runner:
  File "/Users/z001gyq/anaconda/envs/py27/lib/python2.7/site-packages/mrjob/job.py", line 471, in make_runner
    return super(MRJob, self).make_runner()
  File "/Users/z001gyq/anaconda/envs/py27/lib/python2.7/site-packages/mrjob/launch.py", line 171, in make_runner
    return EMRJobRunner(**self.emr_jo