# Word Count Tutorial

In [1]:
%%writefile mr_wc.py

from mrjob.job import MRJob


class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        yield "chars", len(line)
        yield "words", len(line.split())
        yield "lines", 1

    def reducer(self, key, values):
        yield key, sum(values)


if __name__ == '__main__':
    MRWordFrequencyCount.run()

Overwriting mr_wc.py


In [2]:
%%time
!python mr_wc.py < shakespeare.txt

"chars"	5333743
"lines"	124456
"words"	901325
Wall time: 8.03 s


No configs found; falling back on auto-configuration
Creating temp directory c:\users\eugene\appdata\local\temp\mr_wc.Eugene.20170718.044612.394000
Running step 1 of 1...
reading from STDIN
Streaming final output from c:\users\eugene\appdata\local\temp\mr_wc.Eugene.20170718.044612.394000\output...
Removing temp directory c:\users\eugene\appdata\local\temp\mr_wc.Eugene.20170718.044612.394000...


# Actual Word Count Tutorial

In [3]:
%%writefile mr_word_counter.py
from mrjob.job import MRJob


class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        for word in line.lower().split():
            yield (word, 1)

    def combiner(self, word, aggregated_counts):
        yield word, sum(aggregated_counts)

    def reducer(self, key, count):
        yield key, sum(count)


if __name__ == '__main__':
    MRWordFrequencyCount.run()

Overwriting mr_word_counter.py


In [4]:
%%time
!python mr_word_counter.py < shakespeare.txt > temp_shakespeare_counter_results.txt

# sort by second key in reverse order
!cat temp_shakespeare_counter_results.txt | sort --key 2nr -n | head -20


No configs found; falling back on auto-configuration
Creating temp directory c:\users\eugene\appdata\local\temp\mr_word_counter.Eugene.20170718.044620.432000
Running step 1 of 1...
reading from STDIN
Streaming final output from c:\users\eugene\appdata\local\temp\mr_word_counter.Eugene.20170718.044620.432000\output...
Removing temp directory c:\users\eugene\appdata\local\temp\mr_word_counter.Eugene.20170718.044620.432000...


"the"	27549
"and"	26037
"i"	19540
"to"	18700
"of"	18010
"a"	14383
"my"	12455
"in"	10671
"you"	10630
"that"	10487
"is"	9145
"for"	7982
"with"	7931
"not"	7643
"your"	6871
"his"	6749
"be"	6700
"but"	5886
"he"	5884
"as"	5882
Wall time: 14.7 s


sort: write failed: 'standard output'
sort: write error


In [5]:
from collections import Counter

counter_manual = Counter()
with open('shakespeare.txt') as f:
    for line in f:
        counter_manual.update(line.lower().split())

print(counter_manual.most_common()[:10])

[('the', 27549), ('and', 26037), ('i', 19540), ('to', 18700), ('of', 18010), ('a', 14383), ('my', 12455), ('in', 10671), ('you', 10630), ('that', 10487)]


In [6]:
counter_mrjob = Counter()

with open('temp_shakespeare_counter_results.txt') as f:
    for line in f:
        word, count = line.strip().split('\t')
        counter_mrjob[word.strip('"')] = int(count)

print(counter_mrjob.most_common()[:10])

[('the', 27549), ('and', 26037), ('i', 19540), ('to', 18700), ('of', 18010), ('a', 14383), ('my', 12455), ('in', 10671), ('you', 10630), ('that', 10487)]


In [7]:
(counter_manual - counter_mrjob).most_common()[:10]
# close enough!

[('"', 241),
 ('"a', 4),
 ('"i', 4),
 ('sail!"', 3),
 ('print!"', 3),
 ('"small', 3),
 ('"caesar."', 2),
 ('"thus', 2),
 ('"fear', 2),
 ('"give', 2)]

# EMR Word Counter

In [1]:
%%writefile EMR_word_counter_mapper.py
#!/usr/bin/python

import sys

for line in sys.stdin:
    words = line.strip().lower().split()
    for word in words:
            print("{}\t{}".format(word, 1))

Overwriting EMR_word_counter_mapper.py


In [2]:
%%writefile EMR_word_counter_reducer.py
#!/usr/bin/python

import sys

current_word = None
current_count = None

for line in sys.stdin:
    word, count = line.split('\t')
    count = int(count)
    if current_word == word:
        current_count += 1
    else:
        if current_word:
            print("{}\t{}".format(current_word, current_count))
        current_word = word
        current_count = 1
if current_word:
    print("{}\t{}".format(current_word, current_count))

Overwriting EMR_word_counter_reducer.py


In [3]:
!dos2unix EMR_word_counter_mapper.py EMR_word_counter_mapper.py
!dos2unix EMR_word_counter_reducer.py EMR_word_counter_reducer.py

dos2unix: converting file EMR_word_counter_mapper.py to Unix format...
dos2unix: converting file EMR_word_counter_mapper.py to Unix format...
dos2unix: converting file EMR_word_counter_reducer.py to Unix format...
dos2unix: converting file EMR_word_counter_reducer.py to Unix format...


In [4]:
%%time
!cat shakespeare.txt | python EMR_word_counter_mapper.py | sort | python EMR_word_counter_reducer.py > temp_shakespeare_manual_counter.txt

Wall time: 8.42 s


In [5]:
from collections import Counter

counter_EMR_manual = Counter()

with open('temp_shakespeare_manual_counter.txt') as f:
    for line in f:
        word, count = line.strip().split('\t')
        counter_EMR_manual[word.strip('"')] = int(count)

print(counter_EMR_manual.most_common()[:10])

[('the', 27549), ('and', 26037), ('i', 19540), ('to', 18700), ('of', 18010), ('a', 14383), ('my', 12455), ('in', 10671), ('you', 10630), ('that', 10487)]


# Download EMR results from S3

In [1]:
import boto

AWS_credentials = {}
with open('rootkey.csv') as f:
    for line in f:
        if 'AWSAccessKeyId' in line:
            AWS_credentials['aws_access_key_id'] = line.strip().split('=')[1]
        elif 'AWSSecretKey' in line:
            AWS_credentials['aws_secret_access_key'] = line.strip().split('=')[1]

conn = boto.connect_s3(**AWS_credentials)
bucket = conn.get_bucket('map-reduce-practice')

In [2]:
import os

output_dir = 'output_data/word_counter/'

if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for key in bucket.list():
    if output_dir in key.key:
        print key.key
        file_name = key.key.split('/')[-1]
        key.get_contents_to_filename(output_dir + file_name)

output_data/word_counter/_SUCCESS
output_data/word_counter/part-00000
output_data/word_counter/part-00001
output_data/word_counter/part-00002
output_data/word_counter/part-00003
output_data/word_counter/part-00004
output_data/word_counter/part-00005
output_data/word_counter/part-00006


# Check if manual map-reduce, manual EMR, real EMR results match

In [1]:
from collections import Counter

counter_manual = Counter()
with open('shakespeare.txt') as f:
    for line in f:
        counter_manual.update(line.lower().split())

In [2]:
counter_EMR_manual = Counter()
with open('temp_shakespeare_manual_counter.txt') as f:
    for line in f:
        word, count = line.strip().split('\t')
        counter_EMR_manual[word] += int(count)

In [3]:
import os

output_dir = 'output_data/word_counter/'
counter_EMR = Counter()

for file_name in os.listdir(output_dir):
    with open(output_dir + file_name) as f:
        for line in f:
            word, count = line.strip().split('\t')
            counter_EMR[word] += int(count)

In [4]:
(counter_manual == counter_EMR_manual) and (counter_EMR_manual == counter_EMR) # all are identical

True