In [1]:
import os
from scipy.ndimage import imread
import codecs
import json
from tqdm import tqdm

input_dir = 'Cifar Pictures/'
output_file = 'cifar_data.txt'

with codecs.open(output_file, 'a+', encoding='utf-8') as f:
    for image_name in tqdm(os.listdir(input_dir)):
        image_data = imread(input_dir + image_name)
        image_dict = {image_name: image_data.tolist()}
    
        json.dump(image_dict, f, separators=(',', ':'))
        f.write('\n')

100%|███████████████████████████████████████████████████████████| 50000/50000 [12:29<00:00, 66.73it/s]


In [2]:
!wc -l $output_file
print(len(os.listdir(input_dir)))

50000 cifar_data.txt
50000


In [3]:
%%writefile cifar_map_reduce.py
import json
import numpy as np
from mrjob.job import MRJob


class MyMR(MRJob):
    
    def mapper(self, _, line):
        try:
            image_dict = json.loads(line)
            ((image_name, image_data),) = image_dict.items()            
        except:
            assert False, 'something went wrong' # print statements go into mapper output in MRJob
            return

        color_averages = np.array(image_data).mean(axis=(0, 1))
        max_color_channel = np.argmax(color_averages)
        yield (int(max_color_channel), (image_name, color_averages[max_color_channel]))
        # key has to be int, not np.int64
    
    def reducer(self, max_color_channel, max_color_intensities):
        yield max_color_channel, sorted(max_color_intensities, key=lambda tup: -tup[1])

        
if __name__ == '__main__': 
    MyMR.run()

Overwriting cifar_map_reduce.py


In [4]:
%%time
!python cifar_map_reduce.py < cifar_data.txt > temp_MRJob.txt

Wall time: 1min 3s


No configs found; falling back on auto-configuration
Creating temp directory c:\users\eugene\appdata\local\temp\cifar_map_reduce.Eugene.20170717.030616.851000
Running step 1 of 1...
reading from STDIN
Streaming final output from c:\users\eugene\appdata\local\temp\cifar_map_reduce.Eugene.20170717.030616.851000\output...
Removing temp directory c:\users\eugene\appdata\local\temp\cifar_map_reduce.Eugene.20170717.030616.851000...


In [5]:
color_channels_and_pictures = dict()
with open('temp_MRJob.txt') as f:
    for line in f:
        color_channel, images_and_intensities = line.split('\t')
        images_and_intensities = eval(images_and_intensities)
        image_names = [image_name for image_name, color_intensity in images_and_intensities]        
        color_channels_and_pictures[color_channel] = image_names

In [6]:
import matplotlib.pyplot as plt
%matplotlib
from scipy.ndimage import imread
import numpy as np

def plots(ims, interp=False, titles=None):
    ims=np.array(ims)
    mn,mx=ims.min(),ims.max()
    f = plt.figure(figsize=(12,24))
    for i in range(len(ims)):
        sp=f.add_subplot(1, len(ims), i+1)
        if not titles is None: sp.set_title(titles[i], fontsize=18)
        plt.imshow(ims[i], interpolation=None if interp else 'none', vmin=mn,vmax=mx)

def plot(im, interp=False):
    f = plt.figure(figsize=(3,6), frameon=True)
    plt.imshow(im, interpolation=None if interp else 'none')

plt.gray()
plt.close()

Using matplotlib backend: Qt4Agg


In [None]:
input_dir = 'Cifar Pictures/'

channel_images = []
for image_name in color_channels_and_pictures['1'][:20]: # key becomes string
    channel_images.append(imread(input_dir + image_name))

plots(channel_images)

# Manual Map and Reduce Steps

In [1]:
%%writefile EMR_mapper.py
import sys
import json
import numpy as np

for line in sys.stdin:
    image_dict = json.loads(line)    
    try:
        image_dict = json.loads(line)
        ((image_name, image_data),) = image_dict.items()            
    except:
        assert False, 'something went wrong'

    color_averages = np.array(image_data).mean(axis=(0, 1))
    max_color_channel = np.argmax(color_averages)
    print("{}\t{}\t{}".format(
        int(max_color_channel), 
        image_name, 
        color_averages[max_color_channel]))
        # key has to be int, not np.int64

Writing EMR_mapper.py


In [2]:
%%writefile EMR_reducer.py
import sys

current_color_channel = None
image_names_color_intensities = None

for line in sys.stdin: # everything read in is a string!
    max_color_channel, image_name, max_color_intensity = line.strip().split('\t')
    if current_color_channel == max_color_channel:
        image_names_color_intensities.append(
            (image_name, float(max_color_intensity))
            )
    else:
        if current_color_channel: # if channel changes, print results
            print("{}\t{}".format(
                current_color_channel, 
                sorted(image_names_color_intensities, key=lambda tup: -tup[1])
                ))
        current_color_channel = max_color_channel
        image_names_color_intensities = []
        image_names_color_intensities.append(
            (image_name, float(max_color_intensity))
        )
        
if current_color_channel: # for last color channel
    print("{}\t{}".format(current_color_channel, sorted(
                image_names_color_intensities, key=lambda tup: -tup[1])
    ))

Writing EMR_reducer.py


In [3]:
%%time
!python EMR_mapper.py < cifar_data.txt > temp1.txt
!sort temp1.txt > temp2.txt
!python EMR_reducer.py < temp2.txt > temp3.txt

Wall time: 1min 29s


# Check if MRJob and manual map-reduce results match

In [1]:
color_channels_and_pictures = dict()
with open('temp_MRJob.txt') as f:
    for line in f:
        color_channel, images_and_intensities = line.split('\t')
        images_and_intensities = eval(images_and_intensities)
        image_names = [image_name for image_name, color_intensity in images_and_intensities]        
        color_channels_and_pictures[color_channel] = image_names

In [2]:
color_channels_and_pictures_manual = dict()
with open('temp3.txt') as f:
    for line in f:
        color_channel, images_and_intensities = line.split('\t')
        images_and_intensities = eval(images_and_intensities)
        image_names = [image_name for image_name, color_intensity in images_and_intensities]        
        color_channels_and_pictures_manual[color_channel] = image_names

In [3]:
print(color_channels_and_pictures == color_channels_and_pictures_manual)

True


# Tutorial

In [1]:
%%writefile mr_word_count.py

from mrjob.job import MRJob


class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        yield "chars", len(line)
        yield "words", len(line.split())
        yield "lines", 1

    def reducer(self, key, values):
        yield key, sum(values)


if __name__ == '__main__':
    MRWordFrequencyCount.run()

Writing mr_word_count.py


In [2]:
!python mr_word_count.py < shakespeare.txt

"chars"	5333743
"lines"	124456
"words"	901325


No configs found; falling back on auto-configuration
Creating temp directory c:\users\eugene\appdata\local\temp\mr_word_count.Eugene.20170717.031112.880000
Running step 1 of 1...
reading from STDIN
Streaming final output from c:\users\eugene\appdata\local\temp\mr_word_count.Eugene.20170717.031112.880000\output...
Removing temp directory c:\users\eugene\appdata\local\temp\mr_word_count.Eugene.20170717.031112.880000...


In [3]:
%%writefile mr_word_counter.py
from mrjob.job import MRJob


class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        for word in line.lower().split():
            yield (word, 1)

    def combiner(self, word, aggregated_counts):
        yield word, sum(aggregated_counts)

    def reducer(self, key, count):
        yield key, sum(count)


if __name__ == '__main__':
    MRWordFrequencyCount.run()

Writing mr_word_counter.py


In [4]:
%%time
!python mr_word_counter.py < shakespeare.txt > temp_shakespeare_counter_results.txt
!cat temp_shakespeare_counter_results.txt | sort --key 2nr -n | head -20
# sort by second key in reverse order

No configs found; falling back on auto-configuration
Creating temp directory c:\users\eugene\appdata\local\temp\mr_word_counter.Eugene.20170717.031118.295000
Running step 1 of 1...
reading from STDIN
Streaming final output from c:\users\eugene\appdata\local\temp\mr_word_counter.Eugene.20170717.031118.295000\output...
Removing temp directory c:\users\eugene\appdata\local\temp\mr_word_counter.Eugene.20170717.031118.295000...


"the"	27549
"and"	26037
"i"	19540
"to"	18700
"of"	18010
"a"	14383
"my"	12455
"in"	10671
"you"	10630
"that"	10487
"is"	9145
"for"	7982
"with"	7931
"not"	7643
"your"	6871
"his"	6749
"be"	6700
"but"	5886
"he"	5884
"as"	5882
Wall time: 12.2 s


sort: write failed: 'standard output'
sort: write error


In [5]:
from collections import Counter

counter_manual = Counter()
with open('shakespeare.txt') as f:
    for line in f:
        counter_manual.update(line.lower().split())

print(counter_manual.most_common()[:10])

[('the', 27549), ('and', 26037), ('i', 19540), ('to', 18700), ('of', 18010), ('a', 14383), ('my', 12455), ('in', 10671), ('you', 10630), ('that', 10487)]


In [6]:
counter_mapreduce = Counter()

with open('temp_shakespeare_counter_results.txt') as f:
    for line in f:
        word, count = line.strip().split('\t')
        counter_mapreduce[word.strip('"')] = int(count)

print(counter_mapreduce.most_common()[:10])

[('the', 27549), ('and', 26037), ('i', 19540), ('to', 18700), ('of', 18010), ('a', 14383), ('my', 12455), ('in', 10671), ('you', 10630), ('that', 10487)]


In [7]:
(counter_manual - counter_mapreduce).most_common()[:10]
# close enough!

[('"', 241),
 ('"a', 4),
 ('"i', 4),
 ('sail!"', 3),
 ('print!"', 3),
 ('"small', 3),
 ('"caesar."', 2),
 ('"thus', 2),
 ('"fear', 2),
 ('"give', 2)]