# Map Reduce

## Write the Mapper

In [1]:
%%writefile mapper.py
#!/usr/bin/python3
"""Word-count mapper: for every whitespace-separated token read from stdin,
write one record "<token><TAB>1" to stdout."""
import sys

for record in sys.stdin:
    # split() without a separator already discards leading/trailing
    # whitespace (including the newline), so no explicit strip is needed.
    for token in record.split():
        print(token, 1, sep="\t")

Writing mapper.py


## Write the Reducer

In [2]:
%%writefile reducer.py
#!/usr/bin/python3
"""Word-count reducer: sum the counts of consecutive "<word><TAB><count>"
records read from stdin and write one "<word><TAB><total>" line per word.

Only *adjacent* records with the same word are merged, so the input must
arrive grouped by word (Hadoop Streaming sorts reducer input by key).
"""

import sys


def reduce_counts(lines):
    """Yield ``(word, total)`` for each run of consecutive equal words.

    Args:
        lines: iterable of strings shaped like "<word>\\t<count>"; surrounding
            whitespace (including the trailing newline) is ignored.

    Yields:
        Tuples ``(word, total)`` in input order, where ``total`` is the sum of
        the integer counts of the run.

    Lines that do not contain a tab, or whose count is not an integer, are
    skipped on purpose (best-effort parsing); they never affect a total.
    """
    current_word = None
    current_count = 0
    for line in lines:
        line = line.strip()
        try:
            word, count = line.split('\t', 1)
            # convert count (currently a string) to int
            count = int(count)
        except ValueError:
            # no tab in the line, or count was not a number: discard the line
            continue
        if word == current_word:
            current_count += count
        else:
            # A new word starts: emit the finished run (if there was one).
            if current_word is not None:
                yield current_word, current_count
            current_word = word
            current_count = count

    # Flush the last run. Testing against None (rather than comparing with the
    # most recently parsed word) means that empty input emits nothing and that
    # a malformed final line cannot suppress the last total.
    if current_word is not None:
        yield current_word, current_count


if __name__ == "__main__":
    for word, total in reduce_counts(sys.stdin):
        print(f"{word}\t{total}")


Writing reducer.py


## Permission

In [17]:
# Set the execute bit on the scripts so they can be launched directly through
# their "#!/usr/bin/python3" shebang line. Note: the glob matches every .py
# file in the current directory, not only mapper.py and reducer.py.
!chmod 755 *.py

## Word Count with holmes.txt

In [12]:
# Print the working directory; %%writefile created mapper.py and reducer.py
# relative to it.
!pwd

/home/hadoop/BDLC_FS22/V3/resources/wordcount_map_reduce


In [5]:
# List the HDFS root directory to see which top-level directories exist.
!hdfs dfs -ls /

Found 6 items
drwxr-xr-x   - hadoop supergroup          0 2022-03-06 13:14 /dataset
drwxr-xr-x   - hadoop supergroup          0 2022-03-02 13:45 /own_word_count
drwxr-xr-x   - hadoop supergroup          0 2022-03-02 13:47 /own_word_count_small_file
drwxr-xr-x   - hadoop supergroup          0 2022-03-02 10:38 /test
drwxrwx---   - hadoop supergroup          0 2022-03-02 07:41 /tmp
drwxr-xr-x   - hadoop supergroup          0 2022-03-02 07:39 /user


In [16]:
# Remove the output directory of a previous run.
# -f: do not report an error when the directory does not exist (first run).
!hdfs dfs -rm -r -f /own_word_count

Deleted /own_word_count


In [None]:
# Run the word count on holmes.txt with Hadoop Streaming.
# -files ships mapper.py and reducer.py (from the current working directory)
# to every task's working directory, so -mapper/-reducer refer to them by
# file name instead of a path that would have to exist on each node.
!hadoop jar ~/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.1.jar \
-files mapper.py,reducer.py \
-mapper mapper.py \
-reducer reducer.py \
-input /dataset/text/holmes.txt \
-output /own_word_count

## Word Count with gutenberg_all.txt

In [22]:
# List the files under /dataset/text/ together with their sizes in bytes.
!hdfs dfs -ls /dataset/text/

Found 3 items
-rw-r--r--   1 hadoop supergroup 10823456892 2022-03-06 13:15 /dataset/text/gutenberg_all.txt
-rw-r--r--   1 hadoop supergroup      607430 2022-03-06 13:14 /dataset/text/holmes.txt
-rw-r--r--   1 hadoop supergroup         342 2022-03-06 13:14 /dataset/text/small.txt


In [19]:
# Remove the output directory of a previous run.
# -f: do not report an error when the directory does not exist (first run).
!hdfs dfs -rm -r -f /own_word_count

Deleted /own_word_count


In [23]:
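# While the job below runs, watch CPU/memory with `htop` and disk usage with `df -h`
# in a separate terminal. For example, to refresh the disk usage every second:
# while true; do df -h | grep -e "Filesystem\|/$\|/data"; sleep 1; clear; done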

In [24]:
# Run the word count on gutenberg_all.txt with Hadoop Streaming.
# -files ships mapper.py and reducer.py (from the current working directory)
# to every task's working directory, so -mapper/-reducer refer to them by
# file name instead of a path that would have to exist on each node.
!hadoop jar ~/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.1.jar \
-files mapper.py,reducer.py \
-mapper mapper.py \
-reducer reducer.py \
-input /dataset/text/gutenberg_all.txt \
-output /own_word_count

packageJobJar: [/tmp/hadoop-unjar4492664429248819026/] [] /tmp/streamjob13892874850664547949.jar tmpDir=null
2022-03-07 12:08:46,144 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at /0.0.0.0:8032
2022-03-07 12:08:46,311 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at /0.0.0.0:8032
2022-03-07 12:08:46,534 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1646217482405_0011
2022-03-07 12:08:46,892 INFO mapred.FileInputFormat: Total input files to process : 1
2022-03-07 12:08:46,975 INFO mapreduce.JobSubmitter: number of splits:81
2022-03-07 12:08:47,153 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1646217482405_0011
2022-03-07 12:08:47,153 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-03-07 12:08:47,341 INFO conf.Configuration: resource-types.xml not found
2022-03-07 12:08:47,341 INFO resource.ResourceUtils: Unable to find 'resour