# Hadoop Streaming assignment 3: Name Count

In [1]:
%%writefile mapper.py


import sys
import re

# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str <-> unicode conversions of non-ASCII article text do not fail.
# The import lives inside the branch because the `imp` module no longer
# exists on Python 3.12+, where an unconditional import would abort the
# script before it reads a single line.
if sys.version[0] == '2':
    from imp import reload
    reload(sys)  # re-exposes sys.setdefaultencoding, which site.py removes
    sys.setdefaultencoding("utf-8") # required to convert to unicode

def read_stopwords(file_path):
    """Load the stop-word list stored one word per line in ``file_path``.

    Each line is stripped of surrounding whitespace and lower-cased.

    Args:
        file_path: path of the text file to read.

    Returns:
        A set with one entry per distinct normalized line. A blank line in
        the file contributes the empty string to the set.
    """
    # The context manager closes the file deterministically instead of
    # leaving the handle to the garbage collector.
    with open(file_path) as stopword_file:
        return set(word.strip().lower() for word in stopword_file)

# All conditions must be met
# 1. The first character is not a digit (other characters can be digits).
# 2. The first character is uppercase, all the other characters that are letters are lowercase.

def is_name(word):
    """Tell whether ``word`` is capitalized like a proper name.

    A word qualifies when it has at least two characters, its first
    character is an uppercase letter, and the remainder is lowercase
    according to ``str.islower`` (so the remainder must contain at least
    one cased character, and none of them may be uppercase).

    Returns:
        True if every condition holds, False otherwise.
    """
    if len(word) < 2:
        return False
    head, tail = word[0], word[1:]
    return head.isalpha() and head.isupper() and tail.islower()

# `unicode` only exists on Python 2, where lines read from stdin are byte
# strings that must be decoded. On Python 3 stdin already yields text.
try:
    _to_text = unicode
except NameError:
    _to_text = str

# Non-word characters at the very beginning or end of the article text.
_EDGE_NON_WORD = re.compile(r"^\W+|\W+$", flags=re.UNICODE)
# A run of whitespace together with any non-word characters touching it.
_WORD_BREAK = re.compile(r"\W*\s+\W*", flags=re.UNICODE)


def map_line(line, stopwords):
    """Turn one ``article_id<TAB>text`` input line into mapper records.

    Args:
        line: raw input line as read from stdin.
        stopwords: collection of lower-cased words to leave out.

    Returns:
        A list of ``word<TAB>name_flag<TAB>1`` strings, one per remaining
        word, where ``word`` is lower-cased and ``name_flag`` is 1 when
        ``is_name`` accepts the original spelling and 0 otherwise. The list
        is empty for a malformed line or a line without usable words.
    """
    try:
        _article_id, text = _to_text(line.strip()).split('\t', 1)
    except ValueError as e:
        # Diagnostics go to stderr: under Hadoop Streaming everything
        # written to stdout becomes mapper output and would reach the
        # reducer as a bogus record.
        sys.stderr.write("Error in mapper.py: %s\n" % e)
        return []

    text = _EDGE_NON_WORD.sub("", text)

    records = []
    for word in _WORD_BREAK.split(text):
        lowered = word.lower()
        # Text made only of punctuation splits into a single empty token;
        # emitting it would produce a record with an empty key that the
        # downstream scripts cannot parse.
        if not lowered or lowered in stopwords:
            continue
        name_flag = int(is_name(word))
        records.append("%s\t%d\t%d" % (lowered, name_flag, 1))
    return records


if __name__ == "__main__":
    # The stop-word file is expected in the task's working directory.
    stopwords = read_stopwords("stop_words_en.txt")

    for line in sys.stdin:
        for record in map_line(line, stopwords):
            print(record)

Overwriting mapper.py


## Step 2. Create the reducer.

Create the reducer, which will accumulate the information after the mapper step. You may implement a combiner if you want. It can be useful for optimizing and speeding up your computations (see the lectures from Week 2 for more details).

In [2]:
%%writefile reducer.py

# Your code for reducer here.
import sys
# Python 2 only: make UTF-8 the default encoding. The import stays inside
# the branch because the `imp` module no longer exists on Python 3.12+.
if sys.version[0] == '2':
    from imp import reload
    reload(sys)
    sys.setdefaultencoding("utf-8") # required to convert to unicode


def reduce_counts(lines):
    """Sum the counters of consecutive records that share a key.

    Each input line has the form ``key<TAB>name_count<TAB>word_count``.
    Records with the same key must be adjacent, as they are after the
    shuffle/sort phase.

    Args:
        lines: iterable of raw input lines.

    Yields:
        ``(key, name_total, word_total)`` tuples, one per run of equal
        keys, in input order. Lines that cannot be parsed are reported on
        stderr and skipped.
    """
    current_key = None
    name_total = 0
    word_total = 0

    for line in lines:
        try:
            key, name_count, word_count = line.strip().split('\t', 2)
            name_count = int(name_count)
            word_count = int(word_count)
        except ValueError as e:
            # stdout is the data channel; when this script runs as the
            # combiner its stdout is fed back into the reducer, so error
            # text must never be written there.
            sys.stderr.write("error in reducer.py: %s\n" % e)
            continue

        if current_key != key:
            if current_key:
                yield current_key, name_total, word_total

            name_total = 0
            word_total = 0
            current_key = key

        name_total += name_count
        word_total += word_count

    # Flush the last key, which no later record can close.
    if current_key:
        yield current_key, name_total, word_total


if __name__ == "__main__":
    for key, name_total, word_total in reduce_counts(sys.stdin):
        print("%s\t%d\t%d" % (key, name_total, word_total))

Overwriting reducer.py


In [28]:
%%writefile mapper2.py

# Mapper for the second job: re-emits each record with the counts in front of the word.
import sys
# Python 2 only: make UTF-8 the default encoding. The import stays inside
# the branch because the `imp` module no longer exists on Python 3.12+.
if sys.version[0] == '2':
    from imp import reload
    reload(sys)
    sys.setdefaultencoding("utf-8") # required to convert to unicode


def rekey_counts(lines):
    """Aggregate adjacent records per key and put the counters first.

    Each input line has the form ``key<TAB>name_count<TAB>word_count``.
    Counters of consecutive lines with the same key are summed.

    Args:
        lines: iterable of raw input lines.

    Yields:
        ``(name_total, word_total, key)`` tuples, one per run of equal
        keys, in input order. Lines that cannot be parsed are reported on
        stderr and skipped.
    """
    current_key = None
    name_total = 0
    word_total = 0

    for line in lines:
        try:
            key, name_count, word_count = line.strip().split('\t', 2)
            name_count = int(name_count)
            word_count = int(word_count)
        except ValueError as e:
            # stdout carries the mapper output, so diagnostics go to stderr.
            sys.stderr.write("error in mapper2.py: %s\n" % e)
            continue

        if current_key != key:
            if current_key:
                yield name_total, word_total, current_key

            name_total = 0
            word_total = 0
            current_key = key

        name_total += name_count
        word_total += word_count

    # Flush the last key, which no later record can close.
    if current_key:
        yield name_total, word_total, current_key


if __name__ == "__main__":
    for name_total, word_total, key in rekey_counts(sys.stdin):
        print("%d\t%d\t%s" % (name_total, word_total, key))

Overwriting mapper2.py


In [31]:
%%writefile reducer2.py

# Your code for reducer here.
import sys
# Python 2 only: make UTF-8 the default encoding. The import stays inside
# the branch because the `imp` module no longer exists on Python 3.12+.
if sys.version[0] == '2':
    from imp import reload
    reload(sys)
    sys.setdefaultencoding("utf-8") # required to convert to unicode


# Minimum share of name-like occurrences for a word to be reported.
NAME_RATIO_THRESHOLD = 0.995


def _is_mostly_name(name_total, word_total, threshold):
    """Return True when name_total / word_total reaches ``threshold``."""
    # A zero denominator cannot satisfy the ratio test; skipping it also
    # avoids a ZeroDivisionError that would kill the whole task.
    if word_total == 0:
        return False
    return float(name_total) / float(word_total) >= threshold


def select_names(lines, threshold=NAME_RATIO_THRESHOLD):
    """Keep the words that occur almost exclusively in name form.

    Each input line has the form ``name_count<TAB>word_count<TAB>key``.
    Counters of consecutive lines with the same key are summed before the
    ratio test is applied.

    Args:
        lines: iterable of raw input lines.
        threshold: minimum ``name_total / word_total`` ratio to keep a key.

    Yields:
        ``(key, name_total)`` tuples for the keys that pass, in input
        order. Lines that cannot be parsed are reported on stderr and
        skipped.
    """
    current_key = None
    name_total = 0
    word_total = 0

    for line in lines:
        try:
            name_count, word_count, key = line.strip().split('\t', 2)
            name_count = int(name_count)
            word_count = int(word_count)
        except ValueError as e:
            # stdout becomes the job's final output, so diagnostics go to
            # stderr instead of being mixed into the results.
            sys.stderr.write("error in reducer2.py: %s\n" % e)
            continue

        if current_key != key:
            if current_key and _is_mostly_name(name_total, word_total, threshold):
                yield current_key, name_total

            name_total = 0
            word_total = 0
            current_key = key

        name_total += name_count
        word_total += word_count

    # Flush the last key, which no later record can close.
    if current_key and _is_mostly_name(name_total, word_total, threshold):
        yield current_key, name_total


if __name__ == "__main__":
    for key, name_total in select_names(sys.stdin):
        print("%s\t%d" % (key, name_total))

Overwriting reducer2.py


In [34]:
%%bash

# Unique HDFS output directories (epoch seconds plus a sub-second suffix) so
# that repeated runs never collide with the output of an earlier run.
OUT_DIR_JOB1="namecount_job1_$(date +"%s%6N")"
OUT_DIR_JOB2="namecount_job2_$(date +"%s%6N")"
NUM_REDUCERS=8
STREAMING_JAR="/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar"

# Remove both temporary output directories however the cell ends, including
# an early exit after a failed job. -f keeps the removal quiet when a
# directory was never created.
cleanup() {
    hdfs dfs -rm -r -f -skipTrash "${OUT_DIR_JOB1}" > /dev/null
    hdfs dfs -rm -r -f -skipTrash "${OUT_DIR_JOB2}" > /dev/null
}
trap cleanup EXIT

# Job 1: per-word totals. reducer.py doubles as the combiner.
# Stop here on failure: job 2 has nothing to read without this output.
yarn jar "${STREAMING_JAR}" \
    -D mapreduce.job.name="Streaming NameCountTask3 Job 1" \
    -D mapreduce.job.reduces="${NUM_REDUCERS}" \
    -files mapper.py,reducer.py,/datasets/stop_words_en.txt \
    -mapper "python mapper.py" \
    -combiner "python reducer.py" \
    -reducer "python reducer.py" \
    -input /data/wiki/en_articles_part \
    -output "${OUT_DIR_JOB1}" > /dev/null || exit 1

# Job 2: orders the records with KeyFieldBasedComparator (options -k1,3nr:
# numeric, reversed) and funnels them through a single reducer so that the
# result is one globally ordered file.
yarn jar "${STREAMING_JAR}" \
    -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \
    -D mapreduce.job.name="Streaming NameCountTask3 Job 2" \
    -D stream.map.output.field.separator="\t" \
    -D mapreduce.partition.keycomparator.options="-k1,3nr" \
    -D mapreduce.job.reduces=1 \
    -files mapper2.py,reducer2.py \
    -mapper "python mapper2.py" \
    -reducer "python reducer2.py" \
    -input "${OUT_DIR_JOB1}" \
    -output "${OUT_DIR_JOB2}" > /dev/null || exit 1

# Print the fifth line of the final output.
hdfs dfs -cat "${OUT_DIR_JOB2}/part-00000" | head -5 | tail -1

french	5742


18/12/21 05:58:41 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/12/21 05:58:41 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/12/21 05:58:42 INFO mapred.FileInputFormat: Total input files to process : 1
18/12/21 05:58:42 INFO mapreduce.JobSubmitter: number of splits:2
18/12/21 05:58:42 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1545365530829_0041
18/12/21 05:58:42 INFO impl.YarnClientImpl: Submitted application application_1545365530829_0041
18/12/21 05:58:42 INFO mapreduce.Job: The url to track the job: http://3bfb327c519e:8088/proxy/application_1545365530829_0041/
18/12/21 05:58:42 INFO mapreduce.Job: Running job: job_1545365530829_0041
18/12/21 05:58:47 INFO mapreduce.Job: Job job_1545365530829_0041 running in uber mode : false
18/12/21 05:58:47 INFO mapreduce.Job:  map 0% reduce 0%
18/12/21 05:59:03 INFO mapreduce.Job:  map 67% reduce 0%
18/12/21 05:59:08 INFO mapreduce.Job:  map 83% reduce 0%
18/12/21 05:59:09 INFO 