# Hadoop Streaming assignment 3: Name Count
Write a WordCount program that counts all the names in the dataset. A name is a word with the following properties:

* The first character is not a digit (other characters can be digits).
* The first character is uppercase, all the other characters that are letters are lowercase.
* Among all occurrences of this word in the dataset, compared case-insensitively, fewer than 0.5% fail condition (2), i.e. the capitalization rule in the previous bullet.

Order the output by count, most popular first. Output format:

<code>name <tab> count</code>

The result is the 5th line in the output.

The result on the sample dataset:

<code>french 5742</code>

If you want to deploy the environment on your own machine, please use bigdatateam/yarn-notebook Docker container.

In [106]:
%%writefile test.dat

1	hello 2world
2	A hello Again2
42	Привет мир!
3	Another example with HeLLo
4	For The Horde!
4	For The Ho2rde!
4	For The hor4de!
4	For The 5Horde!
5	Hello, my name is! THe the tHe
1	$934h)) ,)34%%%0@
43	नमस्ते दुनिया!
4	Horde Horde Horde!

Overwriting test.dat


cat test.dat

In [107]:
%%writefile mapper1.py

import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8')

total = 0

for line in sys.stdin:
    try:
        article_id, text = unicode(line.strip()).split('\t', 1)
    except ValueError as e:
        continue
    text = re.sub("^\W+|\W+$", "", text, flags=re.UNICODE)
    words = re.split("\W*\s+\W*", text, flags=re.UNICODE)
    for word in words:
        total += 1       
        cond1 = word[0].isalpha()
        cond2 = not word[0].islower() and word[1:].islower()        
        print "%s\t%d\t%d\t%d" % (word.lower(), 1, cond1, cond2)
        
print >> sys.stderr, "reporter:counter:Wiki stats,Total words,%d" % total

Overwriting mapper1.py


In [115]:
cat test.dat | python2 ./mapper1.py | sort

reporter:counter:Wiki stats,Total words,37
2world	1	0	1
34%%%0	1	0	0
5horde	1	0	0
934h	1	0	1
a	1	1	0
again2	1	1	1
another	1	1	1
example	1	1	0
for	1	1	1
for	1	1	1
for	1	1	1
for	1	1	1
hello	1	1	0
hello	1	1	0
hello	1	1	0
hello	1	1	1
ho2rde	1	1	1
hor4de	1	1	0
horde	1	1	1
horde	1	1	1
horde	1	1	1
horde	1	1	1
is	1	1	0
my	1	1	0
name	1	1	0
the	1	1	0
the	1	1	0
the	1	1	0
the	1	1	1
the	1	1	1
the	1	1	1
the	1	1	1
with	1	1	0
мир	1	1	0
привет	1	1	1
दुनिय	1	1	0
नमस्त	1	1	0


In [116]:
%%writefile reducer1.py

import sys

key = None
word_sum = 0

for line in sys.stdin:
    try:
        word, count, cond1, cond2 = line.strip().split('\t', 3)
        count = int(count)
        cond1 = int(cond1)
        cond2 = int(cond2)
    except ValueError as e:
        continue
    
    if key != word:
        if key:
            print "%s\t%d\t%d\t%d" % (key, word_sum, cond1, cond2) # ERROR: cond1 and cond2 are taken only for the last word
        word_sum = 0
        key = word
    word_sum += count

if key:
    print "%s\t%d\t%d\t%d" % (key, word_sum, cond1, cond2)

Overwriting reducer1.py


In [117]:
cat test.dat | python2 ./mapper1.py | sort | python2 ./reducer1.py

reporter:counter:Wiki stats,Total words,37
2world	1	0	0
34%%%0	1	0	0
5horde	1	0	1
934h	1	1	0
a	1	1	1
again2	1	1	1
another	1	1	0
example	1	1	1
for	4	1	0
hello	4	1	1
ho2rde	1	1	0
hor4de	1	1	1
horde	4	1	0
is	1	1	0
my	1	1	0
name	1	1	0
the	7	1	0
with	1	1	0
мир	1	1	1
привет	1	1	0
दुनिय	1	1	0
नमस्त	1	1	0


In [111]:
%%writefile mapper2.py

import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8')

for line in sys.stdin:
    try:
        word, cnt = line.strip().split('\t', 1)
        cnt = int(cnt)
    except ValueError as e:
        continue    
    print "%d\t%s" % (cnt, word)

Overwriting mapper2.py


In [112]:
%%writefile reducer2.py

import sys

for line in sys.stdin:
    try:
        cnt, word = line.strip().split('\t', 1)
        cnt = int(cnt)
    except ValueError as e:
        continue
    print "%s\t%d" % (word, cnt)

Overwriting reducer2.py


In [113]:
cat test.dat | python2 ./mapper1.py | sort | python2 ./reducer1.py | python2 ./mapper2.py | sort -k1,1nr | python2 ./reducer2.py

reporter:counter:Wiki stats,Total words,37
