In [1]:
# Snakebite is pretty easy to install!
#   sudo pip install snakebite
# Note that a bare "import snakebite" doesn't seem to do much on its own. Import the Client class from snakebite.client, as below.

from snakebite.client import Client

In [2]:
# Connection settings for the HDFS NameNode, pulled out so they are easy to change.
#   NAMENODE_HOST is the hostname or IP address of where the NameNode is.
#       ... in production it would not be localhost
#   NAMENODE_PORT is the port used for NameNode communications.
#       NOTE(review): 9000 is what this setup uses; it has to match the port in the
#       cluster's NameNode address setting -- confirm for your own cluster.
NAMENODE_HOST = "localhost"
NAMENODE_PORT = 9000

# This starts up our client connection.
#   use_trash=False tells the client not to put things in the trash when you "rm" them.
client = Client(NAMENODE_HOST, NAMENODE_PORT, use_trash=False)

In [3]:
# List the client's public attributes (anything whose name does not start with an
# underscore). These include the commands available to you through Snakebite.
[name for name in dir(client) if not name.startswith('_')]

['COUNT_ATTRIBUTES',
 'FILETYPES',
 'LISTING_ATTRIBUTES',
 'cat',
 'chgrp',
 'chmod',
 'chown',
 'copyToLocal',
 'count',
 'delete',
 'df',
 'du',
 'getmerge',
 'host',
 'ls',
 'mkdir',
 'port',
 'rename',
 'rename2',
 'rmdir',
 'serverdefaults',
 'service',
 'service_stub_class',
 'setrep',
 'stat',
 'tail',
 'test',
 'text',
 'touchz',
 'trash',
 'use_trash']

In [4]:
# Here is an example of "ls": each item it yields describes one entry under the path
for entry in client.ls(['/user/python']):
    # print(...) with a single argument behaves the same on Python 2 and Python 3
    print(entry)

{'group': u'supergroup', 'permission': 493, 'file_type': 'd', 'access_time': 0L, 'block_replication': 0, 'modification_time': 1428425016161L, 'length': 0L, 'blocksize': 0L, 'owner': u'python', 'path': '/user/python/indata'}
{'group': u'supergroup', 'permission': 493, 'file_type': 'd', 'access_time': 0L, 'block_replication': 0, 'modification_time': 1428424999362L, 'length': 0L, 'blocksize': 0L, 'owner': u'python', 'path': '/user/python/outdata'}
{'group': u'supergroup', 'permission': 493, 'file_type': 'd', 'access_time': 0L, 'block_replication': 0, 'modification_time': 1428203246026L, 'length': 0L, 'blocksize': 0L, 'owner': u'python', 'path': '/user/python/pgout'}
{'group': u'supergroup', 'permission': 493, 'file_type': 'd', 'access_time': 0L, 'block_replication': 0, 'modification_time': 1428201343597L, 'length': 0L, 'blocksize': 0L, 'owner': u'python', 'path': '/user/python/salarydata'}
{'group': u'supergroup', 'permission': 493, 'file_type': 'd', 'access_time': 0L, 'block_replication'

In [5]:
# Note that you can also use snakebite at the command line!
# It's quite a bit faster than "hadoop fs -ls" because it doesn't start up a JVM every time.
!snakebite ls /user/python

# Note: in case you didn't know, starting a line with ! in IPython runs it as a shell command.

Found 6 items
drwxr-xr-x   - python     supergroup          0 2015-04-07 12:43 /user/python/indata
drwxr-xr-x   - python     supergroup          0 2015-04-07 12:43 /user/python/outdata
drwxr-xr-x   - python     supergroup          0 2015-04-04 23:07 /user/python/pgout
drwxr-xr-x   - python     supergroup          0 2015-04-04 22:35 /user/python/salarydata
drwxr-xr-x   - python     supergroup          0 2015-04-04 22:35 /user/python/wcdata
-rw-r--r--   1 python     supergroup      29887 2015-04-07 12:14 /user/python/wordcount.py.out


In [6]:
# This script takes a look at the top words from the word count output
#    Thought: Maybe it would be a good follow-on process to the mrjob script?

# NOTE(review): `string` is not used in this cell; the import is kept in case another cell relies on it.
import string

# Only words counted more than this many times are printed.
MIN_COUNT = 200

# wordcount.py.out was created by the wordcount.py mrjob script.
# cat creates one generator per path in the list, and each of those generators
# produces the data one chunk at a time, so both levels are flattened here.
chunks = [
    str(chunk)
    for file_chunks in client.cat(['wordcount.py.out'])
    for chunk in file_chunks
]

# reconstitute all the data together
s = ''.join(chunks)

# see if we can pull out some important names from Pride & Prejudice
for line in s.split('\n'):
    # skip blank lines, such as the empty string left after a trailing newline
    if not line.strip():
        continue

    # the count is the last tab-separated field on the line
    count = int(line.split('\t')[-1])
    if count > MIN_COUNT:
        # print(...) with a single argument behaves the same on Python 2 and Python 3
        print(line)


"bennet"	293
"bingley"	257
"could"	525
"darcy"	371
"elizabeth"	594
"jane"	263
"know"	239
"may"	207
"miss"	283
"mr"	783
"mrs"	343
"much"	328
"must"	318
"never"	220
"one"	266
"said"	401
"soon"	216
"think"	211
"though"	220
"well"	212
"would"	469


In [6]:
# Exercise: Take the list of poorly named files in indata and move them to outdata with new names
#  Snakebite is great at implementing automated data organization tasks like this

# You can ignore the following. These shell commands are setting up the exercise for you.
# Note that the hadoop commands can take a few seconds each.
print "Running the setup commands..."
!hadoop fs -rm -r /user/python/outdata
!hadoop fs -rm -r /user/python/indata
!hadoop fs -mkdir /user/python/indata
!hadoop fs -mkdir /user/python/outdata
!echo "this is a file" | hadoop fs -put - /user/python/indata/in1.txt
!echo "python is awesome" | hadoop fs -put - /user/python/indata/in2.txt
!echo "hadoop is fun" | hadoop fs -put - /user/python/indata/in3.txt
print "Done running the setup commands."

# Python starts here
import time

# you'll use this time stamp later
now = int(time.time())

# Using snakebite, do an "ls" on /user/python/indata/
# STOP! Test your "ls" with print to make sure you are listing the contents of indata
#   if you don't you might blow something up in the next step


# Iterate through each file in indata (output of ls) and move the file to outdata
#   rename the file using the timestamp. Use "rename" for this.

Running the setup commands...
15/04/07 12:43:06 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /user/python/outdata
15/04/07 12:43:10 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /user/python/indata
Done running the setup commands.
