# Generating Data

In [1]:
# Build an RDD from a small in-memory Python list, then materialise every
# element again as a local list so it can be printed.
# `sc` is assumed to be the SparkContext supplied by the notebook environment.
rdd = sc.parallelize([1, 2, 3, 4, 5])
print(rdd.collect())

[1, 2, 3, 4, 5]


In [1]:
# parallelize() also accepts a lazy sequence: xrange(1, 31, 3) yields
# 1, 4, 7, ... and stops before 31.
rdd = sc.parallelize(xrange(1, 31, 3))
print(rdd.collect())

[1, 4, 7, 10, 13, 16, 19, 22, 25, 28]


In [2]:
# Same numbers as the xrange example, but generated directly through
# sc.range(start, end, step) without building a local sequence first.
rdd = sc.range(1, 31, 3)
print(rdd.collect())

[1, 4, 7, 10, 13, 16, 19, 22, 25, 28]


# Loading Data from HDFS

In [3]:
# Load a text file from HDFS; each RDD element is one line of the file.
rdd = sc.textFile('/user/cloudera/alice.txt')

# Only the first ten lines are shown. take(10) fetches just those lines,
# whereas collect()[0:10] would first pull the whole file into the notebook
# process only to throw most of it away. The printed result is the same.
print(rdd.take(10))

[u"Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll", u'', u'This eBook is for the use of anyone anywhere at no cost and with', u'almost no restrictions whatsoever.  You may copy it, give it away or', u're-use it under the terms of the Project Gutenberg License included', u'with this eBook or online at www.gutenberg.org', u'', u'', u"Title: Alice's Adventures in Wonderland", u'']


# Retrieving Data

In [4]:
# collect() returns *all* elements as a local list -- fine for nine numbers.
rdd = sc.range(1, 10)
print(rdd.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [5]:
# first() returns only the leading element of the RDD.
rdd = sc.range(1, 1000)
print(rdd.first())

1


In [6]:
# take(n) returns the first n elements -- here the first ten lines of the book.
rdd = sc.textFile('/user/cloudera/alice.txt')
print(rdd.take(10))

[u"Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll", u'', u'This eBook is for the use of anyone anywhere at no cost and with', u'almost no restrictions whatsoever.  You may copy it, give it away or', u're-use it under the terms of the Project Gutenberg License included', u'with this eBook or online at www.gutenberg.org', u'', u'', u"Title: Alice's Adventures in Wonderland", u'']


In [7]:
# Draw ten random lines, sampling with replacement.
# No seed is passed, so the printed lines can differ from run to run.
rdd = sc.textFile('/user/cloudera/alice.txt')
print(rdd.takeSample(withReplacement=True, num=10))

[u"'The lobsters!' shouted the Gryphon, with a bound into the air.", u'array of equipment including outdated equipment.  Many small donations', u'from the public domain (does not contain a notice indicating that it is', u'The Queen turned crimson with fury, and, after glaring at her for a', u'or PGLAF), owns a compilation copyright in the collection of Project', u'', u"rule, 'and vinegar that makes them sour--and camomile that makes", u"'Sure, it does, yer honour: but it's an arm for all that.'", u"There was a dead silence instantly, and Alice thought to herself, 'I", u'']


# Simple Transformations: Map

In [8]:
# map() applies the function to every element: square the numbers 1..9.
rdd = sc.range(1, 10)
result = rdd.map(lambda n: n ** 2)
print(result.collect())

[1, 4, 9, 16, 25, 36, 49, 64, 81]


In [9]:
# With map(), each input line becomes exactly one output element: the list of
# whitespace-separated tokens on that line (an empty list for blank lines).
rdd = sc.textFile('/user/cloudera/alice.txt')
result = rdd.map(lambda line: line.split())
words = result.take(20)
for tokens in words:
    print(tokens)

[u'Project', u"Gutenberg's", u"Alice's", u'Adventures', u'in', u'Wonderland,', u'by', u'Lewis', u'Carroll']
[]
[u'This', u'eBook', u'is', u'for', u'the', u'use', u'of', u'anyone', u'anywhere', u'at', u'no', u'cost', u'and', u'with']
[u'almost', u'no', u'restrictions', u'whatsoever.', u'You', u'may', u'copy', u'it,', u'give', u'it', u'away', u'or']
[u're-use', u'it', u'under', u'the', u'terms', u'of', u'the', u'Project', u'Gutenberg', u'License', u'included']
[u'with', u'this', u'eBook', u'or', u'online', u'at', u'www.gutenberg.org']
[]
[]
[u'Title:', u"Alice's", u'Adventures', u'in', u'Wonderland']
[]
[u'Author:', u'Lewis', u'Carroll']
[]
[u'Posting', u'Date:', u'June', u'25,', u'2008', u'[EBook', u'#11]']
[u'Release', u'Date:', u'March,', u'1994']
[u'[Last', u'updated:', u'December', u'20,', u'2011]']
[]
[u'Language:', u'English']
[]
[]
[u'***', u'START', u'OF', u'THIS', u'PROJECT', u'GUTENBERG', u'EBOOK', u"ALICE'S", u'ADVENTURES', u'IN', u'WONDERLAND', u'***']


In [10]:
# flatMap() splits each line like the map() example, but flattens the
# per-line lists so every token is its own RDD element.
rdd = sc.textFile('/user/cloudera/alice.txt')
result = rdd.flatMap(lambda line: line.split())
words = result.take(30)
for word in words:
    print(word)

Project
Gutenberg's
Alice's
Adventures
in
Wonderland,
by
Lewis
Carroll
This
eBook
is
for
the
use
of
anyone
anywhere
at
no
cost
and
with
almost
no
restrictions
whatsoever.
You
may
copy


In [11]:
# Tokenise the book and print thirty randomly chosen tokens.
# Sampling is with replacement and unseeded, so the output varies per run.
rdd = sc.textFile('/user/cloudera/alice.txt')
result = rdd.flatMap(lambda line: line.split())
words = result.takeSample(withReplacement=True, num=30)
for word in words:
    print(word)

would
clearer
hardly
'--and
way,
personal
to
But
it
double
said
marked
and
not
said
I've
the
moment
you
in
unable
IN
sounded
tell
THE
his
it.
any
Project
beginning!'


# Filtering Data

In [12]:
# filter() keeps only the elements for which the predicate is true;
# here, the even numbers.
rdd = sc.parallelize([1, 2, 3, 4, 5])
result = (
    rdd.filter(lambda n: n % 2 == 0)
       .collect()
)
print(result)

[2, 4]


# Aggregations

In [13]:
# Built-in numeric aggregations over the numbers 1..9.
# Each call below is a separate action on the RDD.
rdd = sc.range(1, 10)
print("Count=%s" % rdd.count())
print("Sum=%s" % rdd.sum())
print("Mean=%s" % rdd.mean())
print("Min=%s" % rdd.min())
print("Max=%s" % rdd.max())
# variance()/stdev() and their sample*() counterparts give different results
# for the same data (6.67 vs 7.5 in the recorded output).
print("Variance=%s" % rdd.variance())
print("Stddev=%s" % rdd.stdev())
print("SampleVariance=%s" % rdd.sampleVariance())
print("SampleStddev=%s" % rdd.sampleStdev())

Count=9
Sum=45
Mean=5.0
Min=1
Max=9
Variance=6.66666666667
Stddev=2.58198889747
SampleVariance=7.5
SampleStddev=2.73861278753


In [14]:
# stats() bundles count, mean, stdev, max and min into a single summary object.
rdd = sc.range(1, 10)
result = rdd.stats()
print(result)

(count: 9, mean: 5.0, stdev: 2.58198889747, max: 9.0, min: 1.0)


In [15]:
# reduce() folds the elements pairwise with the given function;
# with addition this yields the sum of 1..9.
rdd = sc.range(1, 10)
result = rdd.reduce(lambda left, right: left + right)
print(result)

45


In [16]:
# Compute the mean with aggregate(), carrying a (running_sum, running_count)
# pair as the accumulator. The sum starts as a float so that the final
# division is a true division even under Python 2.
zero = (0.0, 0)


def reducer(acc, value):
    """Fold one element into an accumulator: add to the sum, bump the count."""
    return (acc[0] + value, acc[1] + 1)


def combiner(left, right):
    """Merge two accumulators component-wise."""
    return (left[0] + right[0], left[1] + right[1])


rdd = sc.range(1, 10)
result = rdd.aggregate(zero, reducer, combiner)

print(result[0] / result[1])

5.0


# Making Data Distinct

In [17]:
# distinct() drops duplicate elements. The collected result is sorted locally
# before printing so the displayed order is stable.
rdd = sc.parallelize(['b', 'a', 'c', 'c', 'a'])
result = rdd.distinct()
print(sorted(result.collect()))

['a', 'b', 'c']


# Working with Key-Value Data

In [18]:
# countByKey() counts how many pairs share each key (values are ignored)
# and returns a local dict-like object; .items() lists its entries.
rdd = sc.parallelize([
    ("a", 1),
    ("b", 3),
    ("c", 17),
    ("b", 23),
    ("c", 12),
])
print(rdd.countByKey().items())

[('a', 1), ('c', 2), ('b', 2)]


In [19]:
# countByValue() counts the occurrences of each distinct element.
rdd = sc.parallelize(["a", "b", "c", "b", "c"])
print(rdd.countByValue().items())

[('a', 1), ('c', 2), ('b', 2)]


In [20]:
# reduceByKey() merges all values that share a key using the given function;
# with addition this produces a per-key sum.
rdd = sc.parallelize([
    ("a", 1),
    ("b", 3),
    ("c", 17),
    ("b", 23),
    ("c", 12),
])
result = rdd.reduceByKey(lambda left, right: left + right)
print(result.collect())

[('a', 1), ('c', 29), ('b', 26)]


In [21]:
# Per-key mean via aggregateByKey(): each key carries a
# (running_sum, running_count) accumulator, which is divided out at the end.
# The sum starts as a float so the final division is a true division.
zero = (0.0, 0)
reducer = lambda acc, value: (acc[0] + value, acc[1] + 1)
combiner = lambda left, right: (left[0] + right[0], left[1] + right[1])

rdd = sc.parallelize([("a", 1), ("b", 3), ("c", 17), ("b", 23), ("c", 12)])
# The final step indexes into the (sum, count) pair instead of using a
# tuple-parameter lambda (`lambda (x, y): ...`), which is a syntax error on
# Python 3. Results are unchanged on Python 2.
result = (
    rdd.aggregateByKey(zero, reducer, combiner)
       .mapValues(lambda acc: acc[0] / acc[1])
)
print(result.collect())

[('a', 1.0), ('c', 14.5), ('b', 13.0)]


# Sorting Data

In [22]:
# sortBy() orders elements by the key the function returns; the identity
# function plus ascending=False gives a plain descending sort.
rdd = sc.parallelize(['b', 'a', 'c', 'c', 'a'])
result = rdd.sortBy(lambda item: item, ascending=False)
print(result.collect())

['c', 'c', 'b', 'a', 'a']


In [23]:
# sortByKey() orders pairs by key only; the recorded output shows that values
# under the same key are not sorted (('b', 23) precedes ('b', 3)).
rdd = sc.parallelize([
    ("a", 1),
    ("b", 3),
    ("c", 17),
    ("b", 23),
    ("c", 12),
])
result = rdd.sortByKey()
print(result.collect())

[('a', 1), ('b', 23), ('b', 3), ('c', 17), ('c', 12)]


# Grouping Data

In [24]:
# groupByKey() gathers all values of a key into one iterable. Each iterable is
# turned into a plain list for printing, and the pairs are sorted locally so
# the keys appear in a stable order.
rdd = sc.parallelize([("a", 1), ("b", 3), ("a", 17), ("c", 23), ("b", 12)])
result = rdd.groupByKey()
print(sorted((key, list(values)) for key, values in result.collect()))

[('a', [17, 1]), ('b', [3, 12]), ('c', [23])]


In [25]:
# groupBy() derives the grouping key from each element; here the numbers
# 1..19 are bucketed by their remainder modulo 3.
rdd = sc.range(1, 20)
result = rdd.groupBy(lambda n: "Remainder=" + str(n % 3))
print(sorted((label, list(members)) for label, members in result.collect()))

[('Remainder=0', [12, 15, 18, 3, 6, 9]), ('Remainder=1', [1, 4, 7, 10, 13, 16, 19]), ('Remainder=2', [11, 14, 17, 2, 5, 8])]


In [26]:
# Per-key sum, the grouping way: collect every value of a key first, then add
# them up. (The lambda parameter is named `values` rather than `iter` so the
# builtin of that name is not shadowed.)
rdd = sc.parallelize([("a", 1), ("b", 3), ("a", 17), ("c", 23), ("b", 12)])
result = rdd.groupByKey().mapValues(lambda values: sum(values))
print(sorted(result.collect()))

[('a', 18), ('b', 15), ('c', 23)]


In [27]:
# Same per-key sum as the groupByKey() version, expressed directly with
# reduceByKey(); the printed result is identical.
rdd = sc.parallelize([("a", 1), ("b", 3), ("a", 17), ("c", 23), ("b", 12)])
result = rdd.reduceByKey(lambda left, right: left + right)
print(sorted(result.collect()))

[('a', 18), ('b', 15), ('c', 23)]


# Joining Data

In [28]:
# Full outer join: every key from either side appears in the result; the side
# that lacks the key contributes None.
x = sc.parallelize([
    ("a", 1),
    ("b", 4),
])
y = sc.parallelize([
    ("a", 2),
    ("c", 8),
])
result = x.fullOuterJoin(y).collect()
print(sorted(result))

[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]


In [29]:
# Inner join: only keys present on both sides survive ("a" here).
x = sc.parallelize([
    ("a", 1),
    ("b", 4),
])
y = sc.parallelize([
    ("a", 2),
    ("c", 8),
])
result = x.join(y).collect()
print(sorted(result))

[('a', (1, 2))]


In [30]:
# Left outer join: every key of `x` is kept; a missing match in `y` is None.
x = sc.parallelize([
    ("a", 1),
    ("b", 4),
])
y = sc.parallelize([
    ("a", 2),
    ("c", 8),
])
result = x.leftOuterJoin(y).collect()
print(sorted(result))

[('a', (1, 2)), ('b', (4, None))]


In [31]:
# Right outer join: every key of `y` is kept; a missing match in `x` is None.
x = sc.parallelize([
    ("a", 1),
    ("b", 4),
])
y = sc.parallelize([
    ("a", 2),
    ("c", 8),
])
result = x.rightOuterJoin(y).collect()
print(sorted(result))

[('a', (1, 2)), ('c', (None, 8))]


# Sampling and Splitting

In [32]:
# Draw a seeded sample without replacement using a fraction of 0.8, then
# report how many of the 1000 elements were kept.
rdd = sc.range(0, 1000)
result = rdd.sample(withReplacement=False, fraction=0.8, seed=124)
print(result.count())

800


In [33]:
# Split the RDD randomly into two parts with weights 8 and 2, using a fixed
# seed, and print the size of each part. The recorded sizes (803 / 197) add
# up to the original 1000 elements.
rdd = sc.range(0, 1000)
a, b = rdd.randomSplit([8, 2], seed=12)
print("#A: %s" % a.count())
print("#B: %s" % b.count())

#A: 803
#B: 197


# Caching Data

In [1]:
# Give the RDD a readable name, mark it for caching, then run an action over
# it. The final bare expression is the cell's displayed result.
rdd = sc.range(0, 1000)
rdd.setName("0 to 1000")
rdd.cache()
rdd.reduce(lambda left, right: left + right)

499500

In [2]:
# Undo the cache() request made in the previous cell. This cell relies on
# `rdd` still being the RDD named "0 to 1000" from that cell; the displayed
# value is the RDD's own repr, as returned by unpersist().
rdd.unpersist()

0 to 1000 PythonRDD[1] at RDD at PythonRDD.scala:43

# Storing Data

In [35]:
# Write the numbers 0..99 to HDFS as text under /user/cloudera/numbers.
# NOTE(review): the output is presumably a directory of part files (the next
# step merges it with `hdfs dfs -getmerge`), and the save is expected to fail
# if that path already exists -- confirm, and remove it before re-running.
rdd = sc.range(0,100)
rdd.saveAsTextFile('/user/cloudera/numbers')

## Retrieving the Stored Numbers from HDFS

    # Merge the saved output into one local file, then show its first 10 lines.
    hdfs dfs -getmerge /user/cloudera/numbers /tmp/numbers.txt
    head -n 10 /tmp/numbers.txt

# WordCount Revisited

In [37]:
# Word count over the whole book, written as one step per line:
#   1. split every line into whitespace-separated tokens,
#   2. pair each token with a count of 1,
#   3. add up the counts per token,
#   4. order by count, most frequent first,
#   5. format each (token, count) pair as "token:count" for text output.
# The last step indexes into the pair instead of using a tuple-parameter
# lambda (`lambda (k, v): ...`), which is a syntax error on Python 3.
text = sc.textFile('/user/cloudera/alice.txt')
words = (
    text.flatMap(lambda line: line.split())
        .map(lambda word: (word, 1))
        .reduceByKey(lambda left, right: left + right)
        .sortBy(lambda pair: pair[1], ascending=False)
        .map(lambda pair: pair[0] + ':' + str(pair[1]))
)
# NOTE(review): the save is expected to fail if the target path already
# exists -- confirm, and remove /user/cloudera/alice_counts before re-running.
words.saveAsTextFile('/user/cloudera/alice_counts')

## Retrieving the Word Counts from HDFS

    # Merge the saved word counts into one local file, then show the top 20 lines.
    hdfs dfs -getmerge /user/cloudera/alice_counts /tmp/alice_counts.txt
    head -n 20 /tmp/alice_counts.txt