In [2]:
# Display the SparkContext handle to confirm the PySpark session is available.
sc

<pyspark.context.SparkContext at 0x112f31bd0>

In [3]:
# Load the Spark README as an RDD with one element per line of text.
# NOTE(review): this is an absolute, machine-specific path -- point it at your own Spark checkout.
rdd = sc.textFile("/Users/Enzo/Downloads/spark/README.md")

In [4]:
# Peek at the first 10 lines of the file.
rdd.take(10)

[u'# Apache Spark',
 u'',
 u'Spark is a fast and general cluster computing system for Big Data. It provides',
 u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
 u'supports general computation graphs for data analysis. It also supports a',
 u'rich set of higher-level tools including Spark SQL for SQL and DataFrames,',
 u'MLlib for machine learning, GraphX for graph processing,',
 u'and Spark Streaming for stream processing.',
 u'',
 u'<http://spark.apache.org/>']

In [5]:
# WORD COUNT problem
# 1. flatMap splits each line into words on whitespace and flattens the per-line lists into one list of words
# (blank lines contribute nothing). The u'' prefix in the output just marks Python 2 unicode strings.
# 2. map lower-cases each word and pairs it with a 1 || ex: [..., (u'spark', 1), ...]
# 3. reduceByKey merges all pairs that share the same key (word) by adding their values, giving each word's count

# Word count: tokenize -> (lower-cased word, 1) -> sum the 1s per word -> show 10 results.
(
    rdd.flatMap(lambda line: line.split())
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda left, right: left + right)
    .take(10)
)
    

[(u'when', 1),
 (u'alternatively,', 1),
 (u'"local"', 1),
 (u'including', 4),
 (u'computation', 1),
 (u'note', 1),
 (u'submit', 1),
 (u'using:', 1),
 (u'guidance', 2),
 (u'environment', 1)]

In [14]:
# Another approach for the WORD COUNT problem
# 1. groupByKey returns (K, V) pairs where V is an iterable of every value seen for key K -- here, one 1 per occurrence of the word
# 2. mapValues sums that iterable for each pair, giving the word's count

# Word count via grouping: collect every 1 emitted for a word, then sum each group.
(
    rdd.flatMap(lambda line: line.split())
    .map(lambda word: (word.lower(), 1))
    .groupByKey()
    .mapValues(lambda ones: sum(ones))
    .take(10)
)

[(u'when', 1),
 (u'alternatively,', 1),
 (u'"local"', 1),
 (u'including', 4),
 (u'computation', 1),
 (u'note', 1),
 (u'submit', 1),
 (u'using:', 1),
 (u'guidance', 2),
 (u'environment', 1)]

In [46]:
# textFile() EXPLANATION
# 1. minPartitions is passed to Hadoop's InputFormat.getSplits. The parameter is only a hint, so you may get more
# or fewer partitions, depending on the Hadoop InputFormat implementation.
# getSplits logically splits the set of input files for the job; each InputSplit is then assigned to an individual
# Mapper for processing.

# Request at least 3 partitions (by default it is 2), then check how many we actually got.
chunks = sc.textFile(
    "/Users/Enzo/Downloads/spark/README.md",
    minPartitions=3,
)
chunks.getNumPartitions()

3

In [47]:
# Another approach for the WORD COUNT problem
# 1. mapPartitions calls mapper once per partition (chunk) with an iterator over that partition's lines; it is
# like map, but works on a whole partition at a time
# 2. inside mapper, k1v1s is that iterator; each of its elements is a single line of text, which is split into words
# 3. reduceByKey then adds up the 1s for each word across all partitions, so wc holds the counts for the whole file

def mapper(k1v1s):
    """Turn the lines of one partition into ``(word, 1)`` pairs.

    Args:
        k1v1s: Iterable of text lines (one partition's records).

    Yields:
        A ``(lower-cased word, 1)`` tuple for every whitespace-separated
        token, in the order the tokens appear.
    """
    # Lazily flatten lines -> tokens so the partition is streamed, not materialized.
    lowered_words = (
        token.lower()
        for text_line in k1v1s
        for token in text_line.split()
    )
    for lowered in lowered_words:
        yield (lowered, 1)

# Emit (word, 1) pairs partition by partition, then add up the 1s for each word.
wc = (
    chunks.mapPartitions(mapper)
    .reduceByKey(lambda left, right: left + right)
)
wc.take(2)

[(u'when', 1), (u'alternatively,', 1)]

In [58]:
# TOP 3 - WORD COUNT problem
# 1. heapq implements a heap-based priority queue
# 2. nlargest(n, iterable[, key]) returns the n largest elements of the given iterable.
# 3. mapper2 is applied to each partition of wc and yields that partition's 3 words with the highest counts. The key
# function tells nlargest to compare elements by their count (the second item of each pair). Every result is emitted
# under the same key, 0, so that reduceByKey combines all of them.
# 4. reducer2 merges two top-3 lists and keeps the 3 words with the highest counts
# 5. collect() - Returns all the elements of the dataset as an array at the driver program. This is usually useful
# after a filter or other operation that returns a sufficiently small subset of the data.

import heapq 

def mapper2(counts):
    """Yield one partition's three highest-count entries under the constant key 0.

    Args:
        counts: Iterable of ``(word, count)`` pairs.

    Yields:
        A single ``(0, top3)`` tuple, where ``top3`` is a list of up to three
        pairs ordered from highest to lowest count.
    """
    def by_count(pair):
        # Rank entries by the count stored in position 1.
        return pair[1]

    partition_top3 = heapq.nlargest(3, counts, key=by_count)
    yield (0, partition_top3)
    
def reducer2(top31,top32):
    """Merge two top-3 lists and keep the three entries with the highest counts.

    Args:
        top31: List of ``(word, count)`` pairs.
        top32: List of ``(word, count)`` pairs.

    Returns:
        A list of up to three pairs ordered from highest to lowest count.
    """
    candidates = top31 + top32
    return heapq.nlargest(3, candidates, key=lambda pair: pair[1])

# Take each partition's top 3 (all emitted under key 0), merge them with reducer2,
# and bring the single resulting (0, top3) pair back to the driver.
top3  = wc.mapPartitions(mapper2).reduceByKey(reducer2)
top3.collect()

[(0, [(u'the', 25), (u'to', 19), (u'spark', 16)])]

In [54]:
# Another approach -- TOP 3 - WORD COUNT problem
# 1. reduce is used here instead of reduceByKey, so no dummy key is needed. This gets rid of the leading 0 in the previous output

import heapq

def mapper2(counts):
    """Yield one partition's three highest-count entries as a plain list.

    Args:
        counts: Iterable of ``(word, count)`` pairs.

    Yields:
        A single list of up to three pairs ordered from highest to lowest count.
    """
    def count_of(entry):
        # The count lives in position 1 of each pair.
        return entry[1]

    yield heapq.nlargest(3, counts, key=count_of)
    
def reducer2(top31,top32):
    """Combine two top-3 lists into one list of the three highest-count entries.

    Args:
        top31: List of ``(word, count)`` pairs.
        top32: List of ``(word, count)`` pairs.

    Returns:
        A list of up to three pairs ordered from highest to lowest count.
    """
    def count_of(entry):
        return entry[1]

    return heapq.nlargest(3, top31 + top32, key=count_of)


# Take each partition's top 3 and fold them together with reducer2; reduce returns
# the final list directly to the driver.
top3 = wc.mapPartitions(mapper2).reduce(reducer2)
top3

[(u'the', 25), (u'to', 19), (u'spark', 16)]

In [60]:
# Why cache? cache() asks Spark to keep the RDD's elements around on the cluster once they have been computed,
# so later queries against `trips` are much faster than re-reading the file.
trips = sc.textFile("citibike.csv").cache()

In [66]:
# CITIBIKE Problem
# 1. mapPartitionsWithIndex -- similar to mapPartitions, but also passes the function an integer
# representing the index of the partition


def mapper3(partId, records):
    """Yield the third CSV column of every record in one partition.

    Written for ``mapPartitionsWithIndex``, which passes the partition index
    together with an iterator over that partition's lines.

    Args:
        partId: Integer index of the partition being processed.
        records: Iterable of raw CSV text lines.

    Yields:
        The value at column index 2 of each parsed row, as a string.
    """
    import csv  # kept local, as in the original cell, so the function is self-contained

    records = iter(records)
    if partId == 0:
        # Skip the first line of partition 0 (presumably the CSV header -- confirm against the file).
        # The built-in next() works on both Python 2 and 3 (Python 3 iterators have no .next() method),
        # and the default keeps an empty partition from raising StopIteration inside this generator.
        next(records, None)

    for trip in csv.reader(records):
        yield trip[2]

# Run mapper3 over every partition (it receives the partition index) and peek at the first 4 values.
output = trips.mapPartitionsWithIndex(mapper3)
output.take(4)

['801', '379', '2474', '818']