# Low Level APIs

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [62]:
spark

In [4]:
spark.catalog.listTables()

[]

In [8]:
a = [1,2,3,4,5,6,8,10,23]
rdd1 = spark.sparkContext.parallelize(a, 2)
rdd1

ParallelCollectionRDD[4] at parallelize at PythonRDD.scala:195

In [9]:
rdd1.map(lambda x: x**2).collect()

[1, 4, 9, 16, 25, 36, 64, 100, 529]

In [17]:
df = spark.sparkContext.textFile('./2010-12-01.csv')
df.take(10)

['InvoiceNo\tStockCode\tDescription\tQuantity\tInvoiceDate\tUnitPrice\tCustomerID\tCountry',
 '536365\t85123A\tWHITE HANGING HEART T-LIGHT HOLDER\t6\t2010-12-01 08:26:00\t2.55\t17850.0\tUnited Kingdom',
 '536365\t71053\tWHITE METAL LANTERN\t6\t2010-12-01 08:26:00\t3.39\t17850.0\tUnited Kingdom',
 '536365\t84406B\tCREAM CUPID HEARTS COAT HANGER\t8\t2010-12-01 08:26:00\t2.75\t17850.0\tUnited Kingdom',
 '536365\t84029G\tKNITTED UNION FLAG HOT WATER BOTTLE\t6\t2010-12-01 08:26:00\t3.39\t17850.0\tUnited Kingdom',
 '536365\t84029E\tRED WOOLLY HOTTIE WHITE HEART.\t6\t2010-12-01 08:26:00\t3.39\t17850.0\tUnited Kingdom',
 '536365\t22752\tSET 7 BABUSHKA NESTING BOXES\t2\t2010-12-01 08:26:00\t7.65\t17850.0\tUnited Kingdom',
 '536365\t21730\tGLASS STAR FROSTED T-LIGHT HOLDER\t6\t2010-12-01 08:26:00\t4.25\t17850.0\tUnited Kingdom',
 '536366\t22633\tHAND WARMER UNION JACK\t6\t2010-12-01 08:28:00\t1.85\t17850.0\tUnited Kingdom',
 '536366\t22632\tHAND WARMER RED POLKA DOT\t6\t2010-12-01 08:28:00\t1.85

In [None]:
df = spark.sparkContext.textFile

In [39]:
# 用python将DataFrame转化为RDD
df = spark.range(10)
df.show(2)

+---+
| id|
+---+
|  0|
|  1|
+---+
only showing top 2 rows



In [19]:
df.rdd

MapPartitionsRDD[25] at javaToPython at NativeMethodAccessorImpl.java:0

In [30]:
df.rdd.map(lambda x: x[0]).collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [11]:
spark.range(10).rdd.map(lambda row: row).collect()

[Row(id=0),
 Row(id=1),
 Row(id=2),
 Row(id=3),
 Row(id=4),
 Row(id=5),
 Row(id=6),
 Row(id=7),
 Row(id=8),
 Row(id=9)]

In [42]:
mycollection = "Spark the definitive Guide"
words = spark.sparkContext.parallelize(mycollection.split(' '), 2)

In [44]:
words.distinct().count()

4

In [45]:
# filter
def startsWithS(individual):
    return individual.startswith('S')


In [46]:
words.filter(lambda word: startsWithS(word)).collect()

['Spark']

In [47]:
words2 = words.map(lambda word: (word, word[0], word.startswith('S')))

In [48]:
words2.collect()

[('Spark', 'S', True),
 ('the', 't', False),
 ('definitive', 'd', False),
 ('Guide', 'G', False)]

In [32]:
words.flatMap(lambda word: list(word)).collect()

['S',
 'p',
 'a',
 'r',
 'k',
 't',
 'h',
 'e',
 'd',
 'e',
 'f',
 'i',
 'n',
 'i',
 't',
 'i',
 'v',
 'e',
 'G',
 'u',
 'i',
 'd',
 'e']

In [31]:
words.collect()

['Spark', 'the', 'definitive', 'Guide']

In [33]:
# sort
words.sortBy(lambda word: len(word) * -1).collect()

['definitive', 'Spark', 'Guide', 'the']

In [49]:
# reduce
spark.sparkContext.parallelize(range(1,21)).reduce(lambda x,y : x+y)

210

In [50]:
def wordLengthReducer(leftword, rightword):
    if len(leftword) > len(rightword):
        return leftword
    else:
        return rightword
    
words.reduce(wordLengthReducer)

'definitive'

In [51]:
words.getStorageLevel()

StorageLevel(False, False, False, False, 1)

In [54]:
# saving to files
words.saveAsTextFile('./save_words.txt')

In [56]:
# cache 可以选择 memory only， disk only 等选项
words.cache()

myWords ParallelCollectionRDD[73] at parallelize at PythonRDD.scala:195

In [57]:
words.getStorageLevel()

StorageLevel(False, True, False, False, 1)

In [60]:
# chekpointing save RDD 到磁盘上
# 首先要设定sparkcontext存储的路径
spark.sparkContext.setCheckpointDir('./')
words.checkpoint()

In [52]:
words.take(5)

['Spark', 'the', 'definitive', 'Guide']

## Advanced RDD

In [59]:
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
words = spark.sparkContext.parallelize(myCollection, 2)

In [61]:
# key-value RDDs 最简单的是将其转变为key-value键值对
words.map(lambda word: (word.lower(), 1)).take(5)

[('spark', 1), ('the', 1), ('definitive', 1), ('guide', 1), (':', 1)]

In [62]:
# 通过keyby指定一个key
keyword = words.keyBy(lambda word: word.lower()[0]) # 使用第一个字母作为key
keyword.take(10)

[('s', 'Spark'),
 ('t', 'The'),
 ('d', 'Definitive'),
 ('g', 'Guide'),
 (':', ':'),
 ('b', 'Big'),
 ('d', 'Data'),
 ('p', 'Processing'),
 ('m', 'Made'),
 ('s', 'Simple')]

In [63]:
# mapping over values
keyword.mapValues(lambda word: word.upper()).collect()

[('s', 'SPARK'),
 ('t', 'THE'),
 ('d', 'DEFINITIVE'),
 ('g', 'GUIDE'),
 (':', ':'),
 ('b', 'BIG'),
 ('d', 'DATA'),
 ('p', 'PROCESSING'),
 ('m', 'MADE'),
 ('s', 'SIMPLE')]

In [64]:
keyword.map(lambda word: (word[0], word[1].upper())).collect()

[('s', 'SPARK'),
 ('t', 'THE'),
 ('d', 'DEFINITIVE'),
 ('g', 'GUIDE'),
 (':', ':'),
 ('b', 'BIG'),
 ('d', 'DATA'),
 ('p', 'PROCESSING'),
 ('m', 'MADE'),
 ('s', 'SIMPLE')]

In [65]:
# flatmap
keyword.flatMapValues(lambda word: word.upper()).collect()

[('s', 'S'),
 ('s', 'P'),
 ('s', 'A'),
 ('s', 'R'),
 ('s', 'K'),
 ('t', 'T'),
 ('t', 'H'),
 ('t', 'E'),
 ('d', 'D'),
 ('d', 'E'),
 ('d', 'F'),
 ('d', 'I'),
 ('d', 'N'),
 ('d', 'I'),
 ('d', 'T'),
 ('d', 'I'),
 ('d', 'V'),
 ('d', 'E'),
 ('g', 'G'),
 ('g', 'U'),
 ('g', 'I'),
 ('g', 'D'),
 ('g', 'E'),
 (':', ':'),
 ('b', 'B'),
 ('b', 'I'),
 ('b', 'G'),
 ('d', 'D'),
 ('d', 'A'),
 ('d', 'T'),
 ('d', 'A'),
 ('p', 'P'),
 ('p', 'R'),
 ('p', 'O'),
 ('p', 'C'),
 ('p', 'E'),
 ('p', 'S'),
 ('p', 'S'),
 ('p', 'I'),
 ('p', 'N'),
 ('p', 'G'),
 ('m', 'M'),
 ('m', 'A'),
 ('m', 'D'),
 ('m', 'E'),
 ('s', 'S'),
 ('s', 'I'),
 ('s', 'M'),
 ('s', 'P'),
 ('s', 'L'),
 ('s', 'E')]

In [66]:
# 直接获取key-value的key或者value
keyword.keys().collect()

['s', 't', 'd', 'g', ':', 'b', 'd', 'p', 'm', 's']

In [67]:
keyword.values().collect()

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [68]:
keyword.lookup('s')

['Spark', 'Simple']

In [63]:
import random
distinctChars = words.flatMap(lambda word: list(word.lower())).distinct().collect()
sampleMap = dict(map(lambda c: (c, random.random()), distinctChars))

words.map(lambda word: (word.lower()[0], word))\
    .sampleByKey(True, sampleMap, 6).collect() # withreplacemment=True, fractions=sampleMap, seed=6

[('t', 'the'), ('t', 'the')]

In [64]:
# aggregation
chars = words.flatMap(lambda word: word.lower())
KVcharacters = chars.map(lambda letter: (letter, 1))

def maxFunc(left, right):
    return max(left, right)

def addFunc(left, right):
    return left + right

nums = spark.sparkContext.parallelize(range(1,31), 5)

In [65]:
# groupByKey
from functools import reduce
KVcharacters.groupByKey().map(lambda row: (row[0], reduce(addFunc, row[1]))).collect()

[('s', 1),
 ('p', 1),
 ('r', 1),
 ('h', 1),
 ('d', 2),
 ('i', 4),
 ('g', 1),
 ('a', 1),
 ('k', 1),
 ('t', 2),
 ('e', 4),
 ('f', 1),
 ('n', 1),
 ('v', 1),
 ('u', 1)]

In [66]:
KVcharacters.reduceByKey(addFunc).collect()

[('s', 1),
 ('p', 1),
 ('r', 1),
 ('h', 1),
 ('d', 2),
 ('i', 4),
 ('g', 1),
 ('a', 1),
 ('k', 1),
 ('t', 2),
 ('e', 4),
 ('f', 1),
 ('n', 1),
 ('v', 1),
 ('u', 1)]

In [88]:
KVcharacters.aggregateByKey(0, addFunc, maxFunc).collect()

[('s', 3),
 ('p', 2),
 ('r', 1),
 ('h', 1),
 ('d', 2),
 ('i', 4),
 ('g', 2),
 ('b', 1),
 ('c', 1),
 ('l', 1),
 ('a', 3),
 ('k', 1),
 ('t', 2),
 ('e', 4),
 ('f', 1),
 ('n', 1),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('o', 1),
 ('m', 2)]

In [89]:
def valToCombiner(value):
    return [value]

def mergeValuesFunc(vals, valToAppend):
    vals.append(valToAppend)
    return vals

def mergeCoombinerFunc(vals1, vals2):
    return vals1 + vals2

KVcharacters.combineByKey(valToCombiner, mergeValuesFunc, mergeCoombinerFunc, 6).collect()

[('s', [1, 1, 1, 1]),
 ('d', [1, 1, 1, 1]),
 ('l', [1]),
 ('v', [1]),
 (':', [1]),
 ('p', [1, 1, 1]),
 ('r', [1, 1]),
 ('c', [1]),
 ('k', [1]),
 ('t', [1, 1, 1]),
 ('n', [1, 1]),
 ('u', [1]),
 ('o', [1]),
 ('h', [1]),
 ('i', [1, 1, 1, 1, 1, 1, 1]),
 ('g', [1, 1, 1]),
 ('b', [1]),
 ('a', [1, 1, 1, 1]),
 ('e', [1, 1, 1, 1, 1, 1, 1]),
 ('f', [1]),
 ('m', [1, 1])]

In [91]:
numRange = spark.sparkContext.parallelize(range(10), 2)
words.zip(numRange).collect()

[('Spark', 0),
 ('The', 1),
 ('Definitive', 2),
 ('Guide', 3),
 (':', 4),
 ('Big', 5),
 ('Data', 6),
 ('Processing', 7),
 ('Made', 8),
 ('Simple', 9)]

In [92]:
retail = spark.read.format('csv')\
    .option('header', 'True')\
    .option('inferSchema', 'True')\
    .load('/Users/yanghao/github/data/retail-data/')

In [96]:
retail.show(2)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 2 rows



In [109]:
rdd = retail.coalesce(10).rdd

def partitionFunc(key):
    import random
    if key == 17850 and key == 12583:
        return 0
    else:
        return random.randint(1,2)

keyedRDD = rdd.keyBy(lambda row: row[6])
keyedRDD\
    .partitionBy(3, partitionFunc)\
    .map(lambda x: x[0])\
    .glom()\
    .map(lambda x: len(set(x)))\
    .take(5)

[0, 4298, 4306]

In [99]:
rdd.glom?

[0;31mSignature:[0m [0mrdd[0m[0;34m.[0m[0mglom[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return an RDD created by coalescing all elements within each partition
into a list.

>>> rdd = sc.parallelize([1, 2, 3, 4], 2)
>>> sorted(rdd.glom().collect())
[[1, 2], [3, 4]]
[0;31mFile:[0m      ~/anaconda/lib/python3.6/site-packages/pyspark/rdd.py
[0;31mType:[0m      method


In [67]:
# broadcast
supplementalData = {'Spark': 10000, "Definitive": 200,
                   "Big":-300, "Simple":100}
suppBroadcast = spark.sparkContext.broadcast(supplementalData)

In [68]:
suppBroadcast.value

{'Big': -300, 'Definitive': 200, 'Simple': 100, 'Spark': 10000}

In [115]:
words.map(lambda word: (word, suppBroadcast.value.get(word, 0)))\
    .sortBy(lambda wordPair: wordPair[1])\
    .collect()

[('Big', -300),
 ('The', 0),
 ('Guide', 0),
 (':', 0),
 ('Data', 0),
 ('Processing', 0),
 ('Made', 0),
 ('Simple', 100),
 ('Definitive', 200),
 ('Spark', 10000)]

In [91]:
# accumulator
data = spark.read.csv('./2010-12-01.csv', sep='\t', header=True, inferSchema=True)

In [98]:
from pyspark.sql.functions import lit
data = data.withColumn('count', lit(1))

In [99]:
data.show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|count|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|    1|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|    1|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----+
only showing top 2 rows



In [100]:
# 定义一个accumulator
accum_variable = spark.sparkContext.accumulator(0)

In [101]:
# 计算Country为United Kingdom的个数
def func(data):
    col1 = data.Country
    if col1 == "United Kingdom":
        accum_variable.add(data['count'])

In [102]:
data.foreach(lambda row: func(row))

In [96]:
data.show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows

