In [None]:
"""
on colab
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
"""


## with pyspark.SparkContext

In [1]:
import pyspark
import random


In [2]:
sc = pyspark.SparkContext(appName="test")

In [9]:
%%time
#sc = pyspark.SparkContext(appName="Pi")
num_samples = 100

def inside(p):     
  x, y = random.random(), random.random()
  return x*x + y*y < 1

count = sc.parallelize(range(0, num_samples)).filter(inside).count()

pi = 4 * count / num_samples
print(pi)

sc.stop()

3.24
Wall time: 3.62 s


### RDD Transformation
transfomation produce RDDs

In [3]:
#flatMap
rdd=sc.parallelize([2,3,4])
y=rdd.flatMap(lambda x: range(1,x))
y.collect()

[1, 1, 2, 1, 2, 3]

In [47]:
#map
rdd=sc.parallelize([2,3,4])
y=rdd.map(lambda x: x+1)
y.collect()

[3, 4, 5]

In [49]:
#filter
rdd=sc.parallelize([1,2,3,4,5,6,7])
y=rdd.filter(lambda x: x%2==1)
y.collect()

[1, 3, 5, 7]

In [56]:
#sample
rdd=sc.parallelize(range(5))
y=rdd.sample(withReplacement=True,fraction=0.5)
y.collect()

[2, 2]

In [58]:
#union
rdd1=sc.parallelize(range(9))
rdd2=sc.parallelize(range(14,19))
y=rdd1.union(rdd2)
y.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18]

In [61]:
#intersection
rdd1=sc.parallelize(range(9))
rdd2=sc.parallelize(range(4,19))
y=rdd1.intersection(rdd2)
y.collect()

[8, 4, 5, 6, 7]

In [63]:
#distinct
rdd1=sc.parallelize(range(9))
rdd2=sc.parallelize(range(4,12))
y=rdd1.union(rdd2).distinct()
y.collect()

[0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15]

In [67]:
#sortBy
rdd1=sc.parallelize([5,6,8,3,2,6,9,4])
rdd2=sc.parallelize(range(4,12))
y=rdd1.sortBy(lambda x: x, ascending=False,numPartitions=None)
y.collect()

[9, 8, 6, 6, 5, 4, 3, 2]

In [74]:
#sortBy
rdd1=sc.parallelize([("H",10),("A",26),("Z",1)])
y=rdd1.sortBy(lambda x: x[1], ascending=False,numPartitions=None)
y.collect()

[('A', 26), ('H', 10), ('Z', 1)]

In [87]:
#mapPartitions
rdd=sc.parallelize([1,2,3,4],2)
def f(x): yield sum(x)
y=rdd.mapPartitions(f)
y.collect()

[3, 7]

In [94]:
#mapPartitionsWithIndex
rdd=sc.parallelize([1,2,3,4],3)
def f(splitIndex, x): yield splitIndex
y=rdd.mapPartitionsWithIndex(f)
y.collect()

[0, 1, 2]

In [106]:
#groupBy
rdd=sc.parallelize([1,1,3,5,7,3,2,4])
res=rdd.groupBy(lambda x: x%3).collect()
[(x, sorted(y)) for (x, y) in res]

[(0, [3, 3]), (1, [1, 1, 4, 7]), (2, [2, 5])]

In [126]:
#zip
rdd1=sc.parallelize(range(1,5))
rdd2=sc.parallelize(range(101,105))
y=rdd1.zip(rdd2)
y.collect()

[(1, 101), (2, 102), (3, 103), (4, 104)]

In [130]:
#zipWithIndex
rdd1=sc.parallelize(["a","b","c","d"])
y=rdd1.zipWithIndex()
y.collect()

[('a', 0), ('b', 1), ('c', 2), ('d', 3)]

In [127]:
#keyBy and cogroup()
x=sc.parallelize(range(0,3)).keyBy(lambda x: x*x)
y=sc.parallelize(zip(range(5),range(6)))  #the extra list element is ignored in zip
[(x,list(map(list,y))) for x,y in sorted(x.cogroup(y).collect())]

[(0, [[0], [0]]),
 (1, [[1], [1]]),
 (2, [[], [2]]),
 (3, [[], [3]]),
 (4, [[2], [4]])]

In [139]:
#rePartition
rdd=sc.parallelize([1,2,3,4,5,6,7],4)
print(rdd.glom().collect())
print(rdd.repartition(2).glom().collect())

[[1], [2, 3], [4, 5], [6, 7]]
[[1, 4, 5, 6, 7], [2, 3]]


In [140]:
#coalesce
rdd=sc.parallelize([1,2,3,4,5,6,7],4)
print(rdd.glom().collect())
print(rdd.coalesce(2).glom().collect())

[[1], [2, 3], [4, 5], [6, 7]]
[[1, 2, 3], [4, 5, 6, 7]]


### RDD Action
Produce a value back to Spark driver program

In [4]:
from operator import add

In [5]:
#reduce
rdd=sc.parallelize([1,2,3,4,5])
print(rdd.reduce(add))

print(sc.parallelize(range(10)).map(lambda x: 1).reduce(add))

15
10


In [6]:
#first
rdd=sc.parallelize([2,3,4,5])
rdd.first()

2

In [7]:
#takeOrdered
rdd=sc.parallelize([2,7,6,3,4,5])
rdd.takeOrdered(4)

[2, 3, 4, 5]

In [8]:
#take
rdd=sc.parallelize([2,7,6,3,4,5])
rdd.take(4)

[2, 7, 6, 3]

In [9]:
#count
rdd=sc.parallelize([2,7,6,3,4,5])
rdd.count()

6

In [10]:
#collect
rdd=sc.parallelize([2,7,6,3,4,5])
rdd.collect()

[2, 7, 6, 3, 4, 5]

In [20]:
#saveAsTextFile
rdd=sc.parallelize([2,7,6,3,4,5])
#rdd.saveAsTextFile("data.txt")

In [27]:
#foreach
def f(x): print(x)
sc.parallelize([2,7,6,3,4,5]).foreach(f)

In [26]:
#foreachPartition
def f(iter):
    for x in iter:
        print(x)
sc.parallelize([2,7,6,3,4,5]).foreachPartition(f)

In [30]:
#min,max,mean,sum,variance,stdev
rdd=sc.parallelize(range(100))
rdd.stdev()

28.86607004772212

### RDD Functions

In [32]:
#countByValue
rdd=sc.parallelize([2,4,6,3,5,6,2,7,5,4,7,7,3.5,3.5])
rdd.countByValue()

defaultdict(int, {2: 2, 4: 2, 6: 2, 3: 1, 5: 2, 7: 3, 3.5: 2})

In [38]:
#toDebugString
rdd1=sc.parallelize(range(1,9),3)
rdd2=sc.parallelize(range(21,29),3)
rdd=rdd2.subtract(rdd1)
rdd.toDebugString()

b'(6) PythonRDD[89] at RDD at PythonRDD.scala:53 []\n |  MapPartitionsRDD[88] at mapPartitions at PythonRDD.scala:133 []\n |  ShuffledRDD[87] at partitionBy at <unknown>:0 []\n +-(6) PairwiseRDD[86] at subtract at <ipython-input-38-39c0a860c938>:4 []\n    |  PythonRDD[85] at subtract at <ipython-input-38-39c0a860c938>:4 []\n    |  UnionRDD[84] at union at <unknown>:0 []\n    |  PythonRDD[82] at RDD at PythonRDD.scala:53 []\n    |  ParallelCollectionRDD[81] at parallelize at PythonRDD.scala:195 []\n    |  PythonRDD[83] at RDD at PythonRDD.scala:53 []\n    |  ParallelCollectionRDD[80] at parallelize at PythonRDD.scala:195 []'

In [43]:
#create paired RDDs
rdd=sc.parallelize([("a1","b1","c1","d1"),("a2","b2","c2","d2")])
rdd.map(lambda x: (x[0],list(x[1:]))).collect()

[('a1', ['b1', 'c1', 'd1']), ('a2', ['b2', 'c2', 'd2'])]

### transformation on paired RDDs

In [None]:
# reduceByKey, groupByKey, mapValues,flatMapValues, keys, sortByKey, subtractByKey, 
# join, rightOuterJoin,leftOuterJoin, cogroup

In [None]:
#rdd Lineage

### word count

In [64]:
rdd=sc.textFile("pytest.txt")
nonempty_line=rdd.filter(lambda x: len(x)>0)
words=nonempty_line.flatMap(lambda x: x.split(' '))
y=words.map(lambda x: (x,1))
word_count=y.reduceByKey(add).map(lambda x: (x[1],x[0])).sortByKey(False)
for word in word_count.collect():
    print(word)

(2, 'is')
(2, 'pyspark')
(1, 'line')
(1, 'spark')
(1, 'with')
(1, 'python,')
(1, 'great!')
(1, 'fun.')
(1, '2')


## with pyspark.sql.SparkSession

In [65]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark create RDD example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [69]:
spark.catalog.listTables()

[]

In [70]:
path = 'Advertising.csv'

In [72]:
df = spark.read.csv(path=path,
                    sep=',',encoding='UTF-8',comment=None,
                    header=True,inferSchema=True)
df

DataFrame[_c0: int, TV: double, Radio: double, Newspaper: double, Sales: double]

In [73]:
df.show(5,True)
df.printSchema()

+---+-----+-----+---------+-----+
|_c0|   TV|Radio|Newspaper|Sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
+---+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [48]:
sc