In [1]:
import pyspark 
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession for this notebook via getOrCreate(), registered under
# the application name "RDD test". Ending the cell with `spark` displays it.
spark: SparkSession = (
    SparkSession.builder
    .appName("RDD test")
    .getOrCreate()
)

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/02 16:21:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


**Create RDD**

In [4]:
# Create an RDD from an in-memory Python list with parallelize().
data = list(range(1, 13))  # the integers 1..12
rdd = spark.sparkContext.parallelize(data)
# collect() returns the elements to the driver so the cell can display them.
rdd.collect()

                                                                                

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [47]:
# Create an RDD from a local text file; the collected output shows one record per line.
# NOTE(review): the location is a hard-coded absolute path on the author's machine -
# adjust it before running elsewhere.
rdd2 = spark.sparkContext.textFile(
    "file:///home/iloveu/BK_bat_diet/20212/BigData/spark_test/test.txt"
)
rdd2.collect()

                                                                                

['Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'by Lewis Carroll',
 'This eBook is for the use',
 'of anyone anywhere',
 'at no cost and with',
 'Alice’s Adventures in Wonderland',
 'by Lewis Carroll',
 'This eBook is for the use',
 'of anyone anywhere no no no']

**Test some transformations on rdd2**

In [48]:

# Recap of the steps of a Spark program:
#   Step 1: create an RDD (as above)
#   Step 2: lazily transform it into intermediate RDDs
#   Step 3: cache() the RDDs that will be reused
#   Step 4: call actions to run the parallel computation and get the results back

# filter transformation: keep only the lines that contain "Alice".
filteredRDD = rdd2.filter(lambda text: "Alice" in text)
print("FilteredRDD")
# collect() is the action that triggers the computation here.
print(filteredRDD.collect())

FilteredRDD


[Stage 62:>                                                         (0 + 0) / 2]

['Alice’s Adventures in Wonderland', 'Alice’s Adventures in Wonderland']


                                                                                

In [49]:
# flatMap transformation: break every line into words and flatten them into one RDD.
# split() with no argument splits on any run of whitespace and never returns empty
# strings, so blank lines, tabs or repeated spaces do not yield '' "words" that the
# word-count cells below would count (split(" ") would emit them).
flatmapedRDD = rdd2.flatMap(lambda line: line.split())
print("flatmapedRDD")
print(flatmapedRDD.collect())

flatmapedRDD


[Stage 63:>                                                         (0 + 0) / 2]

['Project', 'Gutenberg’s', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'no', 'no', 'no']


                                                                                

In [50]:
# map transformation: pair every word with an initial count of 1,
# producing (word, 1) records.
mapedRDD = flatmapedRDD.map(lambda word: (word, 1))
print("mapedRDD")
print(mapedRDD.collect())

mapedRDD


[Stage 64:>                                                         (0 + 2) / 2]

[('Project', 1), ('Gutenberg’s', 1), ('Alice’s', 1), ('Adventures', 1), ('in', 1), ('Wonderland', 1), ('by', 1), ('Lewis', 1), ('Carroll', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of', 1), ('anyone', 1), ('anywhere', 1), ('at', 1), ('no', 1), ('cost', 1), ('and', 1), ('with', 1), ('Alice’s', 1), ('Adventures', 1), ('in', 1), ('Wonderland', 1), ('by', 1), ('Lewis', 1), ('Carroll', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of', 1), ('anyone', 1), ('anywhere', 1), ('no', 1), ('no', 1), ('no', 1)]


                                                                                

In [51]:
# reduceByKey transformation: add up the 1s of identical words. At this point we
# have the number of times each word occurs in the text, as (word, count) records.
reducedByKeyRDD = mapedRDD.reduceByKey(lambda left, right: left + right)
print("reducedByKeyRDD")
print(reducedByKeyRDD.collect())

reducedByKeyRDD
[('Project', 1), ('Gutenberg’s', 1), ('Alice’s', 2), ('in', 2), ('Lewis', 2), ('Carroll', 2), ('is', 2), ('use', 2), ('of', 2), ('anyone', 2), ('anywhere', 2), ('at', 1), ('no', 4), ('Adventures', 2), ('Wonderland', 2), ('by', 2), ('This', 2), ('eBook', 2), ('for', 2), ('the', 2), ('cost', 1), ('and', 1), ('with', 1)]


                                                                                

In [52]:
# sortByKey transformation: order the words by how often they occur, ascending.
# The records are first flipped from (word, count) to (count, word) so that the
# count becomes the key that sortByKey() sorts on.
sortedByKeyRDD = (
    reducedByKeyRDD
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey()
)
print("sortedByKeyRDD")
print(sortedByKeyRDD.collect())

sortedByKeyRDD


[Stage 72:>                                                         (0 + 2) / 2]

[(1, 'Project'), (1, 'Gutenberg’s'), (1, 'at'), (1, 'cost'), (1, 'and'), (1, 'with'), (2, 'Alice’s'), (2, 'in'), (2, 'Lewis'), (2, 'Carroll'), (2, 'is'), (2, 'use'), (2, 'of'), (2, 'anyone'), (2, 'anywhere'), (2, 'Adventures'), (2, 'Wonderland'), (2, 'by'), (2, 'This'), (2, 'eBook'), (2, 'for'), (2, 'the'), (4, 'no')]


                                                                                

**Test some actions on the RDDs computed above**

In [53]:
# Action - count: number of records in the RDD (one record per distinct word).
print("Count number of record: ", sortedByKeyRDD.count(), sep="")

Count number of record: 23


In [54]:
# Action - first: fetch the first record of the RDD, a (count, word) tuple.
firstRec = sortedByKeyRDD.first()
print("First Record : " + ", ".join((str(firstRec[0]), firstRec[1])))

First Record : 1, Project


In [55]:
# Action - max: fetch the largest record of the RDD.
# The records here are (count, word) tuples, so the comparison presumably follows
# Python tuple ordering (count first, then word) - verify.
datMax = sortedByKeyRDD.max()
print("Max Record : " + ",".join((str(datMax[0]), datMax[1])))

Max Record : 4,no


In [57]:
# Action - reduce: fold the whole RDD into one single record. This can be used
# for totals; here it yields the total number of words in the text.
# The fold sums the counts and keeps the left operand's word, so only element [0]
# of the result is meaningful.
# NOTE(review): the lambda is not commutative in its second element, while Spark's
# reduce expects a commutative and associative operator - which word survives in
# element [1] may therefore vary; confirm before relying on it.
totalWordCount = sortedByKeyRDD.reduce(
    lambda acc, rec: (acc[0] + rec[0], acc[1])
)
print(totalWordCount)
print("dataReduce Record : " + str(totalWordCount[0]))

(42, 'Project')
dataReduce Record : 42


In [62]:
# Action - take: fetch the first n records of the RDD (n = 3 here).
data3 = sortedByKeyRDD.take(3)
for key, value in data3:
    # each record is a (count, word) tuple
    print("data3 Key:" + str(key) + ", Value:" + value)

<class 'list'>
data3 Key:1, Value:Project
data3 Key:1, Value:Gutenberg’s
data3 Key:1, Value:at


In [61]:

# Action - collect: bring every record of the RDD back to the driver as a Python
# list (an array in Java). May run out of memory if the data set is too large.
# The result gets its own name so that the `data` list fed to parallelize() in the
# "Create RDD" cell is not overwritten when cells are re-run out of order.
collectedRecords = sortedByKeyRDD.collect()
print(type(collectedRecords))
for key, value in collectedRecords:
    # each record is a (count, word) tuple
    print("Key:" + str(key) + ", Value:" + value)


<class 'list'>
Key:1, Value:Project
Key:1, Value:Gutenberg’s
Key:1, Value:at
Key:1, Value:cost
Key:1, Value:and
Key:1, Value:with
Key:2, Value:Alice’s
Key:2, Value:in
Key:2, Value:Lewis
Key:2, Value:Carroll
Key:2, Value:is
Key:2, Value:use
Key:2, Value:of
Key:2, Value:anyone
Key:2, Value:anywhere
Key:2, Value:Adventures
Key:2, Value:Wonderland
Key:2, Value:by
Key:2, Value:This
Key:2, Value:eBook
Key:2, Value:for
Key:2, Value:the
Key:4, Value:no


In [68]:
# Finally, the save action: saveAsTextFile() writes the RDD out - it creates a
# whole folder at the target location rather than a single file.
# NOTE(review): the target is a hard-coded absolute local path, and the call
# presumably fails when that folder already exists - confirm before re-running.
sortedByKeyRDD.saveAsTextFile(
    "file:///home/iloveu/BK_bat_diet/20212/BigData/spark_test/wordscount"
)

                                                                                

**RDD Cache**

In [72]:
# Mark rdd2 for caching. PySpark's RDD cache() method uses the default storage
# level `MEMORY_ONLY`.
# NOTE(review): the original note said the data is kept in the JVM heap as
# unserialized objects; in PySpark the cached records are presumably stored in
# pickled (serialized) form - confirm against the RDD persistence docs.
# NOTE(review): no action is invoked on cachedRDD in this cell, so presumably
# nothing is materialized yet; the displayed repr is that of the textFile RDD.
cachedRDD = rdd2.cache()
cachedRDD

file:///home/iloveu/BK_bat_diet/20212/BigData/spark_test/test.txt MapPartitionsRDD[66] at textFile at NativeMethodAccessorImpl.java:0