In [27]:
from pyspark.sql import SparkSession

# Local session that uses every available core. The SQLite JDBC driver jar is
# placed on the driver classpath and also listed in spark.jars.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("YourAppName")
    .config("spark.jars", "/spark/jars/sqlite-jdbc-3.36.0.3.jar")
    .config("spark.driver.extraClassPath", "/spark/jars/sqlite-jdbc-3.36.0.3.jar")
    .getOrCreate()
)

In [28]:
# Build a 10-row DataFrame and expose it through the RDD API
spark.range(10).rdd

MapPartitionsRDD[70] at javaToPython at NativeMethodAccessorImpl.java:0

In [29]:
# Name the column "id", switch to the RDD API, and keep only the first field of each row
spark.range(10).toDF("id").rdd.map(lambda row: row[0])

PythonRDD[77] at RDD at PythonRDD.scala:53

In [30]:
# Create an RDD from a local collection
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
# The second argument asks for the collection to be spread over 2 partitions
words = spark.sparkContext.parallelize(myCollection, 2)

In [31]:
# Attach a name to the RDD, then read it back
words.setName("myWords")
words.name()

'myWords'

In [32]:
# Number of unique words in the RDD
words.distinct().count()

10

In [33]:
# filter
# Comparable to WHERE in SQL: every record in the RDD is checked and only
# the records for which the predicate returns True are kept.
def startsWithS(individual):
    """Return True when ``individual`` begins with an uppercase "S"."""
    prefix = "S"
    return individual.startswith(prefix)

In [34]:
# The predicate already takes a single word, so it can be passed to filter as-is
words.filter(startsWithS).collect()

['Spark', 'Simple']

In [35]:
# map
# Turn each word into a tuple: (word, its first character, whether it starts with "S")
words2 = words.map(lambda word: (word, word[0], word.startswith("S")))

In [36]:
# Keep the tuples whose third field (the starts-with-"S" flag) is true; fetch at most 5
words2.filter(lambda record: record[2]).take(5)

[('Spark', 'S', True), ('Simple', 'S', True)]

In [37]:
# flatMap
# Expand each word into its characters; the per-word lists are flattened
# into a single RDD of characters.
words.flatMap(list).take(5)

['S', 'p', 'a', 'r', 'k']

In [38]:
# Sort by word length, longest first (a negated length gives descending order)
words.sortBy(lambda word: -len(word)).take(2)

['Definitive', 'Processing']

In [39]:
# Split the words into two RDDs with equal weights. A fixed seed makes the
# split reproducible, so re-running the notebook yields the same two subsets.
fiftyFiftySplit = words.randomSplit([0.5, 0.5], seed=42)

In [40]:
# Inspect the result of randomSplit: its container type, the first split,
# and up to 5 elements from that split
print(type(fiftyFiftySplit))
print(fiftyFiftySplit[0])
print(fiftyFiftySplit[0].take(5))

<class 'list'>
PythonRDD[95] at RDD at PythonRDD.scala:53
['Spark', 'The', 'Definitive']


In [41]:
# reduce: combine all values in the RDD into a single value
# Here the integers 1 through 20 are added together
spark.sparkContext.parallelize(range(1, 21)).reduce(lambda x, y: x + y)

210

In [42]:
# Find the longest word in the collection, implemented with the reduce method
def wordLengthReducer(leftWord, rightWord):
    """Return the longer of two words, for use as an ``RDD.reduce`` operator.

    ``reduce`` expects a commutative and associative operator because the
    order in which elements and partitions are combined is not fixed. Words
    of equal length are therefore ordered by comparing the strings
    themselves, so the result does not depend on argument order.

    Args:
        leftWord: first candidate word.
        rightWord: second candidate word.

    Returns:
        The longer word; on a length tie, the lexicographically greater one.
    """
    return max(leftWord, rightWord, key=lambda word: (len(word), word))


# Apply the reducer across the whole RDD to obtain a single longest word
words.reduce(wordLengthReducer)

'Processing'

In [43]:
# Total number of elements in the RDD
words.count()

10

In [44]:
# Approximate count: the timeout is given in milliseconds, and the result
# may be incomplete if counting has not finished when it expires.
timeoutMillseconds = 400
confidence = 0.95
words.countApprox(timeout=timeoutMillseconds, confidence=confidence)

10

In [45]:
# Approximate number of distinct elements; the argument is the relative accuracy
words.countApproxDistinct(relativeSD=0.05)

10

In [46]:
# Count how many times each value occurs; the counts come back as a dictionary
words.countByValue()

defaultdict(int,
            {'Spark': 1,
             'The': 1,
             'Definitive': 1,
             'Guide': 1,
             ':': 1,
             'Big': 1,
             'Data': 1,
             'Processing': 1,
             'Made': 1,
             'Simple': 1})

In [47]:
# First element of the RDD
words.first()

'Spark'

### 파일 저장하기

In [50]:
import shutil

# saveAsTextFile refuses to write into a directory that already exists, so a
# second run of this cell would fail. Remove output left by a previous run
# first (the session runs with a local master, so the path is on this machine).
bookTitleDir = "/tmp/bookTitle2"
shutil.rmtree(bookTitleDir, ignore_errors=True)
words.saveAsTextFile("file:" + bookTitleDir)

In [51]:
# Two ranges over [2, 1000000): one stepping by 2, the other by 4
df1 = spark.range(2, 1000000, 2)
df2 = spark.range(2, 1000000, 4)

# Redistribute each input over a fixed number of partitions (5 and 6)
step1 = df1.repartition(5)
step12 = df2.repartition(6)
# Multiply every id by 5, keeping the column name "id"
step2 = step1.selectExpr("id * 5 as id")
# Join the scaled ids with the second range on the "id" column
step3 = step2.join(step12, ["id"])
# Aggregate the joined ids into a single sum
step4 = step3.selectExpr("sum(id)")

# Run the query and bring the single result row back to the driver
step4.collect()

[Row(sum(id)=25000000000)]

In [52]:
# Print the physical plan Spark uses for the join-and-sum query
step4.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(4) HashAggregate(keys=[], functions=[sum(id#40L)])
   +- ShuffleQueryStage 3
      +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=375]
         +- *(3) HashAggregate(keys=[], functions=[partial_sum(id#40L)])
            +- *(3) Project [id#40L]
               +- *(3) BroadcastHashJoin [id#40L], [id#34L], Inner, BuildRight, false
                  :- *(3) Project [(id#32L * 5) AS id#40L]
                  :  +- ShuffleQueryStage 0
                  :     +- Exchange RoundRobinPartitioning(5), REPARTITION_BY_NUM, [plan_id=262]
                  :        +- *(1) Range (2, 1000000, step=2, splits=10)
                  +- BroadcastQueryStage 2
                     +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=272]
                        +- ShuffleQueryStage 1
                           +- Exchange RoundRobinPartitioning(6), REPARTITION_BY_NUM, [plan_