In [87]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by the caching / persistence demos.
# Optional: add .master("local[10]") to the builder to set the number of cores
# (that value runs Spark locally with 10 worker threads).
spark = SparkSession.builder.appName("Example").getOrCreate()

캐싱과 영속화

In [88]:
from pyspark.sql.functions import col
import time

In [89]:
# Ten million consecutive ids, each paired with its square.
df = (
    spark.range(10_000_000)
    .toDF("id")
    .withColumn("square", col("id") * col("id"))
)
df.show(10)

+---+------+
| id|square|
+---+------+
|  0|     0|
|  1|     1|
|  2|     4|
|  3|     9|
|  4|    16|
|  5|    25|
|  6|    36|
|  7|    49|
|  8|    64|
|  9|    81|
+---+------+
only showing top 10 rows



캐시 생성

In [90]:
# Mark df for caching. cache() is lazy, so the first action below is the one
# that actually computes the data and fills the cache.
df.cache()

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump or be coarse.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print(end - start)

1.5043022632598877


In [91]:
# Time the same count again: df was cached by the previous cell, so this run
# is expected to be much faster.
# perf_counter() is used because it is monotonic and high-resolution.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print(end - start)

0.11701583862304688


캐시에서 삭제

In [92]:
# Drop df's cached data. The cell output is the DataFrame that unpersist() returns.
df.unpersist()

DataFrame[id: bigint, square: bigint]

영속화

In [93]:
from pyspark.storagelevel import StorageLevel

In [94]:
# Fresh DataFrame for the persist() demo; here "square" holds 2 * id * id.
df = (
    spark.range(10_000_000)
    .toDF("id")
    .withColumn("square", col("id") * col("id") * 2)
)

In [95]:
# Request disk-only storage for df. persist() is lazy, so the first action
# below is the one that computes the data and writes it out.
df.persist(StorageLevel.DISK_ONLY)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump or be coarse.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print(end - start)

1.3100874423980713


In [96]:
# Time the same count again: df was persisted by the previous cell, so this
# run is expected to be much faster.
# perf_counter() is used because it is monotonic and high-resolution.
start = time.perf_counter()
df.count()
end = time.perf_counter()
print(end - start)

0.13684463500976562


In [97]:
# NOTE(review): currently disabled. Uncomment to release the DISK_ONLY data
# persisted above, mirroring the unpersist() that follows the cache() demo.
#df.unpersist()

셔플 소트 머지 조인

In [105]:
from pyspark.sql import SparkSession

# Session for the shuffle sort-merge join demo, requesting all local cores.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists; if the "Example" session from the first cell is still alive, the
# appName/master given here may not take effect. Confirm, or stop it first.
spark = (
    SparkSession.builder
    .appName("Test App")
    .master("local[*]")
    .getOrCreate()
)

In [106]:
import os
import random
import sys

# Run PySpark with the interpreter that is executing this notebook instead of
# a machine-specific absolute path, so the notebook works on other machines.
# NOTE(review): a Spark session already exists by this point; confirm these
# variables are still picked up, or move them above the first getOrCreate().
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# Disable automatic broadcast joins (threshold -1) so the join below is
# planned as a sort-merge join (SMJ).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

# Lookup tables used to generate the state and item columns.
states_dict = {0:"AZ", 1:"CO", 2:"CA", 3:"TX", 4:"NY", 5:"MI" }
items_dict = {0:"SKU-0", 1:"SKU-1", 2:"SKU-2", 3:"SKU-3", 4:"SKU-4", 5:"SKU-5"}


In [107]:
# Build usersDF: one synthetic user per id, with a randomly drawn state.
usersDF = (
    spark.range(10_000_000)
    .rdd
    .map(lambda row: (
        f"{row[0]}",
        f"user_{row[0]}",
        f"user_{row[0]}@databricks.com",
        states_dict[random.randrange(6)],
    ))
    .toDF(["uid", "login", "email", "user_state"])
)
#usersDF.show(6)

In [108]:
# Build ordersDF: one synthetic order per id. The user id is drawn at random
# from 0..10000, and state / item are drawn at random from the lookup dicts.
ordersDF = (
    spark.range(10_000_000)
    .rdd
    .map(lambda row: (
        row[0],
        row[0],
        random.randrange(10001),
        10 * row[0] * 0.2,
        states_dict[random.randrange(6)],
        items_dict[random.randrange(6)],
    ))
    .toDF(["transaction_id", "quantity", "users_id", "amount", "state", "items"])
)

In [109]:
# Inner-join orders to users on the user id. uid was built as a string, so the
# physical plan printed by explain() shows it being cast to bigint for the match.
usersOrdersDF = ordersDF.join(
    usersDF,
    on=ordersDF["users_id"] == usersDF["uid"],
)

In [110]:
# Print the physical plan. The recorded output shows a SortMergeJoin whose two
# inputs are each hash-partitioned (Exchange) and then sorted.
usersOrdersDF.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [users_id#1436L], [cast(uid#1424 as bigint)], Inner
   :- Sort [users_id#1436L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(users_id#1436L, 200), ENSURE_REQUIREMENTS, [plan_id=1135]
   :     +- Filter isnotnull(users_id#1436L)
   :        +- Scan ExistingRDD[transaction_id#1434L,quantity#1435L,users_id#1436L,amount#1437,state#1438,items#1439]
   +- Sort [cast(uid#1424 as bigint) ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(cast(uid#1424 as bigint), 200), ENSURE_REQUIREMENTS, [plan_id=1136]
         +- Filter isnotnull(uid#1424)
            +- Scan ExistingRDD[uid#1424,login#1425,email#1426,user_state#1427]




In [113]:
# Shut down the Spark session now that the demos are finished.
spark.stop()