In [1]:
from pyspark.sql import SparkSession, Row
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster('local').setAppName('spark-sql-basic')
sc = SparkContext(conf = conf)

spark = SparkSession.builder.master("local").appName("spark-sql-basic").getOrCreate()

24/08/08 22:52:31 WARN Utils: Your hostname, MZC01-HYUCKSANGCHO.local resolves to a loopback address: 127.0.0.1; using 192.168.0.14 instead (on interface en0)
24/08/08 22:52:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/08 22:52:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### SQL Processing

In [2]:
acTransList = ["SB10001,1000", "SB10002,1200", "SB10003,8000", "SB10004,400", "SB10005,300", "SB10006,10000", "SB10007,500", "SB10008,56", "SB10009,30","SB10010,7000", "CR10001,7000", "SB10002,-10"]
acTransDF = sc.parallelize(acTransList).map(lambda trans : trans.split(',')).map(lambda t : Row(accNo=t[0], tranAmount=float(t[1]))).toDF()

                                                                                

In [3]:
acTransDF.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|    1000.0|
|SB10002|    1200.0|
|SB10003|    8000.0|
|SB10004|     400.0|
|SB10005|     300.0|
|SB10006|   10000.0|
|SB10007|     500.0|
|SB10008|      56.0|
|SB10009|      30.0|
|SB10010|    7000.0|
|CR10001|    7000.0|
|SB10002|     -10.0|
+-------+----------+



In [4]:
acTransDF.printSchema()

root
 |-- accNo: string (nullable = true)
 |-- tranAmount: double (nullable = true)



In [5]:
acTransDF.createOrReplaceTempView("trans")

In [6]:
goodTransRecords = spark.sql("SELECT accNo, tranAmount FROM trans WHERE accNo like 'SB%' AND tranAmount > 0")
goodTransRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|    1000.0|
|SB10002|    1200.0|
|SB10003|    8000.0|
|SB10004|     400.0|
|SB10005|     300.0|
|SB10006|   10000.0|
|SB10007|     500.0|
|SB10008|      56.0|
|SB10009|      30.0|
|SB10010|    7000.0|
+-------+----------+



In [7]:
goodTransRecords.createOrReplaceTempView("goodtrans")

In [8]:
highValueTransRecords = spark.sql("SELECT accNo, tranAmount FROM goodtrans WHERE tranAmount > 1000")
highValueTransRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10002|    1200.0|
|SB10003|    8000.0|
|SB10006|   10000.0|
|SB10010|    7000.0|
+-------+----------+



In [9]:
badAccountRecords = spark.sql("SELECT accNo, tranAmount FROM trans WHERE accNo NOT like 'SB%'")
badAccountRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|CR10001|    7000.0|
+-------+----------+



In [10]:
badAmountRecords = spark.sql("SELECT accNo, tranAmount FROM trans WHERE tranAmount < 0")
badAmountRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10002|     -10.0|
+-------+----------+



In [11]:
badTransRecords = badAccountRecords.union(badAmountRecords)
badTransRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|CR10001|    7000.0|
|SB10002|     -10.0|
+-------+----------+



In [12]:
sumAmount = spark.sql("SELECT sum(tranAmount)as sum FROM goodtrans")
sumAmount.show()

+-------+
|    sum|
+-------+
|28486.0|
+-------+



In [13]:
maxAmount = spark.sql("SELECT max(tranAmount) as max FROM goodtrans")
maxAmount.show()

+-------+
|    max|
+-------+
|10000.0|
+-------+



In [14]:
minAmount = spark.sql("SELECT min(tranAmount)as min FROM goodtrans")
minAmount.show()

+----+
| min|
+----+
|30.0|
+----+



In [15]:
goodAccNos = spark.sql("SELECT DISTINCT accNo FROM trans WHERE accNo like 'SB%' ORDER BY accNo")
goodAccNos.show()

+-------+
|  accNo|
+-------+
|SB10001|
|SB10002|
|SB10003|
|SB10004|
|SB10005|
|SB10006|
|SB10007|
|SB10008|
|SB10009|
|SB10010|
+-------+



### DataFrame to RDD Processing

In [16]:
goodTransRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|    1000.0|
|SB10002|    1200.0|
|SB10003|    8000.0|
|SB10004|     400.0|
|SB10005|     300.0|
|SB10006|   10000.0|
|SB10007|     500.0|
|SB10008|      56.0|
|SB10009|      30.0|
|SB10010|    7000.0|
+-------+----------+



In [17]:
sumAmountByMixing = goodTransRecords.rdd.map(lambda trans : trans.tranAmount).reduce(lambda a, b : a+b)
sumAmountByMixing

28486.0

In [20]:
maxAmountByMixing = goodTransRecords.rdd.map(lambda trans : trans.tranAmount).reduce(lambda a, b : a if a > b else b)
maxAmountByMixing

10000.0

In [22]:
minAmountByMixing = goodTransRecords.rdd.map(lambda trans : trans.tranAmount).reduce(lambda a, b : b if a>b else a)
minAmountByMixing

30.0

### DataFrame API

In [23]:
acTransDF.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|    1000.0|
|SB10002|    1200.0|
|SB10003|    8000.0|
|SB10004|     400.0|
|SB10005|     300.0|
|SB10006|   10000.0|
|SB10007|     500.0|
|SB10008|      56.0|
|SB10009|      30.0|
|SB10010|    7000.0|
|CR10001|    7000.0|
|SB10002|     -10.0|
+-------+----------+



In [24]:
acTransDF.printSchema()

root
 |-- accNo: string (nullable = true)
 |-- tranAmount: double (nullable = true)



In [29]:
goodTransRecords = acTransDF.filter("accNo like 'SB%'").filter("tranAmount>0")
goodTransRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|    1000.0|
|SB10002|    1200.0|
|SB10003|    8000.0|
|SB10004|     400.0|
|SB10005|     300.0|
|SB10006|   10000.0|
|SB10007|     500.0|
|SB10008|      56.0|
|SB10009|      30.0|
|SB10010|    7000.0|
+-------+----------+



In [30]:
highValueTransRecords = goodTransRecords.filter("tranAmount > 1000")
highValueTransRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10002|    1200.0|
|SB10003|    8000.0|
|SB10006|   10000.0|
|SB10010|    7000.0|
+-------+----------+



In [32]:
badAccountRecords = acTransDF.filter("accNo NOT like 'SB%'")
badAccountRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|CR10001|    7000.0|
+-------+----------+



In [33]:
badAmountRecords = acTransDF.filter("tranAmount<0")
badAmountRecords.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10002|     -10.0|
+-------+----------+



In [35]:
badAccountRecords.union(badAmountRecords).show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|CR10001|    7000.0|
|SB10002|     -10.0|
+-------+----------+



In [41]:
sumAmount = goodTransRecords.agg({"tranAmount":"sum"})
sumAmount.show()

+---------------+
|sum(tranAmount)|
+---------------+
|        28486.0|
+---------------+



In [43]:
goodTransRecords.agg({"tranAmount":"max"}).show()

+---------------+
|max(tranAmount)|
+---------------+
|        10000.0|
+---------------+



In [44]:
goodTransRecords.agg({"tranAmount":"min"}).show()

+---------------+
|min(tranAmount)|
+---------------+
|           30.0|
+---------------+



In [54]:
acTransDF.filter("accNo like 'SB%'").select("accNo").orderBy("accNo").show()

+-------+
|  accNo|
+-------+
|SB10001|
|SB10002|
|SB10002|
|SB10003|
|SB10004|
|SB10005|
|SB10006|
|SB10007|
|SB10008|
|SB10009|
|SB10010|
+-------+



In [55]:
acTransDF.write.parquet("python.trans.parquet")


                                                                                

In [56]:
acTransDFfromParquet = spark.read.parquet("python.trans.parquet")
acTransDFfromParquet.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|    1000.0|
|SB10002|    1200.0|
|SB10003|    8000.0|
|SB10004|     400.0|
|SB10005|     300.0|
|SB10006|   10000.0|
|SB10007|     500.0|
|SB10008|      56.0|
|SB10009|      30.0|
|SB10010|    7000.0|
|CR10001|    7000.0|
|SB10002|     -10.0|
+-------+----------+



### 스파크SQL 집계

In [64]:
from pyspark.sql import Row
acTransList = ["SB10001,1000", "SB10002,1200", "SB10001,8000","SB10002,400", "SB10003,300", "SB10001,10000","SB10004,500","SB10005,56","SB10003,30","SB10002,7000", "SB10001,-100","SB10002,-10"]
acTransDF = sc.parallelize(acTransList).map(lambda trans : trans.split(",")).map(lambda p : Row(accNo = p[0], tranAmount=float(p[1]))).toDF()
acTransDF.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|    1000.0|
|SB10002|    1200.0|
|SB10001|    8000.0|
|SB10002|     400.0|
|SB10003|     300.0|
|SB10001|   10000.0|
|SB10004|     500.0|
|SB10005|      56.0|
|SB10003|      30.0|
|SB10002|    7000.0|
|SB10001|    -100.0|
|SB10002|     -10.0|
+-------+----------+



In [65]:
acTransDF.createOrReplaceTempView("trans")

In [66]:
acSummary = spark.sql("SELECT accNo, sum(tranAmount) as transTotal FROM trans GROUP BY accNo")
acSummary.show()

+-------+----------+
|  accNo|transTotal|
+-------+----------+
|SB10005|      56.0|
|SB10004|     500.0|
|SB10003|     330.0|
|SB10002|    8590.0|
|SB10001|   18900.0|
+-------+----------+



In [74]:
acSummaryViaDFAPI = acTransDF.groupBy("accNo").agg({"tranAmount":"sum"}).selectExpr("accNo", "`sum(tranAmount)` as transTotal")
acSummaryViaDFAPI.show()

+-------+----------+
|  accNo|transTotal|
+-------+----------+
|SB10005|      56.0|
|SB10004|     500.0|
|SB10003|     330.0|
|SB10002|    8590.0|
|SB10001|   18900.0|
+-------+----------+



In [81]:
AcMaster = Row('accNo', 'firstName', 'lastName')
AcBal = Row('accNo', 'balanceAmount')

acMasterList = ["SB10001,Roger,Federer","SB10002,Pete,Sampras", "SB10003,Rafael,Nadal","SB10004,Boris,Becker", "SB10005,Ivan,Lendl"]
acBalList = ["SB10001,50000", "SB10002,12000","SB10003,3000", "SB10004,8500", "SB10005,5000"]


acMasterDF = sc.parallelize(acMasterList).map(lambda trans : trans.split(',')).map(lambda t : AcMaster(*t)).toDF()
acBalDF = sc.parallelize(acBalList).map(lambda trans: trans.split(",")).map(lambda r: AcBal(r[0], float(r[1]))).toDF()

In [82]:
acMasterDF.show()

+-------+---------+--------+
|  accNo|firstName|lastName|
+-------+---------+--------+
|SB10001|    Roger| Federer|
|SB10002|     Pete| Sampras|
|SB10003|   Rafael|   Nadal|
|SB10004|    Boris|  Becker|
|SB10005|     Ivan|   Lendl|
+-------+---------+--------+



In [83]:
acBalDF.show()

+-------+-------------+
|  accNo|balanceAmount|
+-------+-------------+
|SB10001|      50000.0|
|SB10002|      12000.0|
|SB10003|       3000.0|
|SB10004|       8500.0|
|SB10005|       5000.0|
+-------+-------------+



In [84]:
acMasterDF.write.parquet("python.master.parquet")
acBalDF.write.json("pythonMaster.json")


In [85]:
acMasterDFFromFile = spark.read.parquet("python.master.parquet")
acMasterDFFromFile.createOrReplaceTempView("master")

acBalDFFromFile = spark.read.json("pythonMaster.json")
acBalDFFromFile.createOrReplaceTempView("balance")

In [86]:
acMasterDFFromFile.show()

+-------+---------+--------+
|  accNo|firstName|lastName|
+-------+---------+--------+
|SB10001|    Roger| Federer|
|SB10002|     Pete| Sampras|
|SB10003|   Rafael|   Nadal|
|SB10004|    Boris|  Becker|
|SB10005|     Ivan|   Lendl|
+-------+---------+--------+



In [87]:
acBalDFFromFile.show()

+-------+-------------+
|  accNo|balanceAmount|
+-------+-------------+
|SB10001|      50000.0|
|SB10002|      12000.0|
|SB10003|       3000.0|
|SB10004|       8500.0|
|SB10005|       5000.0|
+-------+-------------+



In [88]:
acDetail = spark.sql("SELECT master.accNo, firstName, lastName, balanceAmount FROM master, balance WHERE master.accNo = balance.accNo ORDER BY balanceAmount DESC")

acDetail.show()

+-------+---------+--------+-------------+
|  accNo|firstName|lastName|balanceAmount|
+-------+---------+--------+-------------+
|SB10001|    Roger| Federer|      50000.0|
|SB10002|     Pete| Sampras|      12000.0|
|SB10004|    Boris|  Becker|       8500.0|
|SB10005|     Ivan|   Lendl|       5000.0|
|SB10003|   Rafael|   Nadal|       3000.0|
+-------+---------+--------+-------------+



In [89]:
acDetailFromAPI = acMasterDFFromFile.join(acBalDFFromFile, acMasterDFFromFile.accNo == acBalDFFromFile.accNo).sort(acBalDFFromFile.balanceAmount, ascending=False).select(acMasterDFFromFile.accNo, acMasterDFFromFile.firstName, acMasterDFFromFile.lastName, acBalDFFromFile.balanceAmount)

acDetailFromAPI.show()


+-------+---------+--------+-------------+
|  accNo|firstName|lastName|balanceAmount|
+-------+---------+--------+-------------+
|SB10001|    Roger| Federer|      50000.0|
|SB10002|     Pete| Sampras|      12000.0|
|SB10004|    Boris|  Becker|       8500.0|
|SB10005|     Ivan|   Lendl|       5000.0|
|SB10003|   Rafael|   Nadal|       3000.0|
+-------+---------+--------+-------------+



In [90]:
acDetailTop3 = spark.sql("SELECT master.accNo, firstName, lastName, balanceAmount FROM master, balance WHERE master.accNo = balance.accNo ORDER BY balanceAmount DESC").limit(3)

acDetailTop3.show()

+-------+---------+--------+-------------+
|  accNo|firstName|lastName|balanceAmount|
+-------+---------+--------+-------------+
|SB10001|    Roger| Federer|      50000.0|
|SB10002|     Pete| Sampras|      12000.0|
|SB10004|    Boris|  Becker|       8500.0|
+-------+---------+--------+-------------+



# Spark Catalog

In [92]:
catalog = spark.catalog
catalog.listDatabases()

[Database(name='default', catalog='spark_catalog', description='default database', locationUri='file:/Users/mzc01-hyucksangcho/Desktop/python/Spark/도서/초보자를%2520위한%2520아파치%2520스파크2/spark-warehouse')]

In [93]:
catalog.listTables()

[Table(name='balance', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='goodtrans', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='master', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='trans', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]