# Spark container 실행 방법

## 확인 사항
* volume path(docker-compose.yml)
    * mariadb
    * jupyter lab
* expose port(Dockerfile, docker-compose.yml)
    * 이미 사용하고 있는 port는 아닌지 확인

## docker-composer 실행 순서
* docker-composer - Dockerfile - scripts/entrypoint.sh
* 명령어
    * docker-compose up --build
    
## 주의 
* entrypoint.sh schema init 은 최소 1회만 실행 (최소 실행 후 주석처리)

# Spark Session

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import types as T

spark = (
    SparkSession
    .builder
    .appName("Last Chapter")
    .master("local[*]")
    .config("hive.metastore.uris", "thrift://0.0.0.0:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
spark.catalog.listDatabases()

[Database(name='default', description='Default Hive database', locationUri='file:/home/jovyan/work/spark-warehouse')]

# Structured Dataframe API

* ch04_data_transactions.txt
    * 구매 날짜, 시간, 고객 ID, 상품 ID, 구매 수량, 구매 금액

In [3]:
!head ../book-samples/ch04/ch04_data_transactions.txt 

2015-03-30#6:55 AM#51#68#1#9506.21
2015-03-30#7:39 PM#99#86#5#4107.59
2015-03-30#11:57 AM#79#58#7#2987.22
2015-03-30#12:46 AM#51#50#6#7501.89
2015-03-30#11:39 AM#86#24#5#8370.2
2015-03-30#10:35 AM#63#19#5#1023.57
2015-03-30#2:30 AM#23#77#7#5892.41
2015-03-30#7:41 PM#49#58#4#9298.18
2015-03-30#9:18 AM#97#86#8#9462.89
2015-03-30#10:06 PM#94#26#4#4199.15


In [4]:
transactions_schema = T.StructType([
    T.StructField("DATE", T.StringType(), True),
    T.StructField("TIME", T.StringType(), True),
    T.StructField("CUSTOMER_ID", T.StringType(), True),
    T.StructField("PRODUCT_ID", T.StringType(), True),
    T.StructField("QUANTITY", T.StringType(), True),
    T.StructField("AMOUNT", T.StringType(), True),
])

In [5]:
trans_df = spark.read.csv(
    "../book-samples/ch04/ch04_data_transactions.txt",
    sep="#",
    schema=transactions_schema
)

trans_df = trans_df.withColumn("DATE", F.to_date(F.col("DATE"), "yyyy-MM-dd"))
trans_df = trans_df.withColumn("DATETIME", F.concat(F.col("DATE"), F.lit(" "), F.col("TIME")))
trans_df = trans_df.withColumn("DATETIME", F.to_timestamp(F.col("DATETIME"), "yyyy-MM-dd H:mm a"))
print("the num of rows:", trans_df.count())

the num of rows: 1000


In [6]:
trans_df.show()

+----------+--------+-----------+----------+--------+-------+-------------------+
|      DATE|    TIME|CUSTOMER_ID|PRODUCT_ID|QUANTITY| AMOUNT|           DATETIME|
+----------+--------+-----------+----------+--------+-------+-------------------+
|2015-03-30| 6:55 AM|         51|        68|       1|9506.21|2015-03-30 06:55:00|
|2015-03-30| 7:39 PM|         99|        86|       5|4107.59|               null|
|2015-03-30|11:57 AM|         79|        58|       7|2987.22|2015-03-30 11:57:00|
|2015-03-30|12:46 AM|         51|        50|       6|7501.89|               null|
|2015-03-30|11:39 AM|         86|        24|       5| 8370.2|2015-03-30 11:39:00|
|2015-03-30|10:35 AM|         63|        19|       5|1023.57|2015-03-30 10:35:00|
|2015-03-30| 2:30 AM|         23|        77|       7|5892.41|2015-03-30 02:30:00|
|2015-03-30| 7:41 PM|         49|        58|       4|9298.18|               null|
|2015-03-30| 9:18 AM|         97|        86|       8|9462.89|2015-03-30 09:18:00|
|2015-03-30|10:0

* ch04_data_product.txt
    * 구매 날짜, 시간, 고객 ID, 상품 ID, 구매 수량, 구매 금액

In [7]:
!head ../book-samples/ch04/ch04_data_products.txt

1#ROBITUSSIN PEAK COLD NIGHTTIME COLD PLUS FLU#9721.89#10
2#Mattel Little Mommy Doctor Doll#6060.78#6
3#Cute baby doll, battery#1808.79#2
4#Bear doll#51.06#6
5#LEGO Legends of Chima#849.36#6
6#LEGO Castle#4777.51#10
7#LEGO Mixels#8720.91#1
8#LEGO Star Wars#7592.44#4
9#LEGO Lord of the Rings#851.67#2
10#LEGO The Hobbit#7314.55#9


In [8]:
prod_schema = T.StructType([
    T.StructField("PRODUCT_ID", T.StringType(), True),
    T.StructField("PRODUCT_NAME", T.StringType(), True),
    T.StructField("PRICE", T.StringType(), True),
    T.StructField("INDEX", T.StringType(), True),
])

In [9]:
prod_df = spark.read.csv(
    "../book-samples/ch04/ch04_data_products.txt",
    sep="#",
    schema=prod_schema
)

prod_df.show(25)

+----------+--------------------+-------+-----+
|PRODUCT_ID|        PRODUCT_NAME|  PRICE|INDEX|
+----------+--------------------+-------+-----+
|         1|ROBITUSSIN PEAK C...|9721.89|   10|
|         2|Mattel Little Mom...|6060.78|    6|
|         3|Cute baby doll, b...|1808.79|    2|
|         4|           Bear doll|  51.06|    6|
|         5|LEGO Legends of C...| 849.36|    6|
|         6|         LEGO Castle|4777.51|   10|
|         7|         LEGO Mixels|8720.91|    1|
|         8|      LEGO Star Wars|7592.44|    4|
|         9|LEGO Lord of the ...| 851.67|    2|
|        10|     LEGO The Hobbit|7314.55|    9|
|        11|      LEGO Minecraft|5646.81|    3|
|        12|   LEGO Hero Factory| 6911.2|    1|
|        13|   LEGO Architecture| 604.58|    5|
|        14|        LEGO Technic|7423.48|    3|
|        15|LEGO Storage & Ac...|3125.96|    2|
|        16|        LEGO Classic| 9933.3|   10|
|        17|   LEGO Galaxy Squad|5593.16|    4|
|        18|     LEGO Mindstorms|6022.88

# 데이터 분석
* 구매 횟수가 가장 많은 고객
* 바비 놀이세트(ID 25) 를 2개 이상 구매한 경우 5% 할인
* 사전을 다섯 권 이상 구매한 고객
* 가장 많은 금액을 지출한 고객
* 어제 판매한 상품 이름과 각 상품별 매출액 합계
* 어제 판매하지 않은 상품 목록
* 전일 판매 실적 통계: 고객별 평균, 최저 가격, 최고 가격, 구매 금액 합계

* 구매 횟수가 가장 많은 고객

In [10]:
(
    trans_df.groupby("CUSTOMER_ID")
    .count()
    .sort(F.desc("count"))
).show()

+-----------+-----+
|CUSTOMER_ID|count|
+-----------+-----+
|         53|   19|
|         51|   18|
|         56|   17|
|          2|   15|
|         76|   15|
|         31|   14|
|         50|   14|
|         32|   14|
|         34|   14|
|         21|   13|
|         79|   13|
|         91|   13|
|         23|   13|
|         47|   13|
|          3|   13|
|         82|   13|
|         58|   13|
|         55|   13|
|         17|   13|
|         41|   12|
+-----------+-----+
only showing top 20 rows



* 바비 놀이세트(ID 25) 를 2개 이상 구매한 경우 5% 할인

In [11]:
discount_condition = (F.col("QUANTITY") > 1) & (F.col("PRODUCT_ID") == 25)

trans_df = trans_df.withColumn("DISCOUNT", F.when(discount_condition, F.col("AMOUNT")*0.95).otherwise(F.col("AMOUNT")))
trans_df.where(F.col("PRODUCT_ID") == 25).show()

+----------+--------+-----------+----------+--------+-------+-------------------+-----------------+
|      DATE|    TIME|CUSTOMER_ID|PRODUCT_ID|QUANTITY| AMOUNT|           DATETIME|         DISCOUNT|
+----------+--------+-----------+----------+--------+-------+-------------------+-----------------+
|2015-03-30| 5:55 AM|         25|        25|       1|5089.02|2015-03-30 05:55:00|          5089.02|
|2015-03-30| 6:26 PM|         17|        25|       6|7193.11|               null|6833.454499999999|
|2015-03-30| 7:27 AM|         93|        25|       7|2749.15|2015-03-30 07:27:00|        2611.6925|
|2015-03-30| 5:34 AM|        100|        25|       1|7520.96|2015-03-30 05:34:00|          7520.96|
|2015-03-30| 1:07 AM|         68|        25|       9|8391.61|2015-03-30 01:07:00|7972.029500000001|
|2015-03-30| 1:23 AM|         59|        25|       5|5296.69|2015-03-30 01:23:00|        5031.8555|
|2015-03-30| 9:45 AM|         42|        25|      10|1363.97|2015-03-30 09:45:00|        1295.7715|


* 사전을 다섯 권 이상 구매한 고객

In [12]:
prod_df.where(F.lower(F.col("PRODUCT_NAME")).like("%dic%")).show() # ID는 81번

+----------+------------+-----+-----+
|PRODUCT_ID|PRODUCT_NAME|PRICE|INDEX|
+----------+------------+-----+-----+
|        81|  Dictionary|29.65|    4|
+----------+------------+-----+-----+



In [13]:
(
    trans_df.join(prod_df, "PRODUCT_ID", "left")
    .where(F.col("PRODUCT_NAME") == "Dictionary")
    .where(F.col("QUANTITY") >= 5)
    .select("CUSTOMER_ID")
    .distinct()
).show()

+-----------+
|CUSTOMER_ID|
+-----------+
|         85|
|         16|
|         47|
|         77|
|         82|
|         10|
+-----------+



* 가장 많은 금액을 지불한 고객

In [14]:
(
    trans_df.groupby("CUSTOMER_ID")
    .agg(
        F.sum(F.col("AMOUNT")).alias("AMOUNT_SUM"),
        F.last(F.col("AMOUNT")).alias("AMOUNT_LAST")
    )
    .sort(F.desc("AMOUNT_SUM"))
    .select(F.first("CUSTOMER_ID"), F.max("AMOUNT_SUM"))
).show()

+------------------+------------------+
|first(CUSTOMER_ID)|   max(AMOUNT_SUM)|
+------------------+------------------+
|                76|100049.00000000001|
+------------------+------------------+



* 어제 판매한 상품 이름과 각 상품별 매출액 합계

In [15]:
(
    trans_df.join(prod_df, "PRODUCT_ID", "left")
).count()

1000

In [16]:
(
    trans_df.join(prod_df, "PRODUCT_ID", "left")
    .groupby("PRODUCT_NAME")
    .agg(
        F.sum("AMOUNT").alias("AMOUNT")
    )
).show()

+--------------------+------------------+
|        PRODUCT_NAME|            AMOUNT|
+--------------------+------------------+
|          Gabapentin|          51227.19|
|    LEGO Minifigures| 46486.00000000001|
|   CUPRUM METALLICUM|          34805.73|
|      LEGO Minecraft| 64525.33999999999|
|ATOPALM MUSCLE AN...|          31049.07|
|Far Cry 4 Limited...| 82055.45999999999|
|healthy accents s...|          46411.33|
|Brimonidine Tartrate|59283.189999999995|
|Treatment Set TS3...|          40571.76|
|AMBROSIA TRIFIDA ...|          48601.89|
|Star Wars Republi...|          41679.19|
|Essentials Dantes...| 37302.96000000001|
|           Acyclovir|          26047.72|
|              Ativan|          31498.84|
|              Grippe|25839.770000000004|
|     LEGO The Hobbit|47085.590000000004|
| LEGO Jurassic World|            947.76|
|Santalia Clinical...|          49245.05|
|           Alphanate|           50917.7|
|Essentials Medal ...|20736.739999999998|
+--------------------+------------

* 어제 판매하지 않은 상품 목록

In [17]:
(
    prod_df.join(trans_df, "PRODUCT_ID", "leftanti")
).count()

4

In [19]:
trans_df.where(F.col("PRODUCT_ID") == 20).show()

+----+----+-----------+----------+--------+------+--------+--------+
|DATE|TIME|CUSTOMER_ID|PRODUCT_ID|QUANTITY|AMOUNT|DATETIME|DISCOUNT|
+----+----+-----------+----------+--------+------+--------+--------+
+----+----+-----------+----------+--------+------+--------+--------+



* 판매실적 통계

In [27]:
(
    trans_df.join(prod_df, "PRODUCT_ID", "left")
    .groupby("CUSTOMER_ID", "PRODUCT_NAME")
    .agg(
        F.last(F.col("PRICE")).alias("PRICE"),
        F.sum(F.col("AMOUNT")).alias("AMOUNT_BY_PRODUCT"),
    )
    .groupby("CUSTOMER_ID")
    .agg(
        F.avg("PRICE"),
        F.max("PRICE"),
        F.min("PRICE"),
        F.sum("AMOUNT_BY_PRODUCT")
    )
).show(10, False)

+-----------+------------------+----------+----------+----------------------+
|CUSTOMER_ID|avg(PRICE)        |max(PRICE)|min(PRICE)|sum(AMOUNT_BY_PRODUCT)|
+-----------+------------------+----------+----------+----------------------+
|51         |4973.745          |9721.89   |1305.04   |83312.11999999998     |
|7          |4592.275          |8720.91   |3003.77   |50079.82              |
|15         |4067.605555555556 |9933.3    |2531.15   |55853.28              |
|54         |5975.045714285714 |8875.2    |2626.88   |36307.04000000001     |
|11         |5729.09875        |8693.64   |2531.15   |37276.88              |
|29         |5384.812222222222 |8486.42   |2531.15   |31389.32              |
|69         |5150.442857142858 |7443.91   |2509.1    |26220.12              |
|42         |2719.5399999999995|711.88    |3003.77   |30491.92              |
|73         |5818.978571428571 |7314.55   |4171.55   |31892.579999999998    |
|87         |4304.635          |7907.21   |1368.53   |52329.6000

In [56]:
from pyspark.sql.window import Window

win_spec  = Window.partitionBy("CUSTOMER_ID").orderBy(F.desc("PRICE")).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

(
    trans_df.join(prod_df, "PRODUCT_ID", "left")
    .withColumn("MAX_PRICE_PROD", F.max("PRICE").over(win_spec))
    .withColumn("MIN_PRICE_PROD", F.min("PRICE").over(win_spec))
    .where(
        (F.col("PRICE") == F.col("MAX_PRICE_PROD")) | (F.col("PRICE") == F.col("MIN_PRICE_PROD"))
    )
    .sort("CUSTOMER_ID")
).show(100, False)

+----------+----------+--------+-----------+--------+-------+-------------------+--------+--------------------------------------------+-------+-----+--------------+--------------+
|PRODUCT_ID|DATE      |TIME    |CUSTOMER_ID|QUANTITY|AMOUNT |DATETIME           |DISCOUNT|PRODUCT_NAME                                |PRICE  |INDEX|MAX_PRICE_PROD|MIN_PRICE_PROD|
+----------+----------+--------+-----------+--------+-------+-------------------+--------+--------------------------------------------+-------+-----+--------------+--------------+
|57        |2015-03-30|12:12 PM|1          |2       |3614.79|2015-03-30 12:12:00|3614.79 |Notebook Lenovo U430p, 59-390459            |2626.88|2    |9721.89       |2626.88       |
|1         |2015-03-30|6:52 PM |1          |1       |8832.77|null               |8832.77 |ROBITUSSIN PEAK COLD NIGHTTIME COLD PLUS FLU|9721.89|10   |9721.89       |2626.88       |
|81        |2015-03-30|2:54 PM |10         |10      |9897.61|null               |9897.61 |Dictionary

In [41]:
trans_df.groupby("CUSTOMER_ID").count().show()

+-----------+-----+
|CUSTOMER_ID|count|
+-----------+-----+
|         51|   18|
|          7|   10|
|         54|    7|
|         15|   10|
|         11|    8|
|         69|    7|
|         29|    9|
|         42|    7|
|         87|   10|
|         73|    7|
|         64|   10|
|          3|   13|
|         30|    5|
|         34|   14|
|         59|    9|
|          8|   10|
|         28|   11|
|         22|   10|
|         85|    9|
|         35|   10|
+-----------+-----+
only showing top 20 rows

