In [1]:
// Scala
val spark2 = spark // 책과는 다르게 
import spark2.implicits._

case class Flight(DEST_COUNTRY_NAME: String,
                  ORIGIN_COUNTRY_NAME: String,
                  count: BigInt)
val flightsDF = spark2.read
  .parquet("./data/flight-data/parquet/2010-summary.parquet/")
val flights = flightsDF.as[Flight]

spark2 = org.apache.spark.sql.SparkSession@8d2f28a
defined class Flight
flightsDF = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]
flights = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [2]:
flights
    .filter(flight_row => flight_row.ORIGIN_COUNTRY_NAME != "Canada")
    .map(flight_row => flight_row)
    .take(5)

Array(Flight(United States,Romania,1), Flight(United States,Ireland,264), Flight(United States,India,69), Flight(Egypt,United States,24), Flight(Equatorial Guinea,United States,1))

In [3]:
flights
    .take(5)
    .filter(flight_row => flight_row.ORIGIN_COUNTRY_NAME != "Canada")
    .map(fr => Flight(fr.DEST_COUNTRY_NAME, fr.ORIGIN_COUNTRY_NAME, fr.count + 5))

Array(Flight(United States,Romania,6), Flight(United States,Ireland,269), Flight(United States,India,74), Flight(Egypt,United States,29), Flight(Equatorial Guinea,United States,6))

### 3.3 구조적 스트리밍

In [4]:
val staticDataFrame = spark2.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("./data/retail-data/by-day/*.csv")

staticDataFrame.createOrReplaceTempView("retail_data")
val staticSchema = staticDataFrame.schema

staticDataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]
staticSchema = StructType(StructField(InvoiceNo,StringType,true), StructField(StockCode,StringType,true), StructField(Description,StringType,true), StructField(Quantity,IntegerType,true), StructField(InvoiceDate,TimestampType,true), StructField(UnitPrice,DoubleType,true), StructField(CustomerID,DoubleType,true), StructField(Country,StringType,true))


StructType(StructField(InvoiceNo,StringType,true), StructField(StockCode,StringType,true), StructField(Description,StringType,true), StructField(Quantity,IntegerType,true), StructField(InvoiceDate,TimestampType,true), StructField(UnitPrice,DoubleType,true), StructField(CustomerID,DoubleType,true), StructField(Country,StringType,true))

In [5]:
import org.apache.spark.sql.functions.{window, col}

staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate")
    .groupBy(
        col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .show()

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   16057.0|[2011-12-05 00:00...|             -37.6|
|   14126.0|[2011-11-29 00:00...| 643.6300000000001|
|   13500.0|[2011-11-16 00:00...| 497.9700000000001|
|   17160.0|[2011-11-08 00:00...| 516.8499999999999|
|   15608.0|[2011-11-11 00:00...|             122.4|
|   15253.0|[2011-11-23 00:00...|             277.6|
|   15124.0|[2011-11-17 00:00...|             93.44|
|   12539.0|[2011-11-17 00:00...|           1050.66|
|   13658.0|[2011-11-30 00:00...| 542.4000000000001|
|   17396.0|[2011-10-31 00:00...|             495.0|
|   13576.0|[2011-11-10 00:00...| 543.3600000000001|
|   15111.0|[2011-11-10 00:00...|329.67999999999995|
|   17419.0|[2011-10-06 00:00...|465.54999999999995|
|   15749.0|[2011-04-18 00:00...|-1462.500000000001|
|   15769.0|[2011-04-18 00:00...|122.03999999999999|
|   18219.0|[2011-04-18 00:00...|            2

In [6]:
spark.conf.set("spark.sql.shuffle.partition", "5")

In [7]:
val streamingDataFrame = spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 20)
    .format("csv")
    .option("headeer", "true")
    .load("./data/retail-data/by-day/*.csv")

streamingDataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]


[InvoiceNo: string, StockCode: string ... 6 more fields]

In [8]:
streamingDataFrame.isStreaming

true

In [9]:
val purchaseByCustomerPerHour = streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate")
    .groupBy(
        col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")

purchaseByCustomerPerHour = [CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]


[CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]

In [10]:
purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()

org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5c037a0c

In [11]:
spark2.sql("""
    SELECT *
    FROM customer_purchases
    ORDER BY 'sum(total_cost)' DESC
    """)

[CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]

In [12]:
purchaseByCustomerPerHour.writeStream
    .format("console")
    .queryName("customer_purchases_2")
    .outputMode("complete")
    .start()

lastException: Throwable = null


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@1c155d12

-------------------------------------------
Batch: 0
-------------------------------------------
+----------+--------------------+-------------------+
|CustomerId|              window|    sum(total_cost)|
+----------+--------------------+-------------------+
|   12423.0|[2011-03-21 00:00...|             347.35|
|   16609.0|[2011-04-01 00:00...|  557.5299999999997|
|   13050.0|[2010-12-14 00:00...|             292.42|
|   17262.0|[2010-12-08 00:00...|             268.86|
|   15671.0|[2011-03-14 00:00...|             356.19|
|   17612.0|[2011-02-18 00:00...|              -5.95|
|   17894.0|[2010-12-08 00:00...|             106.77|
|   13092.0|[2010-12-15 00:00...|             185.45|
|   16814.0|[2011-01-18 00:00...| 193.09000000000003|
|   13758.0|[2010-12-08 00:00...|              356.4|
|      null|[2011-03-09 00:00...| 1692.9699999999993|
|   16162.0|[2011-04-01 00:00...|               37.4|
|   12649.0|[2010-12-09 00:00...|-19.799999999999997|
|   18118.0|[2011-02-07 00:00...|      

-------------------------------------------
Batch: 6
-------------------------------------------
+----------+--------------------+-------------------+
|CustomerId|              window|    sum(total_cost)|
+----------+--------------------+-------------------+
|   12423.0|[2011-03-21 00:00...|             347.35|
|   15290.0|[2011-03-29 00:00...|             280.85|
|   17731.0|[2011-09-21 00:00...|             276.37|
|   18287.0|[2011-05-22 00:00...|             765.28|
|   12720.0|[2011-05-04 00:00...|              154.4|
|   17460.0|[2010-12-01 00:00...|               19.9|
|   13657.0|[2011-05-22 00:00...| 152.33999999999997|
|   16255.0|[2011-07-31 00:00...|             297.32|
|   17795.0|[2011-03-20 00:00...| 163.24999999999997|
|   16700.0|[2011-05-05 00:00...|             378.24|
|      null|[2011-05-13 00:00...|  4864.120000000005|
|   14541.0|[2011-01-24 00:00...| 163.04000000000002|
|   13500.0|[2011-11-16 00:00...|  497.9700000000001|
|   15696.0|[2011-09-28 00:00...|      

-------------------------------------------
Batch: 12
-------------------------------------------
+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   18287.0|[2011-05-22 00:00...|            765.28|
|   17368.0|[2011-09-07 00:00...|              20.8|
|   13003.0|[2011-05-09 00:00...|59.050000000000004|
|   12720.0|[2011-05-04 00:00...|             154.4|
|   17460.0|[2010-12-01 00:00...|              19.9|
|   13657.0|[2011-05-22 00:00...|152.33999999999997|
|   16255.0|[2011-07-31 00:00...|            297.32|
|   14527.0|[2011-05-17 00:00...|             -9.84|
|   17795.0|[2011-03-20 00:00...|163.24999999999997|
|   16700.0|[2011-05-05 00:00...|            378.24|
|      null|[2011-05-13 00:00...| 4864.120000000005|
|   14541.0|[2011-01-24 00:00...|163.04000000000002|
|   13500.0|[2011-11-16 00:00...| 497.9700000000001|
|   14688.0|[2011-10-18 00:00...|            205.17|
|