In [None]:
// Scala
// Alias the session under a second name (this differs from the book) and
// import its implicits so that `.as[Flight]` below can find an encoder.
val spark2 = spark
import spark2.implicits._

// Typed representation of one row of the flight summary data.
case class Flight(
  DEST_COUNTRY_NAME: String,
  ORIGIN_COUNTRY_NAME: String,
  count: BigInt
)

// Untyped DataFrame read from the parquet directory, then the typed Dataset view of it.
val flightsDF = spark2.read.parquet("./data/flight-data/parquet/2010-summary.parquet/")
val flights = flightsDF.as[Flight]

In [None]:
// Typed Dataset operations: drop rows whose origin is Canada, pass each row
// through an identity map, then fetch the first five results.
flights
  .filter(_.ORIGIN_COUNTRY_NAME != "Canada")
  .map(row => row)
  .take(5)

In [None]:
// Fetch five rows first; the filter and map that follow then run on the
// fetched rows rather than on the Dataset. Rows not originating in Canada
// are kept and their count is increased by 5.
flights
  .take(5)
  .filter(_.ORIGIN_COUNTRY_NAME != "Canada")
  .map(row => row.copy(count = row.count + 5))

### 3.3 구조적 스트리밍

In [None]:
// Batch-read every daily retail CSV, treating the first line of each file as a
// header and letting Spark infer the column types.
val staticDataFrame = spark2.read
  .format("csv")
  .option("inferSchema", "true")
  .option("header", "true")
  .load("./data/retail-data/by-day/*.csv")

// Keep the inferred schema for later reuse and expose the data to SQL as `retail_data`.
val staticSchema = staticDataFrame.schema
staticDataFrame.createOrReplaceTempView("retail_data")

In [None]:
import org.apache.spark.sql.functions.{window, col}

// Total spend per customer per one-day window over the static data:
// compute the line cost, group by customer and day window, sum, and print.
staticDataFrame
  .selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
  .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
  .sum("total_cost")
  .show()

In [None]:
// Use 5 shuffle partitions for the aggregations below. The key must be spelled
// `spark.sql.shuffle.partitions` (plural); a misspelled key is stored but
// never read, so the default partition count would stay in effect.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [None]:
// Streaming read of the same daily CSV files, reusing the schema captured from
// the static read. `maxFilesPerTrigger` caps how many files are consumed per
// trigger. The option key must be exactly "header"; an unrecognized key is
// ignored, which would cause each file's header line to be read as a data row.
val streamingDataFrame = spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 20)
    .format("csv")
    .option("header", "true")
    .load("./data/retail-data/by-day/*.csv")

In [None]:
// Check whether this DataFrame is backed by a streaming source.
streamingDataFrame.isStreaming

In [None]:
// Streaming counterpart of the static aggregation: total spend per customer
// per window of InvoiceDate.
// NOTE(review): the name says "PerHour" but the window duration is "1 day".
val purchaseByCustomerPerHour = streamingDataFrame
  .selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
  .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
  .sum("total_cost")

In [None]:
// Start the streaming query with the in-memory sink under the query name
// `customer_purchases`, using "complete" output mode.
purchaseByCustomerPerHour.writeStream
  .queryName("customer_purchases")
  .outputMode("complete")
  .format("memory")
  .start()

In [None]:
// Query the `customer_purchases` table, largest totals first.
// The aggregate column is literally named `sum(total_cost)`, so it must be
// quoted with backticks; single quotes would make it a string literal and the
// ORDER BY would sort by a constant, i.e. not sort at all.
spark2.sql("""
    SELECT *
    FROM customer_purchases
    ORDER BY `sum(total_cost)` DESC
    """)

In [None]:
// Start a second streaming query over the same aggregation, this time with
// the console sink, under the query name `customer_purchases_2` and in
// "complete" output mode.
purchaseByCustomerPerHour.writeStream
  .queryName("customer_purchases_2")
  .outputMode("complete")
  .format("console")
  .start()