In [1]:
spark.stop()

25/05/18 07:46:58 INFO  SparkContext:60 SparkContext is stopping with exitCode 0.
25/05/18 07:46:58 INFO  SparkUI:60 Stopped Spark web UI at http://163d156376d2:4040
25/05/18 07:46:58 INFO  StandaloneSchedulerBackend:60 Shutting down all executors
25/05/18 07:46:58 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Asking each executor to shut down
25/05/18 07:46:58 INFO  MapOutputTrackerMasterEndpoint:60 MapOutputTrackerMasterEndpoint stopped!
25/05/18 07:46:58 INFO  MemoryStore:60 MemoryStore cleared
25/05/18 07:46:58 INFO  BlockManager:60 BlockManager stopped
25/05/18 07:46:58 INFO  BlockManagerMaster:60 BlockManagerMaster stopped
25/05/18 07:46:58 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:60 OutputCommitCoordinator stopped!
25/05/18 07:46:58 INFO  SparkContext:60 Successfully stopped SparkContext


In [2]:
from pyspark.sql.functions import explode, split, trim, lower, expr
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkConf
from pyspark import SparkContext
import logging
from os.path import abspath
from pathlib import Path
import shutil
from pathlib import Path
from typing import Optional, Union, List, Tuple, Any

In [3]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%y-%m-%d %H:%M:%S",
    level=logging.DEBUG
)

In [4]:
class InvoiceStreamBatch:
    def __init__(
        self,
        spark: SparkSession
    ):
        self.spark = spark

    def read_invoices(
        self,
        format: str,
        path: Union[str, Path],
        schema: Union[str, Any]
    ) -> DataFrame:
        if isinstance(path, str):
            path = Path(path).as_posix()
        return (self.spark.readStream
                .format(format)
                .schema(schema)
                .load(path))

    def explode_invoices(self, df: DataFrame) -> DataFrame:
        return (
            df.selectExpr(
                "InvoiceNumber",
                "CreatedTime",
                "StoreID",
                "PosID",
                "CustomerType",
                "PaymentMethod",
                "DeliveryType",
                "DeliveryAddress.City",
                "DeliveryAddress.PinCode",
                "DeliveryAddress.State",
                "explode(InvoiceLineItems) AS LineItem"
            )
        )

    def flatten_invoices(self, df: DataFrame) -> DataFrame:
        return (
            df.withColumn("ItemCode", expr("LineItem.ItemCode"))
            .withColumn("ItemDescription", expr("LineItem.ItemDescription"))
            .withColumn("ItemPrice", expr("LineItem.ItemPrice"))
            .withColumn("ItemQty", expr("LineItem.ItemQty"))
            .withColumn("TotalValue", expr("LineItem.TotalValue"))
            .drop("LineItem")
        )

    def write_invoices(
        self,
        df: DataFrame,
        format: str,
        checkpoint_location: str,
        output_mode: str,
        table: str,
        trigger: str,
        max_files_per_trigger: int = 1
    ):
        squery = (
            df.writeStream
            .format(format)
            .option("checkpointLocation", checkpoint_location)
            .outputMode(output_mode)
            .option("maxFilesPerTrigger", max_files_per_trigger)
        )
        if trigger == "batch":
            return squery.trigger(availableNow=True).toTable(table)
        else:
            return squery.trigger(processingTime=trigger).toTable(table)

In [5]:
if __name__ == "__main__":
    table_name = "invoice_line_items"
    schema = """
        InvoiceNumber string,
        CreatedTime bigint,
        StoreID string,
        PosID string,
        CashierID string,
        CustomerType string,
        CustomerCardNo string,
        TotalAmount double,
        NumberOfItems bigint,
        PaymentMethod string,
        TaxableAmount double,
        CGST double,
        SGST double,
        CESS double,
        DeliveryType string,
        DeliveryAddress struct<
            AddressLine string,
            City string,
            ContactNumber string,
            PinCode string,
            State string
        >,
        InvoiceLineItems array<
            struct<
                ItemCode string,
                ItemDescription string,
                ItemPrice double,
                ItemQty bigint,
                TotalValue double
            >
        >
    """
    spark = (
        SparkSession.builder
        .appName("InvoicesStream")
        .enableHiveSupport()
        .getOrCreate()
    )
    invoices_stream = InvoiceStreamBatch(spark)
    invoices_df = invoices_stream.read_invoices(
        format="json",
        path="/opt/spark/datasets/invoices/*.json",
        schema=schema
    )
    exploded_df = invoices_stream.explode_invoices(invoices_df)
    flatten_df = invoices_stream.flatten_invoices(exploded_df)
    squery = invoices_stream.write_invoices(
        df=flatten_df,
        format="delta",
        checkpoint_location="/opt/spark/datasets/checkpoint/invoices",
        output_mode="append",
        table=table_name,
        trigger="batch"
    )

25-05-18 07:47:47 - DEBUG - Command to send: r
u
SparkConf
rj
e

25-05-18 07:47:47 - DEBUG - Answer received: !ycorg.apache.spark.SparkConf
25-05-18 07:47:47 - DEBUG - Command to send: i
org.apache.spark.SparkConf
bTrue
e

25-05-18 07:47:47 - DEBUG - Answer received: !yro48
25-05-18 07:47:47 - DEBUG - Command to send: c
o48
set
sspark.app.name
sInvoicesStream
e

25-05-18 07:47:47 - DEBUG - Answer received: !yro49
25-05-18 07:47:47 - DEBUG - Command to send: c
o48
set
sspark.sql.catalogImplementation
shive
e

25-05-18 07:47:47 - DEBUG - Answer received: !yro50
25-05-18 07:47:47 - DEBUG - Command to send: c
o48
get
sspark.executor.allowSparkContext
sfalse
e

25-05-18 07:47:47 - DEBUG - Answer received: !ysfalse
25-05-18 07:47:47 - DEBUG - Command to send: c
o48
contains
sspark.serializer.objectStreamReset
e

25-05-18 07:47:47 - DEBUG - Answer received: !ybfalse
25-05-18 07:47:47 - DEBUG - Command to send: c
o48
set
sspark.serializer.objectStreamReset
s100
e

25-05-18 07:47:47 - DEBUG - A

In [6]:
spark.read.table(table_name).show()

25-05-18 07:48:03 - DEBUG - Command to send: c
o85
read
e

25-05-18 07:48:03 - DEBUG - Answer received: !yro119
25-05-18 07:48:03 - DEBUG - Command to send: c
o119
table
sinvoice_line_items
e

25/05/18 07:48:03 INFO  HiveMetaStore:781 0: get_database: default
25/05/18 07:48:03 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_database: default	
25/05/18 07:48:03 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoice_line_items
25/05/18 07:48:03 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoice_line_items	
25/05/18 07:48:03 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoice_line_items
25/05/18 07:48:03 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoice_line_items	
25-05-18 07:48:03 - DEBUG - Answer received: !yro120
25-05-18 07:48:03 - DEBUG - Command to send: c
o120
showString
i20
i20
bFalse
e

25/05/18 07:48:03 INFO  PrepareDeltaScan:95 DELTA: Filtering files for query
25/05/18 07:48:03 INFO  MemorySt

+-------------+-------------+-------+------+------------+-------------+-------------+---------+-------+--------------+--------+-------------------+---------+-------+----------+
|InvoiceNumber|  CreatedTime|StoreID| PosID|CustomerType|PaymentMethod| DeliveryType|     City|PinCode|         State|ItemCode|    ItemDescription|ItemPrice|ItemQty|TotalValue|
+-------------+-------------+-------+------+------------+-------------+-------------+---------+-------+--------------+--------+-------------------+---------+-------+----------+
|     51402977|1595688900348|STR7188|POS956|       PRIME|         CARD|     TAKEAWAY|     NULL|   NULL|          NULL|     458|         Wine glass|   1644.0|      2|    3288.0|
|     51402977|1595688900348|STR7188|POS956|       PRIME|         CARD|     TAKEAWAY|     NULL|   NULL|          NULL|     283|     Portable Lamps|   2236.0|      1|    2236.0|
|     51402977|1595688900348|STR7188|POS956|       PRIME|         CARD|     TAKEAWAY|     NULL|   NULL|          NU

In [7]:
df = spark.read.table(table_name)

25-05-18 07:48:49 - DEBUG - Command to send: c
o85
read
e

25-05-18 07:48:49 - DEBUG - Answer received: !yro121
25-05-18 07:48:49 - DEBUG - Command to send: c
o121
table
sinvoice_line_items
e

25/05/18 07:48:49 INFO  HiveMetaStore:781 0: get_database: default
25/05/18 07:48:49 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_database: default	
25/05/18 07:48:49 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoice_line_items
25/05/18 07:48:49 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoice_line_items	
25/05/18 07:48:49 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoice_line_items
25/05/18 07:48:49 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoice_line_items	
25-05-18 07:48:49 - DEBUG - Answer received: !yro122


In [8]:
df.count()

25-05-18 07:49:02 - DEBUG - Command to send: c
o122
count
e

25/05/18 07:49:02 INFO  CodeGenerator:60 Code generated in 15.966991 ms
25/05/18 07:49:02 INFO  DAGScheduler:60 Registering RDD 28 (count at NativeMethodAccessorImpl.java:0) as input to shuffle 2
25/05/18 07:49:02 INFO  DAGScheduler:60 Got map stage job 5 (count at NativeMethodAccessorImpl.java:0) with 50 output partitions
25/05/18 07:49:02 INFO  DAGScheduler:60 Final stage: ShuffleMapStage 10 (count at NativeMethodAccessorImpl.java:0)
25/05/18 07:49:02 INFO  DAGScheduler:60 Parents of final stage: List(ShuffleMapStage 9)
25/05/18 07:49:02 INFO  DAGScheduler:60 Missing parents: List()
25/05/18 07:49:02 INFO  DAGScheduler:60 Submitting ShuffleMapStage 10 (MapPartitionsRDD[28] at count at NativeMethodAccessorImpl.java:0), which has no missing parents
25/05/18 07:49:02 INFO  MemoryStore:60 Block broadcast_8 stored as values in memory (estimated size 711.5 KiB, free 432.3 MiB)
25/05/18 07:49:02 INFO  MemoryStore:60 Block broadcas

4016

In [9]:
spark.stop()

25-05-18 07:49:10 - DEBUG - Command to send: c
o74
stop
e

25/05/18 07:49:10 INFO  SparkContext:60 SparkContext is stopping with exitCode 0.
25/05/18 07:49:10 INFO  SparkUI:60 Stopped Spark web UI at http://163d156376d2:4040
25/05/18 07:49:10 INFO  StandaloneSchedulerBackend:60 Shutting down all executors
25/05/18 07:49:10 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Asking each executor to shut down
25/05/18 07:49:10 INFO  MapOutputTrackerMasterEndpoint:60 MapOutputTrackerMasterEndpoint stopped!
25/05/18 07:49:10 INFO  MemoryStore:60 MemoryStore cleared
25/05/18 07:49:10 INFO  BlockManager:60 BlockManager stopped
25/05/18 07:49:10 INFO  BlockManagerMaster:60 BlockManagerMaster stopped
25/05/18 07:49:10 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:60 OutputCommitCoordinator stopped!
25/05/18 07:49:10 INFO  SparkContext:60 Successfully stopped SparkContext
25-05-18 07:49:10 - DEBUG - Answer received: !yv
25-05-18 07:49:10 - DEBUG - Command to send: r
u
S