In [1]:
spark.stop()

25/05/18 07:13:08 INFO  SparkContext:60 SparkContext is stopping with exitCode 0.
25/05/18 07:13:08 INFO  SparkUI:60 Stopped Spark web UI at http://9fff83ac849a:4040
25/05/18 07:13:08 INFO  StandaloneSchedulerBackend:60 Shutting down all executors
25/05/18 07:13:08 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Asking each executor to shut down
25/05/18 07:13:08 INFO  MapOutputTrackerMasterEndpoint:60 MapOutputTrackerMasterEndpoint stopped!
25/05/18 07:13:08 INFO  MemoryStore:60 MemoryStore cleared
25/05/18 07:13:08 INFO  BlockManager:60 BlockManager stopped
25/05/18 07:13:08 INFO  BlockManagerMaster:60 BlockManagerMaster stopped
25/05/18 07:13:08 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:60 OutputCommitCoordinator stopped!
25/05/18 07:13:08 INFO  SparkContext:60 Successfully stopped SparkContext


In [2]:
from pyspark.sql.functions import explode, split, trim, lower, expr
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkConf
from pyspark import SparkContext
import logging
from os.path import abspath
from pathlib import Path
import shutil
from pathlib import Path
from typing import Optional, Union, List, Tuple, Any

In [3]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%y-%m-%d %H:%M:%S",
    level=logging.DEBUG
)

In [4]:
class InvoiceStream:
    def __init__(
        self,
        spark: SparkSession
    ):
        self.spark = spark

    def read_invoices(
        self,
        format: str,
        path: Union[str, Path],
        schema: Union[str, Any]
    ) -> DataFrame:
        if isinstance(path, str):
            path = Path(path).as_posix()
        return (self.spark.readStream
                .format(format)
                .schema(schema)
                .load(path))

    def explode_invoices(self, df: DataFrame) -> DataFrame:
        return (
            df.selectExpr(
                "InvoiceNumber",
                "CreatedTime",
                "StoreID",
                "PosID",
                "CustomerType",
                "PaymentMethod",
                "DeliveryType",
                "DeliveryAddress.City",
                "DeliveryAddress.PinCode",
                "DeliveryAddress.State",
                "explode(InvoiceLineItems) AS LineItem"
            )
        )

    def flatten_invoices(self, df: DataFrame) -> DataFrame:
        return (
            df.withColumn("ItemCode", expr("LineItem.ItemCode"))
            .withColumn("ItemDescription", expr("LineItem.ItemDescription"))
            .withColumn("ItemPrice", expr("LineItem.ItemPrice"))
            .withColumn("ItemQty", expr("LineItem.ItemQty"))
            .withColumn("TotalValue", expr("LineItem.TotalValue"))
            .drop("LineItem")
        )

    def write_invoices(
        self,
        df: DataFrame,
        format: str,
        checkpoint_location: str,
        output_mode: str,
        table: str
    ):
        return (
            df.writeStream
            .format(format)
            .option("checkpointLocation", checkpoint_location)
            .outputMode(output_mode)
            .toTable(table)
        )

25-05-18 07:13:13 - DEBUG - Command to send: m
d
o39
e

25-05-18 07:13:13 - DEBUG - Answer received: !yv
25-05-18 07:13:13 - DEBUG - Command to send: m
d
o40
e

25-05-18 07:13:13 - DEBUG - Answer received: !yv
25-05-18 07:13:13 - DEBUG - Command to send: m
d
o41
e

25-05-18 07:13:13 - DEBUG - Answer received: !yv
25-05-18 07:13:13 - DEBUG - Command to send: m
d
o42
e

25-05-18 07:13:13 - DEBUG - Answer received: !yv
25-05-18 07:13:13 - DEBUG - Command to send: m
d
o45
e

25-05-18 07:13:13 - DEBUG - Answer received: !yv
25-05-18 07:13:13 - DEBUG - Command to send: m
d
o46
e

25-05-18 07:13:13 - DEBUG - Answer received: !yv
25-05-18 07:13:13 - DEBUG - Command to send: m
d
o47
e

25-05-18 07:13:13 - DEBUG - Answer received: !yv


In [5]:
if __name__ == "__main__":
    table_name = "invoice_line_items"
    schema = """
        InvoiceNumber string,
        CreatedTime bigint,
        StoreID string,
        PosID string,
        CashierID string,
        CustomerType string,
        CustomerCardNo string,
        TotalAmount double,
        NumberOfItems bigint,
        PaymentMethod string,
        TaxableAmount double,
        CGST double,
        SGST double,
        CESS double,
        DeliveryType string,
        DeliveryAddress struct<
            AddressLine string,
            City string,
            ContactNumber string,
            PinCode string,
            State string
        >,
        InvoiceLineItems array<
            struct<
                ItemCode string,
                ItemDescription string,
                ItemPrice double,
                ItemQty bigint,
                TotalValue double
            >
        >
    """
    spark = (
        SparkSession.builder
        .appName("InvoicesStream")
        .enableHiveSupport()
        .getOrCreate()
    )
    invoices_stream = InvoiceStream(spark)
    invoices_df = invoices_stream.read_invoices(
        format="json",
        path="/opt/spark/datasets/invoices/*.json",
        schema=schema
    )
    exploded_df = invoices_stream.explode_invoices(invoices_df)
    flatten_df = invoices_stream.flatten_invoices(exploded_df)
    squery = invoices_stream.write_invoices(
        df=flatten_df,
        format="delta",
        checkpoint_location="/opt/spark/datasets/checkpoint/invoices",
        output_mode="append",
        table=table_name
    )

25-05-18 07:13:14 - DEBUG - Command to send: r
u
SparkConf
rj
e

25-05-18 07:13:14 - DEBUG - Answer received: !ycorg.apache.spark.SparkConf
25-05-18 07:13:14 - DEBUG - Command to send: i
org.apache.spark.SparkConf
bTrue
e

25-05-18 07:13:14 - DEBUG - Answer received: !yro48
25-05-18 07:13:14 - DEBUG - Command to send: c
o48
set
sspark.app.name
sInvoicesStream
e

25-05-18 07:13:14 - DEBUG - Answer received: !yro49
25-05-18 07:13:14 - DEBUG - Command to send: c
o48
set
sspark.sql.catalogImplementation
shive
e

25-05-18 07:13:14 - DEBUG - Answer received: !yro50
25-05-18 07:13:14 - DEBUG - Command to send: c
o48
get
sspark.executor.allowSparkContext
sfalse
e

25-05-18 07:13:14 - DEBUG - Answer received: !ysfalse
25-05-18 07:13:14 - DEBUG - Command to send: c
o48
contains
sspark.serializer.objectStreamReset
e

25-05-18 07:13:14 - DEBUG - Answer received: !ybfalse
25-05-18 07:13:14 - DEBUG - Command to send: c
o48
set
sspark.serializer.objectStreamReset
s100
e

25-05-18 07:13:14 - DEBUG - A

In [6]:
spark.read.table(table_name).show()

25-05-18 07:13:31 - DEBUG - Command to send: c
o85
read
e

25-05-18 07:13:31 - DEBUG - Answer received: !yro116
25-05-18 07:13:31 - DEBUG - Command to send: c
o116
table
sinvoice_line_items
e

25/05/18 07:13:31 INFO  HiveMetaStore:781 0: get_database: default
25/05/18 07:13:31 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_database: default	
25/05/18 07:13:31 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoice_line_items
25/05/18 07:13:31 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoice_line_items	
25/05/18 07:13:31 INFO  CodeGenerator:60 Code generated in 230.245101 ms
25/05/18 07:13:31 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoice_line_items
25/05/18 07:13:31 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoice_line_items	
25/05/18 07:13:31 INFO  SparkContext:60 Starting job: toTable at NativeMethodAccessorImpl.java:0
25/05/18 07:13:31 INFO  DAGScheduler:60 Job 1 finished: toTable at Native

+-------------+-------------+-------+------+------------+-------------+-------------+---------+-------+--------------+--------+-------------------+---------+-------+----------+
|InvoiceNumber|  CreatedTime|StoreID| PosID|CustomerType|PaymentMethod| DeliveryType|     City|PinCode|         State|ItemCode|    ItemDescription|ItemPrice|ItemQty|TotalValue|
+-------------+-------------+-------+------+------------+-------------+-------------+---------+-------+--------------+--------+-------------------+---------+-------+----------+
|     51402977|1595688900348|STR7188|POS956|       PRIME|         CARD|     TAKEAWAY|     NULL|   NULL|          NULL|     458|         Wine glass|   1644.0|      2|    3288.0|
|     51402977|1595688900348|STR7188|POS956|       PRIME|         CARD|     TAKEAWAY|     NULL|   NULL|          NULL|     283|     Portable Lamps|   2236.0|      1|    2236.0|
|     51402977|1595688900348|STR7188|POS956|       PRIME|         CARD|     TAKEAWAY|     NULL|   NULL|          NU

25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 21 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 18 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 16 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 17 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 16 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 19 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 17 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 17 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 18 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 27 ms to list leaf files for 4 paths.
25/05/18 07:13:39 INFO  InMemoryFileIndex:60 It took 18 ms to list leaf files for 4 paths.

In [7]:
df = spark.read.table(table_name)

25-05-18 07:14:33 - DEBUG - Command to send: c
o85
read
e

25-05-18 07:14:33 - DEBUG - Answer received: !yro118
25-05-18 07:14:33 - DEBUG - Command to send: c
o118
table
sinvoice_line_items
e

25/05/18 07:14:33 INFO  HiveMetaStore:781 0: get_database: default
25/05/18 07:14:33 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_database: default	
25/05/18 07:14:33 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoice_line_items
25/05/18 07:14:33 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoice_line_items	
25/05/18 07:14:33 INFO  InMemoryFileIndex:60 It took 23 ms to list leaf files for 4 paths.
25/05/18 07:14:33 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoice_line_items
25/05/18 07:14:33 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoice_line_items	
25/05/18 07:14:33 INFO  InMemoryFileIndex:60 It took 19 ms to list leaf files for 4 paths.
25-05-18 07:14:33 - DEBUG - Answer received: !yro119
25/05/18

In [9]:
df.count()

25-05-18 07:15:06 - DEBUG - Command to send: c
o119
count
e

25/05/18 07:15:06 INFO  InMemoryFileIndex:60 It took 16 ms to list leaf files for 4 paths.
25/05/18 07:15:06 INFO  InMemoryFileIndex:60 It took 17 ms to list leaf files for 4 paths.
25/05/18 07:15:06 INFO  DAGScheduler:60 Registering RDD 40 (count at NativeMethodAccessorImpl.java:0) as input to shuffle 3
25/05/18 07:15:06 INFO  DAGScheduler:60 Got map stage job 9 (count at NativeMethodAccessorImpl.java:0) with 50 output partitions
25/05/18 07:15:06 INFO  DAGScheduler:60 Final stage: ShuffleMapStage 16 (count at NativeMethodAccessorImpl.java:0)
25/05/18 07:15:06 INFO  DAGScheduler:60 Parents of final stage: List(ShuffleMapStage 15)
25/05/18 07:15:06 INFO  DAGScheduler:60 Missing parents: List()
25/05/18 07:15:06 INFO  DAGScheduler:60 Submitting ShuffleMapStage 16 (MapPartitionsRDD[40] at count at NativeMethodAccessorImpl.java:0), which has no missing parents
25/05/18 07:15:06 INFO  MemoryStore:60 Block broadcast_12 stored as v

4016

25/05/18 07:15:06 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:15:06 INFO  InMemoryFileIndex:60 It took 17 ms to list leaf files for 4 paths.
25/05/18 07:15:06 INFO  InMemoryFileIndex:60 It took 18 ms to list leaf files for 4 paths.
25/05/18 07:15:07 INFO  InMemoryFileIndex:60 It took 13 ms to list leaf files for 4 paths.
25/05/18 07:15:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:15:07 INFO  InMemoryFileIndex:60 It took 16 ms to list leaf files for 4 paths.
25/05/18 07:15:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:15:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:15:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:15:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:15:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.

In [10]:
spark.stop()

25-05-18 07:15:26 - DEBUG - Command to send: c
o74
stop
e

25/05/18 07:15:26 INFO  SparkContext:60 SparkContext is stopping with exitCode 0.
25/05/18 07:15:26 INFO  SparkUI:60 Stopped Spark web UI at http://9fff83ac849a:4040
25/05/18 07:15:26 INFO  StandaloneSchedulerBackend:60 Shutting down all executors
25/05/18 07:15:26 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Asking each executor to shut down
25/05/18 07:15:26 INFO  MapOutputTrackerMasterEndpoint:60 MapOutputTrackerMasterEndpoint stopped!
25/05/18 07:15:26 INFO  MemoryStore:60 MemoryStore cleared
25/05/18 07:15:26 INFO  BlockManager:60 BlockManager stopped
25/05/18 07:15:26 INFO  BlockManagerMaster:60 BlockManagerMaster stopped
25/05/18 07:15:26 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:60 OutputCommitCoordinator stopped!
25/05/18 07:15:26 INFO  SparkContext:60 Successfully stopped SparkContext
25-05-18 07:15:26 - DEBUG - Answer received: !yv
25/05/18 07:15:26 INFO  InMemoryFileIndex:60 It to