In [1]:
spark.stop()

25/05/18 07:53:49 INFO  SparkContext:60 SparkContext is stopping with exitCode 0.
25/05/18 07:53:49 INFO  SparkUI:60 Stopped Spark web UI at http://163d156376d2:4040
25/05/18 07:53:49 INFO  StandaloneSchedulerBackend:60 Shutting down all executors
25/05/18 07:53:49 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Asking each executor to shut down
25/05/18 07:53:49 INFO  MapOutputTrackerMasterEndpoint:60 MapOutputTrackerMasterEndpoint stopped!
25/05/18 07:53:49 INFO  MemoryStore:60 MemoryStore cleared
25/05/18 07:53:49 INFO  BlockManager:60 BlockManager stopped
25/05/18 07:53:49 INFO  BlockManagerMaster:60 BlockManagerMaster stopped
25/05/18 07:53:49 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:60 OutputCommitCoordinator stopped!
25/05/18 07:53:49 INFO  SparkContext:60 Successfully stopped SparkContext


In [2]:
from pyspark.sql.functions import explode, split, trim, lower, expr
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkConf
from pyspark import SparkContext
import logging
from os.path import abspath
from pathlib import Path
import shutil
from pathlib import Path
from typing import Optional, Union, List, Tuple, Any, Literal

In [3]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%y-%m-%d %H:%M:%S",
    level=logging.DEBUG
)

In [4]:
class InvoiceStreamBronze:
    def __init__(
        self,
        spark: SparkSession
    ):
        self.spark = spark

    def read_invoices(
        self,
        format: str,
        path: Union[str, Path],
        schema: Union[str, Any],
        clean_source: Literal["delete", "archive"],
        archive_dir: Optional[str],
    ) -> DataFrame:
        if isinstance(path, str):
            path = Path(path).as_posix()
        if clean_source == "archive":
            return (
                self.spark.readStream
                .format(format)
                .schema(schema)
                .option("cleanSource", "archive")
                .option("sourceArchiveDir", archive_dir)
                .load(path)
            )
        return (
            self.spark.readStream
            .format(format)
            .schema(schema)
            .option("cleanSource", clean_source)
            .load(path)
        )

    def write_invoices(
        self,
        df: DataFrame,
        format: str,
        checkpoint_location: str,
        output_mode: str,
        table: str,
        query_name: str
    ):
        return (
            df.writeStream
            .queryName(query_name)
            .format(format)
            .option("checkpointLocation", checkpoint_location)
            .outputMode(output_mode)
            .toTable(table)
        )

In [5]:
class InvoiceStreamSilver:
    def __init__(
        self,
        spark: SparkSession
    ):
        self.spark = spark

    def read_invoices(self, table_name: str) -> DataFrame:
        return (
            self.spark.readStream
            .table(table_name)
        )

    def explode_invoices(self, df: DataFrame) -> DataFrame:
        return (
            df.selectExpr(
                "InvoiceNumber",
                "CreatedTime",
                "StoreID",
                "PosID",
                "CustomerType",
                "PaymentMethod",
                "DeliveryType",
                "DeliveryAddress.City",
                "DeliveryAddress.PinCode",
                "DeliveryAddress.State",
                "explode(InvoiceLineItems) AS LineItem"
            )
        )

    def flatten_invoices(self, df: DataFrame) -> DataFrame:
        return (
            df.withColumn("ItemCode", expr("LineItem.ItemCode"))
            .withColumn("ItemDescription", expr("LineItem.ItemDescription"))
            .withColumn("ItemPrice", expr("LineItem.ItemPrice"))
            .withColumn("ItemQty", expr("LineItem.ItemQty"))
            .withColumn("TotalValue", expr("LineItem.TotalValue"))
            .drop("LineItem")
        )

    def write_invoices(
        self,
        df: DataFrame,
        format: str,
        checkpoint_location: str,
        output_mode: str,
        table: str,
        query_name: str,
    ):
        return (
            df.writeStream
            .queryName(query_name)
            .format(format)
            .option("checkpointLocation", checkpoint_location)
            .outputMode(output_mode)
            .toTable(table)
        )

In [6]:
if __name__ == "__main__":
    table_name = "invoice_line_items"
    schema = """
        InvoiceNumber string,
        CreatedTime bigint,
        StoreID string,
        PosID string,
        CashierID string,
        CustomerType string,
        CustomerCardNo string,
        TotalAmount double,
        NumberOfItems bigint,
        PaymentMethod string,
        TaxableAmount double,
        CGST double,
        SGST double,
        CESS double,
        DeliveryType string,
        DeliveryAddress struct<
            AddressLine string,
            City string,
            ContactNumber string,
            PinCode string,
            State string
        >,
        InvoiceLineItems array<
            struct<
                ItemCode string,
                ItemDescription string,
                ItemPrice double,
                ItemQty bigint,
                TotalValue double
            >
        >
    """
    spark = (
        SparkSession.builder
        .appName("InvoicesStreamMedallion")
        .enableHiveSupport()
        .getOrCreate()
    )
    invoices_bronze = InvoiceStreamBronze(spark)
    invoices_df = invoices_bronze.read_invoices(
        format="json",
        path="/opt/spark/datasets/invoices/*.json",
        schema=schema,
        clean_source="archive",
        archive_dir="/opt/spark/datasets/archive/invoices"
    )
    squery_bronze = invoices_bronze.write_invoices(
        df=invoices_df,
        format="delta",
        checkpoint_location="/opt/spark/datasets/checkpoint/invoices/bronze",
        output_mode="append",
        table="invoices_bronze",
        query_name="ingestion_bronze"
    )

    invoices_silver = InvoiceStreamSilver(spark)
    exploded_df = invoices_silver.explode_invoices(invoices_df)
    flatten_df = invoices_silver.flatten_invoices(exploded_df)
    squery_silver = invoices_silver.write_invoices(
        df=flatten_df,
        format="delta",
        checkpoint_location="/opt/spark/datasets/checkpoint/invoices/silver",
        output_mode="append",
        table="invoices_silver",
        query_name="ingestion_silver",
    )

25-05-18 07:54:00 - DEBUG - Command to send: r
u
SparkConf
rj
e

25-05-18 07:54:00 - DEBUG - Answer received: !ycorg.apache.spark.SparkConf
25-05-18 07:54:00 - DEBUG - Command to send: i
org.apache.spark.SparkConf
bTrue
e

25-05-18 07:54:00 - DEBUG - Answer received: !yro48
25-05-18 07:54:00 - DEBUG - Command to send: c
o48
set
sspark.app.name
sInvoicesStreamMedallion
e

25-05-18 07:54:00 - DEBUG - Answer received: !yro49
25-05-18 07:54:00 - DEBUG - Command to send: c
o48
set
sspark.sql.catalogImplementation
shive
e

25-05-18 07:54:00 - DEBUG - Answer received: !yro50
25-05-18 07:54:00 - DEBUG - Command to send: c
o48
get
sspark.executor.allowSparkContext
sfalse
e

25-05-18 07:54:00 - DEBUG - Answer received: !ysfalse
25-05-18 07:54:00 - DEBUG - Command to send: c
o48
contains
sspark.serializer.objectStreamReset
e

25-05-18 07:54:00 - DEBUG - Answer received: !ybfalse
25-05-18 07:54:00 - DEBUG - Command to send: c
o48
set
sspark.serializer.objectStreamReset
s100
e

25-05-18 07:54:00 - 

In [7]:
df = spark.read.table("invoices_bronze")

25-05-18 07:54:51 - DEBUG - Command to send: c
o85
read
e

25-05-18 07:54:51 - DEBUG - Answer received: !yro125
25-05-18 07:54:51 - DEBUG - Command to send: c
o125
table
sinvoices_bronze
e

25/05/18 07:54:51 INFO  HiveMetaStore:781 0: get_database: default
25/05/18 07:54:51 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_database: default	
25/05/18 07:54:51 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoices_bronze
25/05/18 07:54:51 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoices_bronze	
25/05/18 07:54:51 INFO  InMemoryFileIndex:60 It took 6 ms to list leaf files for 4 paths.
25/05/18 07:54:51 INFO  InMemoryFileIndex:60 It took 6 ms to list leaf files for 4 paths.
25/05/18 07:54:51 INFO  HiveMetaStore:781 0: get_table : db=default tbl=invoices_bronze
25/05/18 07:54:51 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=invoices_bronze	
25/05/18 07:54:51 INFO  InMemoryFileIndex:60 It took 5 ms to list leaf files f

In [8]:
df.count()

25-05-18 07:55:01 - DEBUG - Command to send: c
o126
count
e

25/05/18 07:55:01 INFO  InMemoryFileIndex:60 It took 6 ms to list leaf files for 4 paths.
25/05/18 07:55:01 INFO  InMemoryFileIndex:60 It took 16 ms to list leaf files for 4 paths.
25/05/18 07:55:01 INFO  InMemoryFileIndex:60 It took 7 ms to list leaf files for 4 paths.
25/05/18 07:55:01 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:55:01 INFO  MemoryStore:60 Block broadcast_4 stored as values in memory (estimated size 201.7 KiB, free 433.7 MiB)
25/05/18 07:55:01 INFO  MemoryStore:60 Block broadcast_4_piece0 stored as bytes in memory (estimated size 34.9 KiB, free 433.7 MiB)
25/05/18 07:55:01 INFO  BlockManagerInfo:60 Added broadcast_4_piece0 in memory on 163d156376d2:36329 (size: 34.9 KiB, free: 434.3 MiB)
25/05/18 07:55:01 INFO  SparkContext:60 Created broadcast 4 from count at NativeMethodAccessorImpl.java:0
25/05/18 07:55:01 INFO  InMemoryFileIndex:60 It took 14 ms to list leaf files

1601

25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 19 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 8 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 20 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 14 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 17 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 17 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 14 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 13 ms to list leaf files for 4 paths.
25/05/18 07:55:07 INFO  InMemoryFileIndex:60 It took 15 ms to list leaf files for 4 paths.
