In [7]:
spark.stop()

25/05/18 05:07:52 INFO  SparkContext:60 SparkContext is stopping with exitCode 0.
25/05/18 05:07:52 INFO  SparkUI:60 Stopped Spark web UI at http://9fff83ac849a:4040
25/05/18 05:07:52 INFO  StandaloneSchedulerBackend:60 Shutting down all executors
25/05/18 05:07:52 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Asking each executor to shut down
25/05/18 05:07:52 INFO  MapOutputTrackerMasterEndpoint:60 MapOutputTrackerMasterEndpoint stopped!
25/05/18 05:07:52 INFO  MemoryStore:60 MemoryStore cleared
25/05/18 05:07:52 INFO  BlockManager:60 BlockManager stopped
25/05/18 05:07:52 INFO  BlockManagerMaster:60 BlockManagerMaster stopped
25/05/18 05:07:52 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:60 OutputCommitCoordinator stopped!
25/05/18 05:07:52 INFO  SparkContext:60 Successfully stopped SparkContext


In [8]:
from pyspark.sql.functions import explode, split, trim, lower
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkConf
from pyspark import SparkContext
import logging
from os.path import abspath
from pathlib import Path
import shutil
from delta import *

In [9]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%y-%m-%d %H:%M:%S",
    level=logging.DEBUG
)

In [10]:
class BatchWordCount:
    def __init__(
        self,
        spark: SparkSession
    ):
        self.spark = spark

    def read_text(
        self,
        path: str,
        format: str = "text",
        line_sep: str = "."
    ) -> DataFrame:
        lines = (
            self.spark.read
            .format(format)
            .option("lineSep", line_sep)
            .load(path)
        )
        raw_sdf = lines.select(explode(split(lines.value, " ")).alias("word"))
        return raw_sdf

    def process_text(
        self,
        raw_sdf: DataFrame
    ) -> DataFrame:
        processed_sdf = (
            raw_sdf.select(lower(trim(raw_sdf.word)).alias("word"))
            .where("word is not null")
            .where("word rlike '[a-z]'")
        )
        return processed_sdf

    def count_words(
        self,
        processed_sdf: DataFrame
    ) -> DataFrame:
        sdf = processed_sdf.groupBy("word").count()
        return sdf

    def write_table(
        self,
        sdf: DataFrame,
        format: str,
        mode: str,
        table_name: str
    ):
        (
            sdf.write
            .format(format)
            .mode(mode)
            .saveAsTable(table_name)
        )


In [11]:
class StreamWordCount:
    def __init__(
        self,
        spark: SparkSession
    ):
        self.spark = spark

    def read_text(
        self,
        path: str,
        format: str = "text",
        line_sep: str = "."
    ):
        lines = (
            self.spark.readStream
            .format(format)
            .option("lineSep", line_sep)
            .load(path)
        )
        raw_sdf = lines.select(explode(split(lines.value, " ")).alias("word"))
        return raw_sdf

    def process_text(
        self,
        raw_sdf: DataFrame
    ) -> DataFrame:
        processed_sdf = (
            raw_sdf.select(lower(trim(raw_sdf.word)).alias("word"))
            .where("word is not null")
            .where("word rlike '[a-z]'")
        )
        return processed_sdf

    def count_words(
        self,
        processed_sdf: DataFrame
    ) -> DataFrame:
        sdf = processed_sdf.groupBy("word").count()
        return sdf

    def write_table(
        self,
        sdf: DataFrame,
        format: str,
        output_mode: str,
        table_name: str,
        checkpoint_location: str
    ):
        squery = (
            sdf.writeStream
            .format(format)
            .option("truncate", value=False)
            .option("checkpointLocation", checkpoint_location)
            .outputMode(output_mode)
            .toTable(table_name)
            .start()
            .awaitTermination()
        )
        return squery

In [7]:
spark = (
    SparkSession.builder
    .appName("batching_word_count")
    .enableHiveSupport()
    .getOrCreate()
)

25-05-18 05:00:33 - DEBUG - Command to send: r
u
SparkConf
rj
e

25-05-18 05:00:33 - DEBUG - Answer received: !ycorg.apache.spark.SparkConf
25-05-18 05:00:33 - DEBUG - Command to send: i
org.apache.spark.SparkConf
bTrue
e

25-05-18 05:00:33 - DEBUG - Answer received: !yro48
25-05-18 05:00:33 - DEBUG - Command to send: c
o48
set
sspark.app.name
sbatching_word_count
e

25-05-18 05:00:33 - DEBUG - Answer received: !yro49
25-05-18 05:00:33 - DEBUG - Command to send: c
o48
set
sspark.sql.catalogImplementation
shive
e

25-05-18 05:00:33 - DEBUG - Answer received: !yro50
25-05-18 05:00:33 - DEBUG - Command to send: c
o48
get
sspark.executor.allowSparkContext
sfalse
e

25-05-18 05:00:33 - DEBUG - Answer received: !ysfalse
25-05-18 05:00:33 - DEBUG - Command to send: c
o48
contains
sspark.serializer.objectStreamReset
e

25-05-18 05:00:33 - DEBUG - Answer received: !ybfalse
25-05-18 05:00:33 - DEBUG - Command to send: c
o48
set
sspark.serializer.objectStreamReset
s100
e

25-05-18 05:00:33 - DEBU

In [8]:
batch = BatchWordCount(spark)
raw_sdf = batch.read_text(path="/opt/spark/datasets/text/*.txt")
processed_sdf = batch.process_text(raw_sdf)
sdf = batch.count_words(processed_sdf)
batch.write_table(
    sdf,
    format="delta",
    mode="overwrite",
    table_name="batching_word_count_table"
)


25-05-18 05:00:39 - DEBUG - Command to send: c
o85
read
e

25/05/18 05:00:39 INFO  SharedState:60 Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
25/05/18 05:00:39 INFO  SharedState:60 Warehouse path is 'file:/opt/spark/warehouse'.
25-05-18 05:00:40 - DEBUG - Answer received: !yro86
25-05-18 05:00:40 - DEBUG - Command to send: c
o86
format
stext
e

25-05-18 05:00:40 - DEBUG - Answer received: !yro87
25-05-18 05:00:40 - DEBUG - Command to send: c
o87
option
slineSep
s.
e

25-05-18 05:00:40 - DEBUG - Answer received: !yro88
25-05-18 05:00:40 - DEBUG - Command to send: c
o88
load
s/opt/spark/datasets/text/*.txt
e

25/05/18 05:00:40 INFO  InMemoryFileIndex:60 It took 50 ms to list leaf files for 3 paths.
25-05-18 05:00:41 - DEBUG - Answer received: !yro89
25-05-18 05:00:41 - DEBUG - Command to send: c
o89
schema
e

25-05-18 05:00:41 - DEBUG - Answer received: !yro90
25-05-18 05:00:41 - DEBUG - Command to send: c
o90
json
e

25-05-18 05:00:41 - DEBUG -

In [9]:
spark.read.table("batching_word_count_table").show()

25-05-18 05:00:56 - DEBUG - Command to send: c
o85
read
e

25-05-18 05:00:56 - DEBUG - Answer received: !yro116
25-05-18 05:00:56 - DEBUG - Command to send: c
o116
table
sbatching_word_count_table
e

25/05/18 05:00:56 INFO  HiveMetaStore:781 0: get_database: default
25/05/18 05:00:56 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_database: default	
25/05/18 05:00:56 INFO  HiveMetaStore:781 0: get_table : db=default tbl=batching_word_count_table
25/05/18 05:00:56 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=batching_word_count_table	
25/05/18 05:00:56 INFO  HiveMetaStore:781 0: get_table : db=default tbl=batching_word_count_table
25/05/18 05:00:56 INFO  audit:309 ugi=root	ip=unknown-ip-addr	cmd=get_table : db=default tbl=batching_word_count_table	
25-05-18 05:00:56 - DEBUG - Answer received: !yro117
25-05-18 05:00:56 - DEBUG - Command to send: c
o117
showString
i20
i20
bFalse
e

25/05/18 05:00:56 INFO  PrepareDeltaScan:95 DELTA: Filtering files for que

+-----------+-----+
|       word|count|
+-----------+-----+
|     first,|    1|
|    explain|    1|
|      using|    2|
|        you|    5|
|    discuss|    1|
|   concepts|    1|
|       apis|    1|
|      query|    1|
|    example|    1|
|         in|    4|
|programming|    1|
|       with|    2|
|      count|    1|
|      model|    2|
|  streaming|    9|
|      later|    1|
|micro-batch|    2|
|      going|    2|
|      let’s|    1|
|        the|   15|
+-----------+-----+
only showing top 20 rows



In [10]:
spark.stop()

25-05-18 05:01:17 - DEBUG - Command to send: c
o74
stop
e

25/05/18 05:01:17 INFO  SparkContext:60 SparkContext is stopping with exitCode 0.
25/05/18 05:01:17 INFO  SparkUI:60 Stopped Spark web UI at http://9fff83ac849a:4040
25/05/18 05:01:17 INFO  StandaloneSchedulerBackend:60 Shutting down all executors
25/05/18 05:01:17 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Asking each executor to shut down
25/05/18 05:01:17 INFO  MapOutputTrackerMasterEndpoint:60 MapOutputTrackerMasterEndpoint stopped!
25/05/18 05:01:17 INFO  MemoryStore:60 MemoryStore cleared
25/05/18 05:01:17 INFO  BlockManager:60 BlockManager stopped
25/05/18 05:01:17 INFO  BlockManagerMaster:60 BlockManagerMaster stopped
25/05/18 05:01:17 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:60 OutputCommitCoordinator stopped!
25/05/18 05:01:17 INFO  SparkContext:60 Successfully stopped SparkContext
25-05-18 05:01:17 - DEBUG - Answer received: !yv
25-05-18 05:01:18 - DEBUG - Command to send: r
u
S

In [12]:
spark = (
    SparkSession.builder
    .appName("streaming_word_count")
    .enableHiveSupport()
    .getOrCreate()
)

25-05-18 05:08:31 - DEBUG - Command to send: r
u
SparkConf
rj
e

25-05-18 05:08:31 - DEBUG - Answer received: !ycorg.apache.spark.SparkConf
25-05-18 05:08:31 - DEBUG - Command to send: i
org.apache.spark.SparkConf
bTrue
e

25-05-18 05:08:31 - DEBUG - Answer received: !yro90
25-05-18 05:08:31 - DEBUG - Command to send: c
o90
set
sspark.app.name
sstreaming_word_count
e

25-05-18 05:08:31 - DEBUG - Answer received: !yro91
25-05-18 05:08:31 - DEBUG - Command to send: c
o90
set
sspark.sql.catalogImplementation
shive
e

25-05-18 05:08:31 - DEBUG - Answer received: !yro92
25-05-18 05:08:31 - DEBUG - Command to send: c
o90
get
sspark.executor.allowSparkContext
sfalse
e

25-05-18 05:08:31 - DEBUG - Answer received: !ysfalse
25-05-18 05:08:31 - DEBUG - Command to send: c
o90
contains
sspark.serializer.objectStreamReset
e

25-05-18 05:08:31 - DEBUG - Answer received: !ybfalse
25-05-18 05:08:31 - DEBUG - Command to send: c
o90
set
sspark.serializer.objectStreamReset
s100
e

25-05-18 05:08:31 - DEB

In [13]:
stream = StreamWordCount(spark)
raw_sdf = stream.read_text(path="/opt/spark/datasets/text/*.txt")
processed_sdf = stream.process_text(raw_sdf)
sdf = stream.count_words(processed_sdf)
squery = stream.write_table(
    sdf,
    format="delta",
    output_mode="complete",
    table_name="streaming_word_count_table",
    checkpoint_location="/opt/spark/datasets/checkpoint/word_count"
)


25-05-18 05:08:38 - DEBUG - Command to send: c
o127
readStream
e

25/05/18 05:08:38 INFO  SharedState:60 Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
25/05/18 05:08:38 INFO  SharedState:60 Warehouse path is 'file:/opt/spark/warehouse'.
25-05-18 05:08:38 - DEBUG - Answer received: !yro128
25-05-18 05:08:38 - DEBUG - Command to send: c
o128
format
stext
e

25-05-18 05:08:38 - DEBUG - Answer received: !yro129
25-05-18 05:08:38 - DEBUG - Command to send: c
o129
option
slineSep
s.
e

25-05-18 05:08:38 - DEBUG - Answer received: !yro130
25-05-18 05:08:38 - DEBUG - Command to send: c
o130
load
s/opt/spark/datasets/text/*.txt
e

25/05/18 05:08:38 INFO  InMemoryFileIndex:60 It took 25 ms to list leaf files for 3 paths.
25-05-18 05:08:38 - DEBUG - Answer received: !yro131
25-05-18 05:08:38 - DEBUG - Command to send: c
o131
schema
e

25-05-18 05:08:38 - DEBUG - Answer received: !yro132
25-05-18 05:08:38 - DEBUG - Command to send: c
o132
json
e

25-05-18 0

AttributeError: 'StreamingQuery' object has no attribute 'start'

25/05/18 05:08:39 INFO  FileStreamSourceLog:60 Set the compact interval to 10 [defaultCompactInterval: 10]
25/05/18 05:08:39 INFO  FileStreamSourceLog:60 BatchIds found from listing: 
25/05/18 05:08:39 INFO  FileStreamSourceLog:60 BatchIds found from listing: 
25/05/18 05:08:39 INFO  FileStreamSource:60 maxFilesPerBatch = None, maxFileAgeMs = 604800000
25/05/18 05:08:39 INFO  MicroBatchExecution:60 Using Source [FileStreamSource[file:/opt/spark/datasets/text/*.txt]] from DataSourceV1 named 'FileSource[/opt/spark/datasets/text/*.txt]' [DataSource(org.apache.spark.sql.SparkSession@71681217,text,List(),None,List(),None,Map(lineSep -> ., path -> /opt/spark/datasets/text/*.txt),None)]
25/05/18 05:08:39 INFO  OffsetSeqLog:60 BatchIds found from listing: 
25/05/18 05:08:39 INFO  OffsetSeqLog:60 BatchIds found from listing: 
25/05/18 05:08:39 INFO  MicroBatchExecution:60 Starting new streaming query.
25/05/18 05:08:39 INFO  MicroBatchExecution:60 Stream started from {}
25/05/18 05:08:39 INFO  

In [1]:
spark.read.table("streaming_word_count_table").show()

25/05/18 05:09:28 INFO  SharedState:60 Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
25/05/18 05:09:28 INFO  SharedState:60 Warehouse path is 'file:/opt/spark/warehouse'.
25/05/18 05:09:29 INFO  HiveUtils:60 Initializing HiveMetastoreConnection version 2.3.9 using Spark classes.
25/05/18 05:09:29 INFO  HiveClientImpl:60 Warehouse location for Hive client (version 2.3.9) is file:/opt/spark/warehouse
25/05/18 05:09:30 WARN  HiveConf:4122 HiveConf of name hive.stats.jdbc.timeout does not exist
25/05/18 05:09:30 WARN  HiveConf:4122 HiveConf of name hive.stats.retries.wait does not exist
25/05/18 05:09:30 INFO  HiveMetaStore:614 0: Opening raw store with implementation class:org.apache.hadoop.hive.metastore.ObjectStore
25/05/18 05:09:30 INFO  ObjectStore:403 ObjectStore, initialize called
25/05/18 05:09:30 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Registered executor NettyRpcEndpointRef(spark-client://Executor) (172.18.0.4:33050) w

+---------------+-----+
|           word|count|
+---------------+-----+
|         stream|    2|
|fault-tolerance|    2|
|            low|    2|
|           then|    1|
|        default|    1|
|         scala,|    1|
|     event-time|    1|
|        explain|    1|
|        achieve|    1|
|         engine|    4|
|          since|    1|
|     processing|    6|
|        engine,|    1|
|         python|    1|
|          count|    1|
|           care|    1|
|            the|   15|
|           mode|    2|
|           logs|    1|
|           user|    1|
+---------------+-----+
only showing top 20 rows



25/05/18 05:09:54 INFO  CodeGenerator:60 Code generated in 5.02242 ms


In [2]:
spark.stop()

25/05/18 05:10:02 INFO  SparkContext:60 SparkContext is stopping with exitCode 0.
25/05/18 05:10:02 INFO  SparkUI:60 Stopped Spark web UI at http://9fff83ac849a:4040
25/05/18 05:10:02 INFO  StandaloneSchedulerBackend:60 Shutting down all executors
25/05/18 05:10:02 INFO  StandaloneSchedulerBackend$StandaloneDriverEndpoint:60 Asking each executor to shut down
25/05/18 05:10:02 INFO  MapOutputTrackerMasterEndpoint:60 MapOutputTrackerMasterEndpoint stopped!
25/05/18 05:10:02 INFO  MemoryStore:60 MemoryStore cleared
25/05/18 05:10:02 INFO  BlockManager:60 BlockManager stopped
25/05/18 05:10:02 INFO  BlockManagerMaster:60 BlockManagerMaster stopped
25/05/18 05:10:02 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:60 OutputCommitCoordinator stopped!
25/05/18 05:10:02 INFO  SparkContext:60 Successfully stopped SparkContext
