# Batch vs. Streaming

In [1]:
## Let's first use an API to get the data
import os
import json
import requests
import time

url = "https://api.alternative.me/v2/ticker/bitcoin/?convert=USD" 

In [2]:
def initiate_json(url):
    """Create (or overwrite) ``data.json`` with one snapshot fetched from *url*.

    Args:
        url: Endpoint that returns a JSON document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
        ValueError: If the response body is not valid JSON.
    """
    # Use the session as a context manager so its connections are released.
    with requests.Session() as sess:
        # Without a timeout a stalled server would block the caller forever.
        response = sess.get(url, timeout=30)
        # Fail loudly rather than storing an error payload as data.
        response.raise_for_status()
        payload = response.json()

    # Open the file only after the payload is parsed, so a failed request
    # does not leave a truncated, empty file behind.
    with open('data.json', 'w') as f:
        json.dump(payload, f)

In [3]:
def get_stream(url):
    """Fetch one snapshot from *url* and append it to the array in ``data.json``.

    ``data.json`` must already exist. If it currently holds a single JSON
    value rather than a list, that value becomes the first element of the
    list before the new snapshot is appended.

    Args:
        url: Endpoint that returns a JSON document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
        ValueError: If the response body or ``data.json`` is not valid JSON.
        FileNotFoundError: If ``data.json`` does not exist yet.
    """
    # Finish the network round trip first, so the connection is not held open
    # during file I/O and a failed request leaves the file untouched.
    with requests.Session() as sess:
        # Without a timeout a stalled server would block the caller forever.
        response = sess.get(url, timeout=30)
        # Fail loudly rather than appending an error payload to the data.
        response.raise_for_status()
        snapshot = response.json()

    with open('data.json', 'r') as f:
        data = json.load(f)

    # The very first file written holds a bare object; wrap it in a list.
    if not isinstance(data, list):
        data = [data]
    data.append(snapshot)

    with open('data.json', 'w') as f:
        json.dump(data, f)

These two functions allow us to create a JSON file (initializing it with a first snapshot) and then append a new snapshot to it every 10 seconds. Let's start with a batch: we will populate the JSON file with 10 minutes of data (60 polls, 10 seconds apart).

In [None]:
# Seed data.json with one snapshot, then append one more per round.
# 60 rounds with a 10-second pause each is roughly 10 minutes of data.
initiate_json(url)
polls_left = 60
while polls_left:
    get_stream(url)
    time.sleep(10)
    polls_left -= 1

# Batch

As described before, we have been doing batch processing over the last 4 weeks. Let's run a batch job, which will be very similar to what we did before. We want to extract the BTC price and the total supply.

In [17]:
## Let's start by setting up the imports and then create the Spark Session
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, from_json
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, TimestampType

# Build (or reuse) a session named "BatchJob" on the local master, with the
# Spark UI port set to 42229.
spark = (
    SparkSession.builder
    .appName("BatchJob")
    .master("local[*]")
    .config("spark.ui.port", "42229")
    .getOrCreate()
)

In [9]:
## Let's work now with Schemas, the first step is to create a Schema to use for our data
def btc_schema():
    """Return the schema of the flattened BTC table.

    The schema has three nullable columns: ``Date`` (timestamp),
    ``Price`` (float) and ``Supply`` (integer).
    """
    columns = (
        ("Date", TimestampType()),
        ("Price", FloatType()),
        ("Supply", IntegerType()),
    )
    # Every column is declared nullable.
    return StructType([StructField(name, dtype, True) for name, dtype in columns])


## Load the collected JSON file into a DataFrame and preview it
raw_events = spark.read.json('file:///media/data/data.json')

raw_events.show()

                                                                                

+--------------------+--------------------+
|                data|            metadata|
+--------------------+--------------------+
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 3106, 1653...|
|{{19046825, 1, 16...|{null, 310

As we can observe, the data has a nested structure with two top-level columns, `data` and `metadata`. We need to unnest the JSON to extract the information we want.

In [11]:
# Keep only the nested `data` column so its structure can be inspected
btc_eda = raw_events\
        .select(raw_events.data)

# Inspect the frame built above; `btc_table` is not defined until a later
# cell, so referencing it here fails on a fresh top-to-bottom run.
btc_eda.take(1)

[Row(data=Row(1=Row(circulating_supply=19046825, id=1, last_updated=1653265752, max_supply=21000000, name='Bitcoin', quotes=Row(USD=Row(market_cap=578216982523, percent_change_1h=-0.460638268084277, percent_change_24h=2.73122435604924, percent_change_7d=-3.21842635088867, percentage_change_1h=-0.460638268084277, percentage_change_24h=2.73122435604924, percentage_change_7d=-3.21842635088867, price=30311.0, volume_24h=17443986056)), rank=1, symbol='BTC', total_supply=19046825, website_slug='bitcoin')))]

Using the `take()` method, we can see exactly how the data is structured. We will now extract the information we need.

In [27]:
# Pull the three fields of interest out of the nested `data` -> `1` struct
ticker = raw_events['data']['1']
usd_quote = ticker['quotes']['USD']

btc_table = raw_events.select(
    ticker['last_updated'].cast("timestamp").alias('Date'),
    usd_quote['price'].cast("float").alias('Price'),
    ticker['total_supply'].cast("int").alias('Supply'),
)

btc_table.show(60)

+-------------------+-------+--------+
|               Date|  Price|  Supply|
+-------------------+-------+--------+
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|30311.0|19046825|
|2022-05-23 00:29:12|3031

In [30]:
## Persist the table as a parquet file, using 'overwrite' mode
parquet_writer = btc_table.write.mode('overwrite')
parquet_writer.parquet('file:///media/data/btc.parquet')

## Streaming

So, what changes if we are streaming? The data now arrives continuously instead of all at once! I will use a script that keeps populating a new JSON file without stopping, so that we can observe what changes in a streaming job.

In [None]:
## Get the schema from an existing file so it can be handed to the stream reader
raw_schema = spark.read.json("file:///media/data_stream.json").schema

raw_events = spark \
    .readStream \
    .format('json') \
    .schema(raw_schema) \
    .json('file:///media/data_stream*.json')

## Create the data that we need
btc_table = raw_events\
        .select(raw_events['data']['1']['last_updated'].cast("timestamp").alias('Date'), 
                raw_events['data']['1']['quotes']['USD']['price'].cast("float").alias('Price'), 
                raw_events['data']['1']['total_supply'].cast("int").alias('Supply'))

btc_table.printSchema()

## Writing the data in a stream job
## The memory sink exposes the results as a table named after the query
sink = btc_table \
        .writeStream \
        .queryName("btc_price") \
        .format('memory') \
        .outputMode("append") \
        .start()

## Poll the in-memory table until the query ends or the cell is interrupted
try:
    while True:
        spark.sql("select last(Price) as Current_Price, avg(Price) as Avg_Price from btc_price").show()
        # Wait up to 10 seconds between refreshes. A True result means the
        # query has terminated, so there is nothing left to poll; a query
        # that failed re-raises its error here instead of going unnoticed.
        if sink.awaitTermination(10):
            break
finally:
    # Also runs on KeyboardInterrupt, so interrupting the cell does not leave
    # the streaming query running in the background.
    sink.stop()

root
 |-- Date: timestamp (nullable = true)
 |-- Price: float (nullable = true)
 |-- Supply: integer (nullable = true)



22/05/23 04:31:58 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-0d77a75b-eee0-4bac-bbe5-5373931a1bb7. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/05/23 04:31:58 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


+-------------+---------+
|Current_Price|Avg_Price|
+-------------+---------+
|         null|     null|
+-------------+---------+

+-------------+---------+
|Current_Price|Avg_Price|
+-------------+---------+
|      30187.0|  30177.2|
+-------------+---------+

+-------------+---------+
|Current_Price|Avg_Price|
+-------------+---------+
|      30187.0|  30177.2|
+-------------+---------+

+-------------+---------+
|Current_Price|Avg_Price|
+-------------+---------+
|      30187.0|  30177.2|
+-------------+---------+

+-------------+---------+
|Current_Price|Avg_Price|
+-------------+---------+
|      30187.0|  30177.2|
+-------------+---------+

+-------------+---------+
|Current_Price|Avg_Price|
+-------------+---------+
|      30187.0|  30177.2|
+-------------+---------+

+-------------+---------+
|Current_Price|Avg_Price|
+-------------+---------+
|      30187.0|  30177.2|
+-------------+---------+

+-------------+---------+
|Current_Price|Avg_Price|
+-------------+---------+
|    

KeyboardInterrupt: 