In [1]:
from multiprocessing import Process
from pyspark.sql import SparkSession
from bitcoin_utils import (
    fetch_bitcoin_price,
    launch_writer,
    launch_spark_stream
    )
import time

In [2]:
# --- CONFIGURATION ---
# Everything a reader might want to tune lives in this cell.

# Shared location: handed to both the writer process and the Spark launcher.
DATA_DIR = "data_demo"

# Writer settings
INTERVAL = 15                # seconds between two consecutive price fetches
NUM_POINTS = 20              # how many records the writer will try to produce
WRITER_DURATION = 420        # upper bound (seconds) the writer is allowed to run

# Spark windowed-aggregation settings
WINDOW_SIZES = ["1 minute"]      # one moving-average window per entry
SLIDE_INTERVAL = "15 seconds"    # how far each window slides forward
WATERMARK = "30 seconds"         # tolerance for late-arriving records

# Orchestration timing
DELAY_BEFORE_SPARK = 60      # seconds to wait before Spark is started
SPARK_BUFFER = 60            # extra seconds granted to Spark to finish writing

In [3]:
def _stop_spark(queries, spark):
    """Best-effort shutdown of the streaming queries, then the Spark session.

    Args:
        queries: Iterable of streaming query handles (may be empty or None).
        spark: The Spark session, or None if it was never created.

    Errors raised while stopping are reported as warnings and never
    propagated, so one failing query cannot prevent the others (or the
    session itself) from being stopped.
    """
    print("[MAIN] Stopping all Spark streams...")
    for q in queries or []:
        try:
            # Give the query a short grace period before forcing a stop,
            # then wait until it has really terminated.
            q.awaitTermination(timeout=5)
            q.stop()
            q.awaitTermination()
        except Exception as e:
            print(f"[WARN] Error while stopping stream: {e}")

    if spark:
        try:
            spark.stop()
            print("[MAIN] Spark session stopped.")
        except Exception as e:
            print(f"[WARN] Error while stopping Spark session: {e}")


def manage_pipeline():
    """
    Manages the entire end-to-end pipeline:
    - Starts the writer process to fetch and save Bitcoin prices
    - Launches Spark streaming queries for the configured window sizes
    - Lets the pipeline run for WRITER_DURATION seconds, then terminates
      the writer and waits SPARK_BUFFER seconds for Spark to finalize batches
    - Stops all Spark streams and terminates the session

    Cleanup is guaranteed: the writer process is terminated even if Spark
    fails to start or the user interrupts the run, and the Spark queries and
    session are stopped even if the buffer wait is interrupted.

    Raises:
        Any non-KeyboardInterrupt exception raised by `launch_spark_stream`
        is re-raised after the writer process has been terminated.
    """
    print("[MAIN] Launching writer process...")
    writer_process = Process(target=launch_writer, args=(DATA_DIR, INTERVAL, NUM_POINTS))
    writer_process.start()
    print(f"[INFO] Writer PID: {writer_process.pid}")

    # Defaults so that cleanup works even if Spark never came up.
    queries, spark = [], None
    try:
        # Launch Spark streamers (inside the try so a failure or Ctrl-C here
        # cannot leave the writer process orphaned).
        queries, spark = launch_spark_stream(
            DELAY_BEFORE_SPARK, DATA_DIR, WINDOW_SIZES, WATERMARK, SLIDE_INTERVAL
        )
        print(f"[MAIN] Streaming for {WRITER_DURATION} seconds...")
        time.sleep(WRITER_DURATION)
    except KeyboardInterrupt:
        print("[MAIN] Interrupted! Cleaning up early...")
    finally:
        # Always reap the child process, whatever happened above.
        print("[MAIN] Terminating writer process...")
        writer_process.terminate()
        writer_process.join()

    try:
        # Only worth waiting if there is at least one query that may still
        # have batches to flush.
        if queries:
            print(f"[MAIN] Waiting {SPARK_BUFFER}s for Spark to finalize batches...")
            time.sleep(SPARK_BUFFER)
    except KeyboardInterrupt:
        print("[MAIN] Interrupted again! Skipping the remaining buffer wait...")
    finally:
        _stop_spark(queries, spark)

    print("[MAIN] Pipeline complete.")

In [4]:
# Fetch a single Bitcoin price with timestamp.
# Runs before the long pipeline cell; per the captured output below, the call
# prints one price line and evaluates to a dict with 'timestamp' and 'price'.
# NOTE(review): presumably this hits a live price source — confirm in
# bitcoin_utils; the value shown will differ on every run.
fetch_bitcoin_price(verbose=True)

[2025-05-18T16:41:39.604777] Price: $105389.00


{'timestamp': '2025-05-18T16:41:39.604777', 'price': 105389}

In [5]:
# --- MAIN ENTRY POINT ---
# Runs the whole writer + Spark pipeline. This cell blocks for a long time:
# when not interrupted it sleeps WRITER_DURATION and then SPARK_BUFFER seconds,
# on top of whatever start-up delay launch_spark_stream applies.
manage_pipeline()

[MAIN] Launching writer process...
[Writer] Starting data collection...
[Writer] Record 1 saved at 2025-05-18T16:41:39.676892 → $105389
[INFO] Writer PID: 52
[Spark] Waiting 60s to let writer populate files...
[Writer] Record 2 saved at 2025-05-18T16:41:54.754165 → $105389
[Writer] Record 3 saved at 2025-05-18T16:42:09.854908 → $105393
[Writer] Record 4 saved at 2025-05-18T16:42:24.940315 → $105393
[Spark] Starting Spark session...
[Writer] Record 5 saved at 2025-05-18T16:42:40.004295 → $105393


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/18 16:42:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[Spark] Starting stream for WINDOW_SIZE=1 minute → Output: moving_avg_output_1_minute


25/05/18 16:42:41 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


[Spark] All streaming queries started (1 total).
[MAIN] Streaming for 420 seconds...


                                                                                

[Writer] Record 6 saved at 2025-05-18T16:42:55.056188 → $105393


                                                                                

[Writer] Record 7 saved at 2025-05-18T16:43:10.130004 → $105399


                                                                                

[Writer] Record 8 saved at 2025-05-18T16:43:25.209651 → $105399


                                                                                

[Writer] Record 9 saved at 2025-05-18T16:43:40.273174 → $105399


                                                                                

[Writer] Record 10 saved at 2025-05-18T16:43:55.360114 → $105399


                                                                                

[Writer] Record 11 saved at 2025-05-18T16:44:10.439499 → $105403


                                                                                

[Writer] Record 12 saved at 2025-05-18T16:44:25.508935 → $105399


                                                                                

[Writer] Record 13 saved at 2025-05-18T16:44:40.579071 → $105403


                                                                                

[Writer] Record 14 saved at 2025-05-18T16:44:55.644151 → $105403


                                                                                

[Writer] Record 15 saved at 2025-05-18T16:45:10.709517 → $105408


                                                                                

[Writer] Record 16 saved at 2025-05-18T16:45:25.793258 → $105408


                                                                                

[Writer] Record 17 saved at 2025-05-18T16:45:40.888245 → $105408


                                                                                

[Writer] Record 18 saved at 2025-05-18T16:45:55.958366 → $105408


                                                                                

[Writer] Record 19 saved at 2025-05-18T16:46:11.029481 → $105411


                                                                                

[Writer] Record 20 saved at 2025-05-18T16:46:26.126480 → $105411


                                                                                

[MAIN] Terminating writer process...
[MAIN] Waiting 60s for Spark to finalize batches...
[MAIN] Stopping all Spark streams...
[MAIN] Spark session stopped.
[MAIN] Pipeline complete.


25/05/18 16:51:43 WARN StateStore: Error running maintenance thread
java.lang.IllegalStateException: SparkEnv not active, cannot do maintenance on StateStores
	at org.apache.spark.sql.execution.streaming.state.StateStore$.doMaintenance(StateStore.scala:632)
	at org.apache.spark.sql.execution.streaming.state.StateStore$.$anonfun$startMaintenanceIfNeeded$1(StateStore.scala:610)
	at org.apache.spark.sql.execution.streaming.state.StateStore$MaintenanceTask$$anon$1.run(StateStore.scala:453)
	at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539)
	at java.base/java.util.concurrent.FutureTask.runAndReset(FutureTask.java:305)
	at java.base/java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:305)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.