# **Simulating Streaming Data with a Thread Script**

In [None]:
!apt-get install openjdk-11-jdk -y
!wget -q https://downloads.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz

!pip install -q findspark pyspark==3.5.1

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Create Spark session
spark = SparkSession.builder.appName("Real Time Stock Data Pipeline").getOrCreate()

# Define stock data schema
schema = StructType([
    StructField("timestamp", TimestampType(), True),
    StructField("stock_symbol", StringType(), True),
    StructField("open", DoubleType(), True),
    StructField("high", DoubleType(), True),
    StructField("low", DoubleType(), True),
    StructField("close", DoubleType(), True),
    StructField("volume", IntegerType(), True)
])

In [3]:
import threading, time, random, csv, os
from datetime import datetime

# This global flag will control the loop
keep_running = True

def generate_stream():
    global keep_running
    stocks = ["AAPL", "MSFT", "GOOG", "AMZN"]
    os.makedirs("/content/streaming_data", exist_ok=True)
    i = 0
    while keep_running:  # the loop runs while this is True
        filename = f"/content/streaming_data/data_{i}.csv"
        with open(filename, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["timestamp", "stock_symbol", "open", "high", "low", "close", "volume"])
            now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for s in stocks:
                base = random.uniform(100, 300)
                writer.writerow([
                    now, s,
                    round(base, 2),
                    round(base + random.uniform(0, 5), 2),
                    round(base - random.uniform(0, 5), 2),
                    round(base + random.uniform(-2, 2), 2),
                    random.randint(1000, 5000)
                ])
        i += 1
        #print(f"Wrote file {filename}")
        time.sleep(3)

print("Generator function ready.")

Generator function ready.


## **Start the Thread To Generate Streaming Stock Data**

This begins generating data files in the background every 3 seconds.

In [4]:
keep_running = True
thread = threading.Thread(target=generate_stream, daemon=True)
thread.start()
print("Streaming started...")

Streaming started...


## **Stop/Restart the Thread**

if we want to stop it the thread:

In [11]:
keep_running = False
print("Stopping stream...")

Stopping stream...


if we want to start again, just re-run:

In [21]:
keep_running = True
thread = threading.Thread(target=generate_stream, daemon=True)
thread.start()
print("Streaming restarted...")

Streaming restarted...


## **Reading Streaming Data**

In [6]:
df_stream = spark.readStream.format("csv") \
    .option("header", True) \
    .schema(schema) \
    .load("/content/streaming_data")

# Define Watermark to handle late data (delay tolerance)
df_wk = df_stream.withWatermark("timestamp", "10 minutes")

In [7]:
print("isStreaming:", df_stream.isStreaming)

isStreaming: True


In [8]:
df_stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- stock_symbol: string (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)



## **Processing data in micro-batches**

In [None]:
from pyspark.sql.functions import col, to_timestamp, abs, when

# Drop rows with null essential values
df_clean = df_wk.na.drop(subset=["open", "high", "low", "close", "volume"])

# Ensure timestamp is proper
df_clean = df_clean.withColumn("timestamp", to_timestamp(col("timestamp")))

# Compute price change
df_clean = df_clean.withColumn("price_change", col("close") - col("open"))

# Detect anomalies (flag them)
df_clean = df_clean.withColumn("anomaly_flag", when(abs(col("price_change")) > 5, 1).otherwise(0))

# Static reference data (company names)
company_data = [
    ("AAPL", "Apple"),
    ("GOOG", "Google"),
    ("TSLA", "Tesla"),
    ("AMZN", "Amazon"),
    ("MSFT", "Microsoft")
]
company_ref = spark.createDataFrame(company_data, ["stock_symbol", "company_name"])

# Join streaming data with static reference
df_joined = df_clean.join(company_ref, on="stock_symbol", how="left")

# Write joined results to CSV files in micro-batches
query = df_joined.writeStream \
    .outputMode("append") \
    .format("csv") \
    .option("header", True) \
    .option("checkpointLocation", "/content/checkpoints/joined_stream/") \
    .option("path", "/content/processed_output/") \
    .trigger(processingTime="5 seconds") \
    .start()

query.awaitTermination()

In [10]:
stream_path = '/content/processed_output/part-00177-790aadb8-1241-4244-9b2c-9669a107cae2-c000.csv'
df_streaming_data = spark.read.csv(stream_path, header=True, inferSchema=True)
df_streaming_data.show()

+------------+-------------------+------+------+------+------+------+--------------------+------------+------------+
|stock_symbol|          timestamp|  open|  high|   low| close|volume|        price_change|anomaly_flag|company_name|
+------------+-------------------+------+------+------+------+------+--------------------+------------+------------+
|        MSFT|2025-11-09 11:29:38|258.27|258.98|255.24| 258.3|  3041| 0.03000000000002956|           0|   Microsoft|
|        MSFT|2025-11-09 11:29:41|265.33|269.95|264.41|264.79|  1084| -0.5399999999999636|           0|   Microsoft|
|        MSFT|2025-11-09 11:29:47|246.72|248.46|242.74|245.71|  2715|  -1.009999999999991|           0|   Microsoft|
|        MSFT|2025-11-09 11:29:35|169.39|171.99| 168.4|169.19|  2315|-0.19999999999998863|           0|   Microsoft|
|        MSFT|2025-11-09 11:29:44|102.76|102.98| 98.77|103.94|  3462|  1.1799999999999926|           0|   Microsoft|
+------------+-------------------+------+------+------+------+--