In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, StringType, StructField

In [0]:
# Define the schema of the incoming data
schema = StructType([
    StructField("EMPLOYEE_ID", IntegerType(), True),
    StructField("FIRST_NAME", StringType(), True),
    StructField("SALARY", IntegerType(), True)
])

In [0]:
# Directory where files will be read from
inputPath = "dbfs:/mnt/streaming_data"


In [0]:
# Read stream using Autoloader
df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", "dbfs:/mnt/streaming_data/schema")
    .schema(schema)
    .load(inputPath)
)

In [0]:
# Basic transformation
transformed_df = df.withColumn("SALARY_IN_K", col("SALARY") / 1000)


In [0]:
# Write stream to console (for debugging)
query = (
    transformed_df.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .start()
)
query.awaitTermination()

In [0]:
query.stop()  # Stop the existing stream
# Restart the streaming query
query = transformed_df.writeStream.format("console") \
    .outputMode("append") \
    .option("truncate", "false")\
    .start()

In [0]:
import time

# Function to create a CSV file with streaming data
def create_streaming_file(file_num, rows=5000):
    csv_data = "JOB_ID,FIRST_NAME,SALARY\n"  # CSV header
    for i in range(rows):
        csv_data += f"{file_num * 1000 + i},Employee_{i},{50000 + (i % 10000)}\n"
    
    file_path = f"/mnt/streaming_data/streaming_data_{int(time.time())}.csv"
    dbutils.fs.put(file_path, csv_data, True)
    print(f"📂 File {file_path} with {rows} rows added!")

# Generate multiple files to create a data spike
for i in range(3):  # Creating 3 files
    create_streaming_file(i, rows=5000)
    time.sleep(2)  # Adding a small delay between file creations

print("🚀 Streaming files have been created in /mnt/streaming_data_Auto!")


Wrote 122805 bytes.
📂 File /mnt/streaming_data/streaming_data_1742472925.csv with 5000 rows added!
Wrote 123915 bytes.
📂 File /mnt/streaming_data/streaming_data_1742472927.csv with 5000 rows added!
Wrote 123915 bytes.
📂 File /mnt/streaming_data/streaming_data_1742472929.csv with 5000 rows added!
🚀 Streaming files have been created in /mnt/streaming_data_Auto!
