In [0]:
# File location and type
file_location = "/FileStore/tables/student-3.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

_c0,_c1,_c2
ID,Name,Age
1,Rohit,24
2,Virat,23
3,Shreyas,34
4,Axar,20
5,Rahul,30


In [0]:
# Define file location (Ensure this is an absolute path)
file_location = "/FileStore/tables/student-3.csv"  # Change this to the correct path
# Define CSV options
file_type = "csv"
infer_schema = "true"  # Automatically infer schema
first_row_is_header = "true"  # First row contains column names
delimiter = ","  # Column separator
# Read CSV into DataFrame
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)
# Show DataFrame content
df.show()

+---+-------+---+
| ID|   Name|Age|
+---+-------+---+
|  1|  Rohit| 24|
|  2|  Virat| 23|
|  3|Shreyas| 34|
|  4|   Axar| 20|
|  5|  Rahul| 30|
+---+-------+---+



In [0]:
%sh
pip install 

In [0]:
display(dbutils.fs.ls("/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/FileStore/tables/employee.csv,employee.csv,61,1742367610000
dbfs:/FileStore/tables/student-1.csv,student-1.csv,74,1742365972000
dbfs:/FileStore/tables/student-2.csv,student-2.csv,74,1742366386000
dbfs:/FileStore/tables/student-3.csv,student-3.csv,74,1742373717000
dbfs:/FileStore/tables/student.csv,student.csv,74,1742365861000
dbfs:/FileStore/tables/student_csv.xlsx,student_csv.xlsx,8518,1742365410000


In [0]:
# Create a view or table
temp_table_name = "Restaurant_customer_data_csv"
df.createOrReplaceTempView(temp_table_name)

In [0]:
import time
# Function to create a CSV file with streaming data
def create_streaming_file(file_num, rows=10000):
    csv_data = "EMPLOYEE_ID,FIRST_NAME,SALARY\n"  # CSV header
    for i in range(rows):
        csv_data += f"{file_num * 1000 + i},Employee_{i},{50000 + (i % 10000)}\n"
    file_path = f"/FileStore/tables/streaming_data_{int(time.time())}.csv"
    dbutils.fs.put(file_path, csv_data, True)
    print(f":open_file_folder: File {file_path} with {rows} rows added!")
# Generate multiple files to create a data spike
for i in range(3):  # 3 large files
    create_streaming_file(i, rows=5000)  # Each file has 5000 rows
    time.sleep(2)  # Small delay between file creations
print(":rocket: Streaming spike triggered! Check console for new data.")

Wrote 122810 bytes.
:open_file_folder: File /FileStore/tables/streaming_data_1742376128.csv with 5000 rows added!
Wrote 123920 bytes.
:open_file_folder: File /FileStore/tables/streaming_data_1742376130.csv with 5000 rows added!
Wrote 123920 bytes.
:open_file_folder: File /FileStore/tables/streaming_data_1742376132.csv with 5000 rows added!
:rocket: Streaming spike triggered! Check console for new data.


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
spark = SparkSession.builder.appName("StreamingDemo").getOrCreate()
# Define the schema explicitly
schema = StructType([
    StructField("EMPLOYEE_ID", StringType(), True),
    StructField("FIRST_NAME", StringType(), True),
    StructField("SALARY", IntegerType(), True)
])
# Read streaming data from a folder
df = spark.readStream.format("csv") \
    .option("header", True) \
    .schema(schema)  \
    .load("/FileStore/tables/")
# Apply a transformation
df_transformed = df.select("EMPLOYEE_ID", "FIRST_NAME", "SALARY")
# Write the output to console with a trigger interval of 5 seconds
query = df_transformed.writeStream.format("console") \
    .outputMode("append") \
    .trigger(processingTime="5 seconds") \
    .start()
query.awaitTermination()

In [0]:
query.stop()  # Stop the existing stream
# Restart the streaming query
query = df_transformed.writeStream.format("console") \
    .outputMode("append") \
    .trigger(processingTime="5 seconds") \
    .start()

In [0]:

display(dbutils.fs.ls("/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/FileStore/tables/employee.csv,employee.csv,61,1742367610000
dbfs:/FileStore/tables/streaming_data.csv,streaming_data.csv,64,1742376194000
dbfs:/FileStore/tables/streaming_data_1742376001.csv,streaming_data_1742376001.csv,122810,1742376002000
dbfs:/FileStore/tables/streaming_data_1742376003.csv,streaming_data_1742376003.csv,123920,1742376004000
dbfs:/FileStore/tables/streaming_data_1742376005.csv,streaming_data_1742376005.csv,123920,1742376006000
dbfs:/FileStore/tables/streaming_data_1742376128.csv,streaming_data_1742376128.csv,122810,1742376129000
dbfs:/FileStore/tables/streaming_data_1742376130.csv,streaming_data_1742376130.csv,123920,1742376131000
dbfs:/FileStore/tables/streaming_data_1742376132.csv,streaming_data_1742376132.csv,123920,1742376133000
dbfs:/FileStore/tables/student-1.csv,student-1.csv,74,1742365972000
dbfs:/FileStore/tables/student-2.csv,student-2.csv,74,1742366386000


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("StreamingDemo").getOrCreate()

# Define the schema explicitly (added a timestamp column for watermarking)
schema = StructType([
    StructField("EMPLOYEE_ID", StringType(), True),
    StructField("FIRST_NAME", StringType(), True),
    StructField("SALARY", IntegerType(), True),
    StructField("EVENT_TIMESTAMP", TimestampType(), True)  # Add a timestamp column
])

# Read streaming data from a folder
df = spark.readStream.format("csv") \
    .option("header", True) \
    .schema(schema) \
    .load("/FileStore/tables/")

# Apply watermarking to manage late data (assuming the 'EVENT_TIMESTAMP' is the event time)
df_with_watermark = df.withWatermark("EVENT_TIMESTAMP", "1 hour")  # Allow late data up to 1 hour

# Apply a transformation (you can adjust this as needed)
df_transformed = df_with_watermark.select("EMPLOYEE_ID", "FIRST_NAME", "SALARY", "EVENT_TIMESTAMP")

# Write the output to console with a trigger interval of 5 seconds
query = df_transformed.writeStream.format("console") \
    .outputMode("append") \
    .trigger(processingTime="5 seconds") \
    .start()

query.awaitTermination()
