In [0]:
from pyspark.sql.types import StructType, StringType, DoubleType, LongType

# Explicit column layout handed to the CSV reader below.
schema = (
    StructType()
    .add("userId", StringType())
    .add("movieId", StringType())
    .add("rating", DoubleType())
    .add("timestamp", LongType())
)

# Streaming read through Auto Loader ("cloudFiles") over the input volume.
ratings_autoload_df = (
    spark.readStream.format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        "cloudFiles.schemaLocation": "/Volumes/workspace/imdb/stream_schema/",
    })
    .schema(schema)
    .load("/Volumes/workspace/imdb/stream_input/")
)

In [0]:
# Session-level setting: use 4 partitions for shuffles in this notebook.
spark.conf.set("spark.sql.shuffle.partitions", 4)
# Optional extra settings, intentionally left disabled.
#spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
# NOTE(review): the key below looks misspelled — Spark's executor memory setting
# is "spark.executor.memory" — and it may not be changeable from a running
# session; confirm before enabling.
#spark.conf.set("spark.executors.memory", "1g")

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType

# Step 1: Define the schema
# NOTE(review): the preview of the written table shows title-style IDs
# ("tt0000001") under userId, decimal values under movieId and an empty
# timestamp column, so these column names may not match the input files —
# confirm against the source data before relying on them.
schema = StructType() \
    .add("userId", StringType()) \
    .add("movieId", StringType()) \
    .add("rating", IntegerType()) \
    .add("timestamp", StringType())

# Step 2: Read using Auto Loader
ratings_autoload_df = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .schema(schema)
    .load("/Volumes/workspace/imdb/stream_input/")  # Your volume path
)

# Step 3: Write to Delta with explicit checkpoint
# Keep the query handle so the run can be awaited below.
ratings_v3_query = (
    ratings_autoload_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/Volumes/workspace/imdb/ratings_v3_checkpoints/")
    .trigger(availableNow=True)
    .start("/Volumes/workspace/imdb/ratings_v3/")  # Output path
)

# start() returns as soon as the query is launched. Block until the
# availableNow run has finished so that later cells (e.g. the SQL preview)
# read the table only after the write has completed.
ratings_v3_query.awaitTermination()

<pyspark.sql.connect.streaming.query.StreamingQuery at 0xfff7413f1750>

In [0]:
%sql
-- Preview ten rows of the Delta table stored at the ratings_v3 path.
SELECT *
FROM delta.`/Volumes/workspace/imdb/ratings_v3/`
LIMIT 10;

userId,movieId,rating,timestamp
tt0000001,5.7,2163,
tt0000002,5.5,296,
tt0000003,6.5,2217,
tt0000004,5.3,189,
tt0000005,6.2,2955,
tt0000006,5.0,213,
tt0000007,5.3,913,
tt0000008,5.4,2306,
tt0000009,5.4,226,
tt0000010,6.8,7993,


In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType

# Volume locations used by this pipeline.
input_path = "/Volumes/workspace/imdb/stream_input/"
checkpoint_path = "/Volumes/workspace/imdb/ratings_v2"
output_path = "/Volumes/workspace/imdb/stream_output/ratings_delta/"

# Hand-written schema; type inference is switched off on the reader below.
schema = (
    StructType()
    .add("userId", StringType())
    .add("movieId", StringType())
    .add("rating", IntegerType())
    .add("timestamp", StringType())
)

# Streaming read through Auto Loader ("cloudFiles") over the input volume.
ratings_autoload_df = (
    spark.readStream.format("cloudFiles")
    .options(**{
        "cloudFiles.format": "csv",
        "cloudFiles.inferColumnTypes": "false",
    })
    .schema(schema)
    .load(input_path)
)



In [0]:
output_path = "/Volumes/workspace/imdb/ratings_v2"  # Delta table location

# Keep streaming checkpoint state OUT of the Delta table's root directory.
# The original cell pointed checkpointLocation at the table path itself, which
# mixes checkpoint folders with table files and leaves them exposed to table
# maintenance such as VACUUM. A sibling directory avoids both problems.
# NOTE(review): if this stream already ran against the old checkpoint, a fresh
# checkpoint will presumably re-ingest the input files and append them again —
# clear or recreate the output table before re-running.
ratings_v2_checkpoint_path = "/Volumes/workspace/imdb/ratings_v2_checkpoints/"

ratings_v2_query = (
    ratings_autoload_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", ratings_v2_checkpoint_path)
    .trigger(availableNow=True)
    .start(output_path)
)

# Wait for the availableNow run to finish; start() alone returns immediately,
# so without this a following cell could query a partially written table.
ratings_v2_query.awaitTermination()


<pyspark.sql.connect.streaming.query.StreamingQuery at 0xfff7622bee50>

In [0]:
%sql
-- Preview ten rows of the Delta table stored at the ratings_v2 path.
SELECT *
FROM delta.`/Volumes/workspace/imdb/ratings_v2`
LIMIT 10;

userId,movieId,rating,timestamp
tt0000001,5.7,2163,
tt0000002,5.5,296,
tt0000003,6.5,2217,
tt0000004,5.3,189,
tt0000005,6.2,2955,
tt0000006,5.0,213,
tt0000007,5.3,913,
tt0000008,5.4,2306,
tt0000009,5.4,226,
tt0000010,6.8,7993,
