In [1]:
# 📘 SECTION 1: Introduction
# Renders the notebook title and outline as rich Markdown output.

from IPython.display import Markdown

# Kept as a named string so the rendered text is easy to find and edit.
INTRO_MD = """
# Real-Time Streaming + Delta Lake in PySpark (IMDB Dataset)

This notebook demonstrates:
- Simulating streaming ingestion with IMDB rating data
- Writing to **Delta Lake**
- Using **Delta features** like time travel and vacuum
- Exploring **streaming performance optimization**
- Using **Spark Structured Streaming APIs**
"""

# Last expression of the cell -> displayed via the Markdown rich repr.
Markdown(INTRO_MD)


# Real-Time Streaming + Delta Lake in PySpark (IMDB Dataset)

This notebook demonstrates:
- Simulating streaming ingestion with IMDB rating data
- Writing to **Delta Lake**
- Using **Delta features** like time travel and vacuum
- Exploring **streaming performance optimization**
- Using **Spark Structured Streaming APIs**


In [12]:
import os
import sys

from pyspark.sql import SparkSession

# Resolve the project root relative to the current working directory
# (two levels up) and make it importable.
# NOTE(review): this presumes the kernel's cwd is the notebook's own folder — confirm.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.insert(0, PROJECT_ROOT)

# This import must come AFTER the sys.path tweak above: on a fresh kernel the
# `src` package is only resolvable once PROJECT_ROOT is on sys.path, so importing
# it first fails with ModuleNotFoundError under "Restart Kernel -> Run All".
from src.config.spark_config import get_spark_session  # noqa: E402

# Start Spark session with proper configs
spark = get_spark_session(app_name="PySpark_Optimization_Demo")

25/06/20 02:37:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [13]:
# Both input tables live in the same parquet folder under the project root.
parquet_dir = os.path.join(PROJECT_ROOT, "data", "parquet_data")
name_path = os.path.join(parquet_dir, "name_basics.parquet")
ratings_path = os.path.join(parquet_dir, "title_ratings.parquet")

# Read each parquet dataset into its own DataFrame.
df_name, df_ratings = (spark.read.parquet(path) for path in (name_path, ratings_path))

In [15]:
# Preview the first five rows with full (untruncated) column values.
df_name.show(n=5, truncate=False)

+---------+---------------+---------+---------+-----------------+-------------------+------------------+----------------+----------------+----------------+----------------+---------------+---------------+---------------+-------------------+
|nconst   |primaryName    |birthYear|deathYear|primaryProfession|secondaryProfession|tertiaryProfession|knownForTitles_1|knownForTitles_2|knownForTitles_3|knownForTitles_4|full_name      |name_lower     |name_length    |ingested_at        |
+---------+---------------+---------+---------+-----------------+-------------------+------------------+----------------+----------------+----------------+----------------+---------------+---------------+---------------+-------------------+
|nm0000001|Fred Astaire   |1899     |1987     |actor            |miscellaneous      |producer          |tt0072308       |tt0050419       |tt0027125       |tt0031983       |Fred Astaire   |fred astaire   |Fred Astaire   |2025-06-20 02:07:51|
|nm0000002|Lauren Bacall  |1924     

In [16]:
from pyspark.sql.functions import col, when

# Normalize birthYear in one step: the literal string '\N' becomes NULL
# (in case it was not already handled at read time), then the column is
# cast to an integer.
df_name = df_name.withColumn(
    "birthYear",
    when(col("birthYear") == "\\N", None).otherwise(col("birthYear")).cast("int"),
)

# Keep people born in 1960 or later who have a primary profession recorded.
df_filtered = df_name.where(
    (col("birthYear") >= 1960) & col("primaryProfession").isNotNull()
)
df_filtered.select("primaryName", "birthYear").show(n=5)

+----------------+---------+
|     primaryName|birthYear|
+----------------+---------+
|         Gong Li|     1965|
|       Brad Pitt|     1963|
|Gillian Anderson|     1968|
| Pamela Anderson|     1967|
|Jennifer Aniston|     1969|
+----------------+---------+
only showing top 5 rows


In [17]:
# Inner-join the filtered people to ratings on their first known-for title,
# then print the formatted physical plan for a two-column projection.
df_join = df_filtered.join(
    df_ratings,
    on=df_filtered["knownForTitles_1"] == df_ratings["tconst"],
    how="inner",
)
df_join.select("primaryName", "averageRating").explain(mode="formatted")


== Physical Plan ==
AdaptiveSparkPlan (9)
+- Project (8)
   +- BroadcastHashJoin Inner BuildRight (7)
      :- Project (3)
      :  +- Filter (2)
      :     +- Scan parquet  (1)
      +- BroadcastExchange (6)
         +- Filter (5)
            +- Scan parquet  (4)


(1) Scan parquet 
Output [4]: [primaryName#19, birthYear#20, primaryProfession#22, knownForTitles_1#25]
Batched: true
Location: InMemoryFileIndex [file:/Users/aryan/Desktop/project/data/parquet_data/name_basics.parquet]
PushedFilters: [IsNotNull(primaryProfession), IsNotNull(knownForTitles_1)]
ReadSchema: struct<primaryName:string,birthYear:string,primaryProfession:string,knownForTitles_1:string>

(2) Filter
Input [4]: [primaryName#19, birthYear#20, primaryProfession#22, knownForTitles_1#25]
Condition : ((CASE WHEN (birthYear#20 = \N) THEN false ELSE (cast(birthYear#20 as int) >= 1960) END AND isnotnull(primaryProfession#22)) AND isnotnull(knownForTitles_1#25))

(3) Project
Output [2]: [primaryName#19, knownForTitles_1#25]

In [26]:
# Disabled experiment (nothing in this cell executes): compare the partition
# counts of df_join after repartition(8) versus coalesce(2). Uncomment to run.
# df_repart = df_join.repartition(8)
# df_coalesce = df_join.coalesce(2)

# print("🔁 Repartition Partitions:", df_repart.rdd.getNumPartitions())
# print("🧩 Coalesce Partitions:", df_coalesce.rdd.getNumPartitions())

In [25]:
# Disabled experiment (nothing in this cell executes): mark df_join as cached
# and run count() as the action that materializes the cache. Uncomment to run.
# df_cached = df_join.cache()
# df_cached.count()  # triggers caching

In [23]:
# Print df_name's column names, types and nullability as a tree.
df_name.printSchema()

root
 |-- nconst: string (nullable = true)
 |-- primaryName: string (nullable = true)
 |-- birthYear: integer (nullable = true)
 |-- deathYear: string (nullable = true)
 |-- primaryProfession: string (nullable = true)
 |-- secondaryProfession: string (nullable = true)
 |-- tertiaryProfession: string (nullable = true)
 |-- knownForTitles_1: string (nullable = true)
 |-- knownForTitles_2: string (nullable = true)
 |-- knownForTitles_3: string (nullable = true)
 |-- knownForTitles_4: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- name_lower: string (nullable = true)
 |-- name_length: string (nullable = true)
 |-- ingested_at: string (nullable = true)



In [24]:
from pyspark.sql.functions import broadcast

# Optional pre-filter: drop people with no first known-for title so the
# broadcast side carries no NULL join keys.
df_name_filtered = df_name.where(df_name["knownForTitles_1"].isNotNull())

# Left join from ratings with an explicit broadcast hint on the projected
# names side (only the two columns needed downstream are shipped).
# NOTE(review): this was written assuming df_name is the smaller table, but the
# formatted plan printed earlier in this notebook put the ratings scan under
# BroadcastExchange — confirm the relative table sizes before relying on the hint.
df_broadcast = df_ratings.join(
    broadcast(df_name_filtered.select("primaryName", "knownForTitles_1")),
    on=df_ratings["tconst"] == df_name_filtered["knownForTitles_1"],
    how="left",
)

df_broadcast.select("tconst", "primaryName", "averageRating", "numVotes").show(n=5)

                                                                                

+---------+-------------+-------------+--------+
|   tconst|  primaryName|averageRating|numVotes|
+---------+-------------+-------------+--------+
|tt0000001|   Carmencita|          5.7|    2163|
|tt0000002|         NULL|          5.5|     296|
|tt0000003|Émile Reynaud|          6.5|    2217|
|tt0000004|         NULL|          5.3|     189|
|tt0000005|         NULL|          6.2|    2955|
+---------+-------------+-------------+--------+
only showing top 5 rows
