In [1]:

import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), "../../")))

from config import settings
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim
import os
from src.config.spark_config import get_spark_session


from pyspark.sql import SparkSession
# Optional: custom tuning for this session
custom_tuning = {
    "spark.executor.memory": "6g",
    "spark.driver.memory": "4g",
    "spark.executor.cores": "4",
    "spark.sql.shuffle.partitions": "8",
    "spark.default.parallelism": "8",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.autoBroadcastJoinThreshold": "-1"
}

# Step 1: Start Spark session
from src.config.spark_config import get_spark_session
spark = get_spark_session(app_name="Read_Parquet_Tuning", custom_config=custom_tuning)
# Step 2: Define Parquet folder path
parquet_base = os.path.join(settings.BASE_DIR, "data", "parquet_data")

# Step 3: Map your Parquet files
parquet_files = {
    "title_basics": os.path.join(parquet_base, "title_basics.parquet"),
    "title_crew": os.path.join(parquet_base, "title_crew.parquet"),
    "title_episode": os.path.join(parquet_base, "title_episode.parquet"),
    "title_akas": os.path.join(parquet_base, "title_akas.parquet")
}


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/23 09:36:44 WARN Utils: Your hostname, Nagendras-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.17 instead (on interface en0)
25/06/23 09:36:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/06/23 09:36:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
  import pkg_resources
25/06/23 09:36:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 65135)
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/Users/aryan/Desktop/project/venv/lib/python3

In [2]:
from pyspark.sql.functions import col

# Define which column to select for each file
columns_to_display = {
    "title_basics": "primaryTitle",
    "title_crew": "directors",
    "title_episode": "seasonNumber",
    "title_akas": "title"
}

# Load and display the specified column for each file
for name, path in parquet_files.items():
    print(f"📄 Reading file: {name}")
    column = columns_to_display[name]

    df = spark.read.parquet(path)
    
    if column in df.columns:
        df.select(col(column)).show(5)
    else:
        print(f"❌ Column '{column}' not found in {name}")

📄 Reading file: title_basics


+--------------------+
|        primaryTitle|
+--------------------+
|          Carmencita|
|Le clown et ses c...|
|        Poor Pierrot|
|         Un bon bock|
|    Blacksmith Scene|
+--------------------+
only showing top 5 rows
📄 Reading file: title_crew
+---------+
|directors|
+---------+
|nm5019406|
|nm6236402|
|nm6568829|
|nm0732218|
|nm2005799|
+---------+
only showing top 5 rows
📄 Reading file: title_episode
+------------+
|seasonNumber|
+------------+
|          20|
|          20|
|           6|
|           7|
|           1|
+------------+
only showing top 5 rows
📄 Reading file: title_akas
+--------------------+
|               title|
+--------------------+
|          Carmencita|
|          Carmencita|
|          Carmencita|
|Carmencita - span...|
|          Καρμενσίτα|
+--------------------+
only showing top 5 rows


In [5]:
from pyspark.sql.functions import col
#filter
path = parquet_files["title_basics"]
df = spark.read.parquet(path)

# ✅ Basic Query - Pushdown Filter Example
basic_filtered = df.filter(col("titleType") == "movie")
print("🔍 BASIC FILTER - Only movies")
basic_filtered.explain(mode="formatted")


🔍 BASIC FILTER - Only movies
== Physical Plan ==
* Filter (3)
+- * ColumnarToRow (2)
   +- Scan parquet  (1)


(1) Scan parquet 
Output [11]: [tconst#141, titleType#142, primaryTitle#143, originalTitle#144, isAdult#145, startYear#146, endYear#147, runtimeMinutes#148, genre_1#149, genre_2#150, ingested_at#151]
Batched: true
Location: InMemoryFileIndex [file:/Users/aryan/Desktop/project/data/parquet_data/title_basics.parquet]
PushedFilters: [IsNotNull(titleType), EqualTo(titleType,movie)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genre_1:string,genre_2:string,ingested_at:string>

(2) ColumnarToRow [codegen id : 1]
Input [11]: [tconst#141, titleType#142, primaryTitle#143, originalTitle#144, isAdult#145, startYear#146, endYear#147, runtimeMinutes#148, genre_1#149, genre_2#150, ingested_at#151]

(3) Filter [codegen id : 1]
Input [11]: [tconst#141, titleType#142, primaryTitle

In [6]:
basic_filtered.show(20, truncate=False)

+---------+---------+-----------------------------------------------+-----------------------------------------------+-------+---------+-------+--------------+-----------+---------+-------------------+
|tconst   |titleType|primaryTitle                                   |originalTitle                                  |isAdult|startYear|endYear|runtimeMinutes|genre_1    |genre_2  |ingested_at        |
+---------+---------+-----------------------------------------------+-----------------------------------------------+-------+---------+-------+--------------+-----------+---------+-------------------+
|tt0000009|movie    |Miss Jerry                                     |Miss Jerry                                     |0      |1894     |\N     |45            |Romance    |NULL     |2025-06-23 09:17:33|
|tt0000147|movie    |The Corbett-Fitzsimmons Fight                  |The Corbett-Fitzsimmons Fight                  |0      |1897     |\N     |100           |Documentary|News     |2025-06-23 09:17

In [7]:
#level 2
# ✅ Advanced Query - Combined Pushdown Filter
advanced_filtered = df.filter(
    (col("titleType") == "movie") &
    (col("startYear") == "2024") &
    (col("genre_1") == "Drama")
)

print("🔍 ADVANCED FILTER - Movies from 2024 in Drama")
advanced_filtered.explain(mode="formatted")
advanced_filtered.show(1, truncate=False)

🔍 ADVANCED FILTER - Movies from 2024 in Drama
== Physical Plan ==
* Filter (3)
+- * ColumnarToRow (2)
   +- Scan parquet  (1)


(1) Scan parquet 
Output [11]: [tconst#141, titleType#142, primaryTitle#143, originalTitle#144, isAdult#145, startYear#146, endYear#147, runtimeMinutes#148, genre_1#149, genre_2#150, ingested_at#151]
Batched: true
Location: InMemoryFileIndex [file:/Users/aryan/Desktop/project/data/parquet_data/title_basics.parquet]
PushedFilters: [IsNotNull(titleType), IsNotNull(startYear), IsNotNull(genre_1), EqualTo(titleType,movie), EqualTo(startYear,2024), EqualTo(genre_1,Drama)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genre_1:string,genre_2:string,ingested_at:string>

(2) ColumnarToRow [codegen id : 1]
Input [11]: [tconst#141, titleType#142, primaryTitle#143, originalTitle#144, isAdult#145, startYear#146, endYear#147, runtimeMinutes#148, genre_1#149, gen

In [12]:
df_no_pushdown = spark.read.parquet(path)
df_filtered = df_no_pushdown.filter(col("titleType") == "movie")

df_filtered.explain(mode="formatted")
df_filtered.show(1, truncate=False)

== Physical Plan ==
* Filter (3)
+- * ColumnarToRow (2)
   +- Scan parquet  (1)


(1) Scan parquet 
Output [11]: [tconst#650, titleType#651, primaryTitle#652, originalTitle#653, isAdult#654, startYear#655, endYear#656, runtimeMinutes#657, genre_1#658, genre_2#659, ingested_at#660]
Batched: true
Location: InMemoryFileIndex [file:/Users/aryan/Desktop/project/data/parquet_data/title_basics.parquet]
PushedFilters: [IsNotNull(titleType), EqualTo(titleType,movie)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genre_1:string,genre_2:string,ingested_at:string>

(2) ColumnarToRow [codegen id : 1]
Input [11]: [tconst#650, titleType#651, primaryTitle#652, originalTitle#653, isAdult#654, startYear#655, endYear#656, runtimeMinutes#657, genre_1#658, genre_2#659, ingested_at#660]

(3) Filter [codegen id : 1]
Input [11]: [tconst#650, titleType#651, primaryTitle#652, originalTitle#653, isAd

In [None]:
# # User Code (DataFrame API)

# Logical Plan (What to do)
# >>>Optimized Logical Plan (Simplified version)
#         ↓
# >>>Physical Plan (How to do it)
#         ↓
# >>> Executed by Spark Engine

In [8]:
from pyspark.sql.functions import col

# Read Parquet files
title_basics = spark.read.parquet(parquet_files["title_basics"])
title_crew = spark.read.parquet(parquet_files["title_crew"])

# Apply early filter to reduce data before join (pushdown)
filtered_basics = title_basics.filter(col("titleType") == "movie")

# Perform join
joined_df = filtered_basics.join(title_crew, on="tconst", how="inner")

# Select and display a few useful columns
result_df = joined_df.select("tconst", "primaryTitle", "directors", "writers")

# Show 5 records
result_df.show(5, truncate=False)

# Print physical plan
result_df.explain(mode="formatted")



+---------+--------------------------------+---------+---------+
|tconst   |primaryTitle                    |directors|writers  |
+---------+--------------------------------+---------+---------+
|tt0000591|The Prodigal Son                |nm0141150|nm0141150|
|tt0000867|Fiesta de toros                 |nm0023107|\N       |
|tt0000868|Fiestas de Santa Lucía - Belenes|nm0005717|nm0005717|
|tt0000886|Hamlet                          |nm0099901|nm0000636|
|tt0001007|La primera y segunda casetas    |nm0185426|\N       |
+---------+--------------------------------+---------+---------+
only showing top 5 rows
== Physical Plan ==
AdaptiveSparkPlan (12)
+- Project (11)
   +- SortMergeJoin Inner (10)
      :- Sort (5)
      :  +- Exchange (4)
      :     +- Project (3)
      :        +- Filter (2)
      :           +- Scan parquet  (1)
      +- Sort (9)
         +- Exchange (8)
            +- Filter (7)
               +- Scan parquet  (6)


(1) Scan parquet 
Output [3]: [tconst#221, titleType#222

                                                                                

In [None]:
# Filter early to reduce join data size.
# 	•	Avoid joins on huge unsorted datasets without bucketing/salting.
# 	•	Use broadcast() on smaller DF to trigger Broadcast Join if needed.


In [9]:
from pyspark.sql.functions import split, explode, trim

# Step 1: Load data
title_basics = spark.read.parquet(parquet_files["title_basics"])
title_crew = spark.read.parquet(parquet_files["title_crew"])
name_basics = spark.read.parquet(os.path.join(parquet_base, "name_basics.parquet"))

# Step 2: Filter only movies
movies = title_basics.filter(col("titleType") == "movie")

# Step 3: Join title_basics + title_crew
crew_joined = movies.join(title_crew, on="tconst", how="inner")

# Step 4: Explode director IDs (split on comma)
directors_df = crew_joined \
    .withColumn("director_id", explode(split(col("directors"), ","))) \
    .drop("writers")  # Optional

# Step 5: Join with name_basics to get actual names
final_df = directors_df.join(name_basics, directors_df.director_id == name_basics.nconst, how="left") \
    .select("tconst", "primaryTitle", "director_id", "primaryName") \
    .withColumnRenamed("primaryName", "director_name")

# Step 6: Show
final_df.show(5, truncate=False)

                                                                                

+---------+-----------------------------+-----------+-----------------------------------+
|tconst   |primaryTitle                 |director_id|director_name                      |
+---------+-----------------------------+-----------+-----------------------------------+
|tt0000675|Don Quijote                  |nm0194088  |Narciso Cuyàs                      |
|tt0000147|The Corbett-Fitzsimmons Fight|nm0714557  |Enoch J. Rector                    |
|tt0001422|Trail to the West            |nm0001908  |Gilbert M. 'Broncho Billy' Anderson|
|tt0000838|A Cultura do Cacau           |nm0017074  |Ernesto de Albuquerque             |
|tt0000850|Los dos hermanos             |nm0063413  |Ricardo de Baños                   |
+---------+-----------------------------+-----------+-----------------------------------+
only showing top 5 rows


In [10]:
final_df.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (21)
+- Project (20)
   +- SortMergeJoin LeftOuter (19)
      :- Sort (14)
      :  +- Exchange (13)
      :     +- Generate (12)
      :        +- Project (11)
      :           +- SortMergeJoin Inner (10)
      :              :- Sort (5)
      :              :  +- Exchange (4)
      :              :     +- Project (3)
      :              :        +- Filter (2)
      :              :           +- Scan parquet  (1)
      :              +- Sort (9)
      :                 +- Exchange (8)
      :                    +- Filter (7)
      :                       +- Scan parquet  (6)
      +- Sort (18)
         +- Exchange (17)
            +- Filter (16)
               +- Scan parquet  (15)


(1) Scan parquet 
Output [3]: [tconst#251, titleType#252, primaryTitle#253]
Batched: true
Location: InMemoryFileIndex [file:/Users/aryan/Desktop/project/data/parquet_data/title_basics.parquet]
PushedFilters: [IsNotNull(titleType), EqualTo(titleType,movie), IsNotNull

In [16]:
df = title_basics.filter(col("titleType") == "movie")
df.explain(mode="formatted")

== Physical Plan ==
* Filter (3)
+- * ColumnarToRow (2)
   +- Scan parquet  (1)


(1) Scan parquet 
Output [11]: [tconst#741, titleType#742, primaryTitle#743, originalTitle#744, isAdult#745, startYear#746, endYear#747, runtimeMinutes#748, genre_1#749, genre_2#750, ingested_at#751]
Batched: true
Location: InMemoryFileIndex [file:/Users/aryan/Desktop/project/data/parquet_data/title_basics.parquet]
PushedFilters: [IsNotNull(titleType), EqualTo(titleType,movie)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genre_1:string,genre_2:string,ingested_at:string>

(2) ColumnarToRow [codegen id : 1]
Input [11]: [tconst#741, titleType#742, primaryTitle#743, originalTitle#744, isAdult#745, startYear#746, endYear#747, runtimeMinutes#748, genre_1#749, genre_2#750, ingested_at#751]

(3) Filter [codegen id : 1]
Input [11]: [tconst#741, titleType#742, primaryTitle#743, originalTitle#744, isAd

In [11]:
df = title_basics.filter(col("titleType") == "movie")

# Logical plan (developer-friendly)
print(df._jdf.queryExecution().logical())

# Optimized plan (after Catalyst optimizations)
print(df._jdf.queryExecution().optimizedPlan())

# Physical plan (actual execution)
print(df._jdf.queryExecution().executedPlan())

'Filter '`=`('titleType, movie)
+- Relation [tconst#251,titleType#252,primaryTitle#253,originalTitle#254,isAdult#255,startYear#256,endYear#257,runtimeMinutes#258,genre_1#259,genre_2#260,ingested_at#261] parquet

Filter (isnotnull(titleType#252) AND (titleType#252 = movie))
+- Relation [tconst#251,titleType#252,primaryTitle#253,originalTitle#254,isAdult#255,startYear#256,endYear#257,runtimeMinutes#258,genre_1#259,genre_2#260,ingested_at#261] parquet

*(1) Filter (isnotnull(titleType#252) AND (titleType#252 = movie))
+- *(1) ColumnarToRow
   +- FileScan parquet [tconst#251,titleType#252,primaryTitle#253,originalTitle#254,isAdult#255,startYear#256,endYear#257,runtimeMinutes#258,genre_1#259,genre_2#260,ingested_at#261] Batched: true, DataFilters: [isnotnull(titleType#252), (titleType#252 = movie)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/aryan/Desktop/project/data/parquet_data/title_basics.parquet], PartitionFilters: [], PushedFilters: [IsNotNull(titleType), Equal

In [12]:
from pyspark.sql.functions import col
# Get All Movie Titles with Their Directors & Writers
title_basics = spark.read.parquet(parquet_files["title_basics"])
title_crew = spark.read.parquet(parquet_files["title_crew"])

In [13]:
movies_df = title_basics.filter(col("titleType") == "movie").select("tconst", "primaryTitle")

In [14]:
movie_crew_df = movies_df.join(
    title_crew,
    on="tconst",
    how="inner"
).select("tconst", "primaryTitle", "directors", "writers")

In [15]:
# EXPLAIN plan
movie_crew_df.explain(mode="formatted")

# Query Execution Details
print("\n🧠 Logical Plan:")
print(movie_crew_df._jdf.queryExecution().logical())

print("\n🛠 Optimized Plan:")
print(movie_crew_df._jdf.queryExecution().optimizedPlan())

print("\n⚙️ Physical Plan:")
print(movie_crew_df._jdf.queryExecution().executedPlan())

== Physical Plan ==
AdaptiveSparkPlan (12)
+- Project (11)
   +- SortMergeJoin Inner (10)
      :- Sort (5)
      :  +- Exchange (4)
      :     +- Project (3)
      :        +- Filter (2)
      :           +- Scan parquet  (1)
      +- Sort (9)
         +- Exchange (8)
            +- Filter (7)
               +- Scan parquet  (6)


(1) Scan parquet 
Output [3]: [tconst#300, titleType#301, primaryTitle#302]
Batched: true
Location: InMemoryFileIndex [file:/Users/aryan/Desktop/project/data/parquet_data/title_basics.parquet]
PushedFilters: [IsNotNull(titleType), EqualTo(titleType,movie), IsNotNull(tconst)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string>

(2) Filter
Input [3]: [tconst#300, titleType#301, primaryTitle#302]
Condition : ((isnotnull(titleType#301) AND (titleType#301 = movie)) AND isnotnull(tconst#300))

(3) Project
Output [2]: [tconst#300, primaryTitle#302]
Input [3]: [tconst#300, titleType#301, primaryTitle#302]

(4) Exchange
Input [2]: [tconst#300, pri

In [16]:
movie_crew_df.show(5, truncate=False)



+---------+--------------------------------+---------+---------+
|tconst   |primaryTitle                    |directors|writers  |
+---------+--------------------------------+---------+---------+
|tt0000591|The Prodigal Son                |nm0141150|nm0141150|
|tt0000867|Fiesta de toros                 |nm0023107|\N       |
|tt0000868|Fiestas de Santa Lucía - Belenes|nm0005717|nm0005717|
|tt0000886|Hamlet                          |nm0099901|nm0000636|
|tt0001007|La primera y segunda casetas    |nm0185426|\N       |
+---------+--------------------------------+---------+---------+
only showing top 5 rows


                                                                                

In [17]:
title_basics = spark.read.parquet(parquet_files["title_basics"])
title_crew = spark.read.parquet(parquet_files["title_crew"])
title_akas = spark.read.parquet(parquet_files["title_akas"])
name_basics = spark.read.parquet(os.path.join(parquet_base, "name_basics.parquet"))



In [18]:
title_ratings = spark.read.parquet(os.path.join(parquet_base, "title_ratings.parquet"))


In [19]:
# Filter High-Rated Movies (Predicate Pushdown)
high_rated = title_ratings.filter(col("averageRating") >= 8.5)

In [20]:
high_rated.show(5, truncate=False)

+---------+-------------+--------+-------------------+
|tconst   |averageRating|numVotes|ingested_at        |
+---------+-------------+--------+-------------------+
|tt0000961|8.8          |23      |2025-06-23 00:14:51|
|tt0001025|8.7          |18      |2025-06-23 00:14:51|
|tt0001456|8.5          |21      |2025-06-23 00:14:51|
|tt0002145|9.0          |15      |2025-06-23 00:14:51|
|tt0002234|8.5          |28      |2025-06-23 00:14:51|
+---------+-------------+--------+-------------------+
only showing top 5 rows


In [21]:
high_rated_movies = high_rated.join(
    title_basics.filter(col("titleType") == "movie"),
    on="tconst",
    how="inner"
)

In [22]:
movie_with_crew = high_rated_movies.join(title_crew, on="tconst", how="left")

In [23]:
from pyspark.sql.functions import split, explode

# Explode director IDs (comma-separated)
movie_with_directors = movie_with_crew.withColumn("director_id", explode(split(col("directors"), ",")))

# Join with name_basics
movie_with_director_names = movie_with_directors.join(
    name_basics.select(col("nconst").alias("director_id"), col("primaryName")),
    on="director_id",
    how="left"
)

In [24]:
final_df = movie_with_director_names.join(
    title_akas.select("titleId", "title", "region"),
    movie_with_director_names.tconst == title_akas.titleId,
    how="left"
).drop("titleId")

In [26]:
final_df.show(5, truncate=False)

[Stage 76:>                                                         (0 + 8) / 8]

+-----------+---------+-------------+--------+-------------------+---------+----------------------+----------------+-------+---------+-------+--------------+-------+-------+-------------------+---------+---------+-------------------+------------+-----------------+------+
|director_id|tconst   |averageRating|numVotes|ingested_at        |titleType|primaryTitle          |originalTitle   |isAdult|startYear|endYear|runtimeMinutes|genre_1|genre_2|ingested_at        |directors|writers  |ingested_at        |primaryName |title            |region|
+-----------+---------+-------------+--------+-------------------+---------+----------------------+----------------+-------+---------+-------+--------------+-------+-------+-------------------+---------+---------+-------------------+------------+-----------------+------+
|nm0560785  |tt0007205|9.0          |16      |2025-06-23 00:14:51|movie    |Powder                |Powder          |0      |1916     |\N     |\N            |Drama  |War    |2025-06-23 

                                                                                

In [25]:
final_df.cache()

DataFrame[director_id: string, tconst: string, averageRating: double, numVotes: int, ingested_at: string, titleType: string, primaryTitle: string, originalTitle: string, isAdult: string, startYear: string, endYear: string, runtimeMinutes: string, genre_1: string, genre_2: string, ingested_at: string, directors: string, writers: string, ingested_at: string, primaryName: string, title: string, region: string]

In [None]:
# Use cache() when this DF is reused multiple times
# 	•	❌ Don’t cache unnecessarily (it eats memory)


In [27]:
output_df = final_df.select("primaryTitle", "primaryName", "averageRating", "region", "title")

In [28]:
output_df.show(5, truncate=False)

+----------------------+------------+-------------+------+-----------------+
|primaryTitle          |primaryName |averageRating|region|title            |
+----------------------+------------+-------------+------+-----------------+
|Powder                |Arthur Maude|9.0          |\N    |Powder           |
|Powder                |Arthur Maude|9.0          |US    |Powder           |
|The Monastery's Hunter|Franz Osten |8.5          |\N    |Der Klosterjäger |
|The Monastery's Hunter|Franz Osten |8.5          |HU    |A hegyek pásztora|
|The Monastery's Hunter|Franz Osten |8.5          |DE    |Der Klosterjäger |
+----------------------+------------+-------------+------+-----------------+
only showing top 5 rows


In [29]:
output_df = output_df.coalesce(1)

In [30]:
output_df.show(5, truncate=False)

+----------------------+------------+-------------+------+-----------------+
|primaryTitle          |primaryName |averageRating|region|title            |
+----------------------+------------+-------------+------+-----------------+
|Powder                |Arthur Maude|9.0          |\N    |Powder           |
|Powder                |Arthur Maude|9.0          |US    |Powder           |
|The Monastery's Hunter|Franz Osten |8.5          |\N    |Der Klosterjäger |
|The Monastery's Hunter|Franz Osten |8.5          |HU    |A hegyek pásztora|
|The Monastery's Hunter|Franz Osten |8.5          |DE    |Der Klosterjäger |
+----------------------+------------+-------------+------+-----------------+
only showing top 5 rows


In [31]:
# # Final cleanup
# output_df = final_df.select("primaryTitle", "primaryName", "averageRating", "region", "title")

# Optional: reduce partitions for small result set
output_df = output_df.coalesce(1)

# Save or display
output_df.show(10, truncate=False)

+-------------------------+-------------+-------------+------+-------------------------+
|primaryTitle             |primaryName  |averageRating|region|title                    |
+-------------------------+-------------+-------------+------+-------------------------+
|Powder                   |Arthur Maude |9.0          |\N    |Powder                   |
|Powder                   |Arthur Maude |9.0          |US    |Powder                   |
|The Monastery's Hunter   |Franz Osten  |8.5          |\N    |Der Klosterjäger         |
|The Monastery's Hunter   |Franz Osten  |8.5          |HU    |A hegyek pásztora        |
|The Monastery's Hunter   |Franz Osten  |8.5          |DE    |Der Klosterjäger         |
|The Monastery's Hunter   |Franz Osten  |8.5          |XWW   |The Monastery's Hunter   |
|The Rider of the King Log|Harry O. Hoyt|8.5          |\N    |The Rider of the King Log|
|The Rider of the King Log|Harry O. Hoyt|8.5          |GB    |The Rider of the King Log|
|The Rider of the Kin

In [32]:
output_df.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (81)
+- Coalesce (80)
   +- InMemoryTableScan (1)
         +- InMemoryRelation (2)
               +- AdaptiveSparkPlan (79)
                  +- == Final Plan ==
                     ResultQueryStage (49)
                     +- * Project (48)
                        +- * SortMergeJoin LeftOuter (47)
                           :- * Sort (40)
                           :  +- ShuffleQueryStage (39), Statistics(sizeInBytes=5.2 MiB, rowCount=1.32E+4)
                           :     +- Exchange (38)
                           :        +- * Project (37)
                           :           +- * SortMergeJoin LeftOuter (36)
                           :              :- * Sort (28)
                           :              :  +- ShuffleQueryStage (27), Statistics(sizeInBytes=4.9 MiB, rowCount=1.32E+4)
                           :              :     +- Exchange (26)
                           :              :        +- * Generate (25)
                    

In [None]:
# PySpark Complex Optimization Case Study using IMDB Dataset

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode, count, avg, broadcast, pandas_udf, PandasUDFType
from pyspark.sql.types import StringType
import os

# ------------------------- Spark Session -------------------------
custom_config = {
    "spark.executor.memory": "6g",
    "spark.driver.memory": "4g",
    "spark.default.parallelism": "8",
    "spark.sql.shuffle.partitions": "8",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.autoBroadcastJoinThreshold": "10485760",  # 10MB
    "spark.sql.execution.arrow.pyspark.enabled": "true"
}

spark = SparkSession.builder \
    .appName("IMDB_Advanced_Optimizations") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.executor.memory", custom_config["spark.executor.memory"]) \
    .config("spark.driver.memory", custom_config["spark.driver.memory"]) \
    .config("spark.default.parallelism", custom_config["spark.default.parallelism"]) \
    .config("spark.sql.shuffle.partitions", custom_config["spark.sql.shuffle.partitions"]) \
    .config("spark.sql.adaptive.enabled", custom_config["spark.sql.adaptive.enabled"]) \
    .config("spark.sql.autoBroadcastJoinThreshold", custom_config["spark.sql.autoBroadcastJoinThreshold"]) \
    .getOrCreate()

# ------------------------- Load Datasets -------------------------
base_path = "/Users/aryan/Desktop/project/data/parquet_data"

basics = spark.read.parquet(os.path.join(base_path, "title_basics.parquet"))
crew = spark.read.parquet(os.path.join(base_path, "title_crew.parquet"))
ratings = spark.read.parquet(os.path.join(base_path, "title_ratings.parquet"))
akas = spark.read.parquet(os.path.join(base_path, "title_akas.parquet"))
names = spark.read.parquet(os.path.join(base_path, "name_basics.parquet"))

# ------------------------- Optimization: Predicate Pushdown -------------------------
movies_df = basics.filter(col("titleType") == "movie")


25/06/23 09:55:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:

# ------------------------- Optimization: Broadcast Join -------------------------
director_df = crew.withColumn("director_id", explode(split("directors", ",")))
name_df = names.select(col("nconst").alias("director_id"), "primaryName")

# Assume name_basics is small enough for broadcast
broadcast_name_df = broadcast(name_df)

joined_df = director_df.join(broadcast_name_df, on="director_id", how="left")




In [38]:
broadcast_name_df.show(5, truncate=False)

+-----------+-----------------+
|director_id|primaryName      |
+-----------+-----------------+
|nm10696194 |Monique Alexander|
|nm10696195 |Lyndon Gunning   |
|nm10696197 |Madi Paige Spar  |
|nm10696199 |Brielle Buie     |
|nm1069620  |Erickson Lantigua|
+-----------+-----------------+
only showing top 5 rows


25/06/23 10:03:39 WARN HintErrorLogger: A join hint (strategy=broadcast) is specified but it is not part of a join relation.


In [35]:
joined_df.show(5, truncate=False)

25/06/23 09:56:16 WARN MemoryStore: Not enough space to cache broadcast_80 in memory! (computed 1968.0 MiB so far)
25/06/23 09:56:16 WARN BlockManager: Persisting block broadcast_80 to disk instead.


+-----------+---------+---------+---------+-------------------+----------------+
|director_id|tconst   |directors|writers  |ingested_at        |primaryName     |
+-----------+---------+---------+---------+-------------------+----------------+
|nm5019406  |tt3803084|nm5019406|nm5019406|2025-06-23 09:18:25|Katie Bonham    |
|nm6236402  |tt3803086|nm6236402|\N       |2025-06-23 09:18:25|Ollie Durrant   |
|nm6568829  |tt3803088|nm6568829|nm6568829|2025-06-23 09:18:25|Julie Bergman   |
|nm0732218  |tt3803090|nm0732218|nm0240384|2025-06-23 09:18:25|Michael M. Robin|
|nm2005799  |tt3803092|nm2005799|nm1574898|2025-06-23 09:18:25|Dallas Thomas   |
+-----------+---------+---------+---------+-------------------+----------------+
only showing top 5 rows


25/06/23 09:56:24 WARN MemoryStore: Not enough space to cache broadcast_80 in memory! (computed 1968.0 MiB so far)


In [36]:
# ------------------------- Optimization: Multi Joins + Ratings -------------------------
movies_with_ratings = movies_df.join(ratings, on="tconst")

final_df = movies_with_ratings \
    .join(joined_df, on="tconst", how="left") \
    .join(akas.select("titleId", "title", "region"), movies_with_ratings.tconst == akas.titleId, how="left") \
    .drop("titleId")

In [37]:
final_df.show(5, truncate=False)

25/06/23 10:02:07 WARN MemoryStore: Not enough space to cache broadcast_91 in memory! (computed 1968.0 MiB so far)
25/06/23 10:02:07 WARN BlockManager: Persisting block broadcast_91 to disk instead.
25/06/23 10:02:14 WARN MemoryStore: Not enough space to cache broadcast_91 in memory! (computed 1968.0 MiB so far)
[Stage 125:>                                                        (0 + 1) / 1]

+---------+---------+----------------+------------------+-------+---------+-------+--------------+-------+-------+-------------------+-------------+--------+-------------------+-----------+---------+---------+-------------------+----------------+-----------------+------+
|tconst   |titleType|primaryTitle    |originalTitle     |isAdult|startYear|endYear|runtimeMinutes|genre_1|genre_2|ingested_at        |averageRating|numVotes|ingested_at        |director_id|directors|writers  |ingested_at        |primaryName     |title            |region|
+---------+---------+----------------+------------------+-------+---------+-------+--------------+-------+-------+-------------------+-------------+--------+-------------------+-----------+---------+---------+-------------------+----------------+-----------------+------+
|tt0000591|movie    |The Prodigal Son|L'enfant prodigue |0      |1907     |\N     |90            |Drama  |NULL   |2025-06-23 09:17:33|5.4          |33      |2025-06-23 00:14:51|nm01411

                                                                                

In [None]:
# ------------------------- Optimization: Multi Joins + Ratings -------------------------
movies_with_ratings = movies_df.join(ratings, on="tconst")

final_df = movies_with_ratings \
    .join(joined_df, on="tconst", how="left") \
    .join(akas.select("titleId", "title", "region"), movies_with_ratings.tconst == akas.titleId, how="left") \
    .drop("titleId")

In [39]:
final_df.show(5, truncate=False)

25/06/23 10:07:14 WARN MemoryStore: Not enough space to cache broadcast_105 in memory! (computed 1968.0 MiB so far)
25/06/23 10:07:14 WARN BlockManager: Persisting block broadcast_105 to disk instead.
25/06/23 10:07:22 WARN MemoryStore: Not enough space to cache broadcast_105 in memory! (computed 1968.0 MiB so far)
[Stage 136:>                                                        (0 + 1) / 1]

+---------+---------+----------------+------------------+-------+---------+-------+--------------+-------+-------+-------------------+-------------+--------+-------------------+-----------+---------+---------+-------------------+----------------+-----------------+------+
|tconst   |titleType|primaryTitle    |originalTitle     |isAdult|startYear|endYear|runtimeMinutes|genre_1|genre_2|ingested_at        |averageRating|numVotes|ingested_at        |director_id|directors|writers  |ingested_at        |primaryName     |title            |region|
+---------+---------+----------------+------------------+-------+---------+-------+--------------+-------+-------+-------------------+-------------+--------+-------------------+-----------+---------+---------+-------------------+----------------+-----------------+------+
|tt0000591|movie    |The Prodigal Son|L'enfant prodigue |0      |1907     |\N     |90            |Drama  |NULL   |2025-06-23 09:17:33|5.4          |33      |2025-06-23 00:14:51|nm01411

                                                                                

In [40]:

# ------------------------- Optimization: Window Function + Partitioning -------------------------
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec = Window.partitionBy("region").orderBy(col("averageRating").desc())
region_top_movies = final_df.withColumn("rank", row_number().over(windowSpec)).filter(col("rank") <= 3)


In [41]:
region_top_movies.show(10, truncate=False)

25/06/23 10:08:51 WARN MemoryStore: Not enough space to cache broadcast_117 in memory! (computed 1968.0 MiB so far)
25/06/23 10:08:51 WARN BlockManager: Persisting block broadcast_117 to disk instead.
25/06/23 10:08:59 WARN MemoryStore: Not enough space to cache broadcast_117 in memory! (computed 1968.0 MiB so far)



25/06/23 10:09:02 WARN TaskMemoryManager: Failed to allocate a page (16777200 bytes), try again.

+---------+---------+-------------------------------------------------+-------------------------+----------------------------------------+-----------------------+-------+--------------+---------+---------+-------------------+-------------+--------+-------------------+-----------+---------+---------+-------------------+-----------------+---------------------+----------------------------------------+----+
|tconst   |titleType|primaryTitle                                     |originalTitle            |isAdult                                 |startYear              |endYear|runtimeMinutes|genre_1  |genre_2  |ingested_at        |averageRating|numVotes|ingested_at        |director_id|directors|writers  |ingested_at        |primaryName      |title                |region                                  |rank|
+---------+---------+-------------------------------------------------+-------------------------+----------------------------------------+-----------------------+-------+--------------+-

                                                                                

In [43]:

# ------------------------- Optimization: Pandas UDF -------------------------
@pandas_udf(StringType(), PandasUDFType.SCALAR)
def clean_region_code(region_series):
    return region_series.fillna("XX").str.upper()

region_top_movies = region_top_movies.withColumn("region", clean_region_code("region"))




In [45]:
region_top_movies.show()

25/06/23 10:13:36 WARN MemoryStore: Not enough space to cache broadcast_130 in memory! (computed 1968.0 MiB so far)
25/06/23 10:13:36 WARN BlockManager: Persisting block broadcast_130 to disk instead.
25/06/23 10:13:44 WARN MemoryStore: Not enough space to cache broadcast_130 in memory! (computed 1968.0 MiB so far)
[Stage 168:>                                                        (0 + 1) / 1]

+----------+---------+--------------------+--------------------+--------------------+--------------------+-------+--------------+---------+-----------+-------------------+-------------+--------+-------------------+-----------+---------+----------+-------------------+-----------------+--------------------+--------------------+----+
|    tconst|titleType|        primaryTitle|       originalTitle|             isAdult|           startYear|endYear|runtimeMinutes|  genre_1|    genre_2|        ingested_at|averageRating|numVotes|        ingested_at|director_id|directors|   writers|        ingested_at|      primaryName|               title|              region|rank|
+----------+---------+--------------------+--------------------+--------------------+--------------------+-------+--------------+---------+-----------+-------------------+-------------+--------+-------------------+-----------+---------+----------+-------------------+-----------------+--------------------+--------------------+----+
|

                                                                                

In [46]:

# ------------------------- Shuffle Optimization: groupByKey vs reduceByKey -------------------------
# Simulated with rdd on genre_1
rdd = movies_df.select("genre_1", "tconst").rdd

# BAD: groupByKey (causes wide shuffle)
grouped_rdd = rdd.groupByKey().mapValues(lambda x: len(list(x)))

# GOOD: reduceByKey (combines locally before shuffle)
reduced_rdd = rdd.map(lambda x: (x[0], 1)).reduceByKey(lambda x, y: x + y)

print("Genre counts using reduceByKey:")
print(reduced_rdd.take(10))

# ------------------------- Export or Display -------------------------
region_top_movies.select("primaryTitle", "primaryName", "averageRating", "region", "title", "rank").show(truncate=False)



Genre counts using reduceByKey:


                                                                                

[('Drama', 179774), ('150', 7), ('54', 12), ('Fantasy', 5006), ('Family', 5050), ('121', 11), ('Sport', 2160), ('50', 14), ('112', 20), ('Musical', 3198)]


25/06/23 10:15:14 WARN MemoryStore: Not enough space to cache broadcast_147 in memory! (computed 1968.0 MiB so far)
25/06/23 10:15:14 WARN BlockManager: Persisting block broadcast_147 to disk instead.
25/06/23 10:15:23 WARN MemoryStore: Not enough space to cache broadcast_147 in memory! (computed 1968.0 MiB so far)



25/06/23 10:15:25 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.

+-------------------------------------------------+-----------------------+-------------+------------------------------------------------------------+---------------------------------------------------------------------+----+
|primaryTitle                                     |primaryName            |averageRating|region                                                      |title                                                                |rank|
+-------------------------------------------------+-----------------------+-------------+------------------------------------------------------------+---------------------------------------------------------------------+----+
|The Wonderful Story of Henry Sugar and Three More|Wes Anderson           |7.1          | 'A PATKÁNYFOGÓ' ÉS A 'MÉREG'                               |Henry Sugar csodálatos története és három további novella: 'A hattyú'|1   |
|Isao Takahata and His Tale of Princess Kaguya    |Akira Miki             |7.5          | 'KAGUY

                                                                                

In [47]:
import os
import sys

# Setup sys path to use config
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), "../../")))

from src.config import settings
from src.config.spark_config import get_spark_session
from pyspark.sql.functions import col, explode, split, count, desc
from pyspark.sql.types import StringType

# Optional: install pyarrow and pandas >= 2.0 if not already
# pip install pyarrow pandas --upgrade

# ----------------------
# Step 1: Tuning Config
# ----------------------
custom_tuning = {
    "spark.executor.memory": "8g",
    "spark.driver.memory": "6g",
    "spark.executor.cores": "4",
    "spark.sql.shuffle.partitions": "8",
    "spark.default.parallelism": "8",
    "spark.memory.fraction": "0.8",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.autoBroadcastJoinThreshold": "-1",
    "spark.sql.execution.arrow.pyspark.enabled": "true"
}

spark = get_spark_session(app_name="IMDB_Optimized_ETL", custom_config=custom_tuning)

# ----------------------
# Step 2: Load Data
# ----------------------
parquet_base = os.path.join(settings.BASE_DIR, "data", "parquet_data")
files = {
    "basics": "title_basics.parquet",
    "akas": "title_akas.parquet",
    "crew": "title_crew.parquet",
    "name": "name_basics.parquet",
    "ratings": "title_ratings.parquet"
}
df_basics = spark.read.parquet(os.path.join(parquet_base, files["basics"]))
df_crew = spark.read.parquet(os.path.join(parquet_base, files["crew"]))
df_names = spark.read.parquet(os.path.join(parquet_base, files["name"]))
df_ratings = spark.read.parquet(os.path.join(parquet_base, files["ratings"]))

# Pushdown filter (predicate pushdown happens here)
df_movies = df_basics.filter((col("titleType") == "movie") & (col("isAdult") == "0"))

# ----------------------
# Step 3: Repartition + Persist
# ----------------------
df_movies = df_movies.repartition(8).persist()
df_ratings = df_ratings.repartition(4).persist()
df_crew = df_crew.repartition(4).persist()

# ----------------------
# Step 4: Join Ratings with Movies
# ----------------------
df_joined = df_movies.join(df_ratings, on="tconst", how="inner")

# ----------------------
# Step 5: Enrich with Crew (director/writer)
# ----------------------
df_joined = df_joined.join(df_crew, on="tconst", how="left")

# ----------------------
# Step 6: Enrich with Director Name
# ----------------------
df_names = df_names.select("nconst", "primaryName").withColumnRenamed("primaryName", "directorName")

df_joined = df_joined.withColumn("director_id", split(col("directors"), ",").getItem(0))
df_joined = df_joined.join(df_names, df_joined["director_id"] == df_names["nconst"], how="left")

# ----------------------
# Step 7: Final Output - Top Rated Movies with Directors
# ----------------------
result = df_joined.select(
    "primaryTitle", "startYear", "runtimeMinutes",
    "averageRating", "numVotes", "directorName"
).orderBy(desc("averageRating"))

result.show(10, truncate=False)

# ----------------------
# Step 8: Explain Plan
# ----------------------
print("⚙️ EXPLAIN PLAN:")
result.explain(mode="formatted")

# ----------------------
# Step 9: groupByKey vs reduceByKey demo
# ----------------------
from pyspark.sql import Row
from pyspark import SparkContext

sc = spark.sparkContext

sample_data = [
    ("Drama", 1), ("Action", 1), ("Drama", 1),
    ("Comedy", 1), ("Action", 1), ("Drama", 1)
]
rdd = sc.parallelize(sample_data)

print("✅ reduceByKey (recommended):")
rdd.reduceByKey(lambda a, b: a + b).collect()

print("⚠️ groupByKey (memory heavy):")
grouped = rdd.groupByKey().mapValues(lambda x: sum(x)).collect()




25/06/23 10:16:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+--------------------------------------+---------+--------------+-------------+--------+----------------------------+
|primaryTitle                          |startYear|runtimeMinutes|averageRating|numVotes|directorName                |
+--------------------------------------+---------+--------------+-------------+--------+----------------------------+
|Guru Badul                            |1988     |110           |10.0         |5       |A.R. Badul                  |
|The New Guest                         |2025     |\N            |10.0         |7       |Delinda Kay                 |
|One Decision                          |2022     |48            |10.0         |6       |George Kirvin               |
|Adana Taskapi                         |2022     |\N            |10.0         |6       |Ali Özgün                   |
|La casa italiana                      |2025     |\N            |10.0         |9       |Bob Spjuth                  |
|Saybrook: The Tully Girls             |2011     |67    

In [49]:
# # ----------------------
# # Step 10: Optional Pandas UDF (if available)
# # ----------------------
# try:
#     import pandas as pd
#     import pyarrow

#     from pyspark.sql.functions import pandas_udf
#     from pyspark.sql.types import DoubleType

#     @pandas_udf(DoubleType())
#     def convert_minutes_to_hours(runtime: pd.Series) -> pd.Series:
#         return runtime.fillna("0").astype(float) / 60.0

#     df_with_hours = result.withColumn("runtimeHours", convert_minutes_to_hours("runtimeMinutes"))
#     df_with_hours.select("primaryTitle", "runtimeMinutes", "runtimeHours").show(5)
# except ImportError:
#     print("❌ Pandas or PyArrow not installed. Skipping vectorized UDF.")


In [67]:
# This is a configuration-level concept, not code-specific
print("⚙️ Vertical Scaling → More memory/cores per executor (e.g., spark.executor.memory = '6g')")
print("⚙️ Horizontal Scaling → More executors/partitions (e.g., spark.default.parallelism = 8)")

⚙️ Vertical Scaling → More memory/cores per executor (e.g., spark.executor.memory = '6g')
⚙️ Horizontal Scaling → More executors/partitions (e.g., spark.default.parallelism = 8)


In [68]:
# Partition by startYear (simulates better filtering performance)
output_path = "/tmp/imdb_partitioned"
df_movies.write.partitionBy("startYear").mode("overwrite").parquet(output_path)

# Read with filter to show partition pruning
df_partitioned = spark.read.parquet(output_path)
df_2020 = df_partitioned.filter(col("startYear") == "2020")
df_2020.show(3)

                                                                                

+----------+---------+--------------+--------------+-------+-------+--------------+--------+-------+-------------------+---------+
|    tconst|titleType|  primaryTitle| originalTitle|isAdult|endYear|runtimeMinutes| genre_1|genre_2|        ingested_at|startYear|
+----------+---------+--------------+--------------+-------+-------+--------------+--------+-------+-------------------+---------+
|tt10384246|    movie|Friend Request|Friend Request|      0|     \N|            87|Thriller|   NULL|2025-06-23 09:17:33|     2020|
|tt10214754|    movie|          Luca|          Luca|      0|     \N|           104|   Crime|  Drama|2025-06-23 09:17:33|     2020|
|tt10582422|    movie|Sweet Sunshine|Sweet Sunshine|      0|     \N|            93|   Drama|   NULL|2025-06-23 09:17:33|     2020|
+----------+---------+--------------+--------------+-------+-------+--------------+--------+-------+-------------------+---------+
only showing top 3 rows


In [69]:
# Save table using bucketing (simulated here)
spark.sql("DROP TABLE IF EXISTS bucketed_movies")
df_movies.write.bucketBy(8, "genre_1").sortBy("primaryTitle").saveAsTable("bucketed_movies", format="parquet")

print(" Bucketed table 'bucketed_movies' created for faster joins")

[Stage 315:>                                                        (0 + 8) / 8]

 Bucketed table 'bucketed_movies' created for faster joins


                                                                                

In [72]:
# Broadcast join (fast if right table is small)
df_names_small = df_names.limit(10000)

df_broadcast = df_movies.join(broadcast(df_names_small), df_movies.tconst == df_names_small.nconst, "left")
print(" Broadcast Join Count:", df_broadcast.count())

# Shuffle join (default)
df_shuffle = df_movies.join(df_names, df_movies.tconst == df_names.nconst, "left")
print("Shuffle Join Count:", df_shuffle.count())

 Broadcast Join Count: 688717
Shuffle Join Count: 688717


In [73]:
from pyspark.sql.functions import when, rand, concat, lit

# Artificial skew
df_skewed = df_movies.withColumn("skew_key", when(col("startYear") == "2020", "high").otherwise("low"))

# Salting technique
df_skewed = df_skewed.withColumn("salt", (rand() * 10).cast("int"))
df_skewed = df_skewed.withColumn("skew_key_salted", concat(col("skew_key"), lit("_"), col("salt")))

df_skewed.groupBy("skew_key_salted").count().orderBy("count", ascending=False).show(5)

+---------------+-----+
|skew_key_salted|count|
+---------------+-----+
|          low_7|67801|
|          low_3|67576|
|          low_6|67486|
|          low_9|67368|
|          low_5|67361|
+---------------+-----+
only showing top 5 rows


In [74]:
title_crew1 = spark.read.parquet(parquet_files["title_crew"])

In [75]:
dirc = Row(tconst = "tt_skewed", directors = "nm999999", writers= "nm1234567",ingested_at = "2025-06-23 01:00:00")

In [76]:
skew_row  = [dirc] * 50000  # Create 1000 skewed rows
skewed_df = spark.createDataFrame(skew_row)

In [77]:
crw = title_crew1.union(skewed_df)

In [78]:
crw.groupBy("directors").count().orderBy("count", ascending=False).show(5)

[Stage 347:>                                                       (0 + 8) / 16]

+---------+-------+
|directors|  count|
+---------+-------+
|       \N|5102640|
| nm999999|  50000|
|nm1203430|  13151|
|nm8467983|  11147|
|nm1409127|  10362|
+---------+-------+
only showing top 5 rows


                                                                                

In [79]:
df_skewed.show()

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+-------+-------------------+--------+----+---------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|    genre_1|genre_2|        ingested_at|skew_key|salt|skew_key_salted|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+-------+-------------------+--------+----+---------------+
| tt0178179|    movie|     Aakhri Nishchay|     Aakhri Nishchay|      0|     1988|     \N|            \N|      Drama|   NULL|2025-06-23 09:17:33|     low|   8|          low_8|
| tt0457337|    movie|  Ein Erpressertrick|  Ein Erpressertrick|      0|     1921|     \N|            \N|     Action|  Crime|2025-06-23 09:17:33|     low|   1|          low_1|
| tt0048423|    movie|The Night Holds T...|The Night Holds T...|      0|     1955|     \N|            86|      Crime|  D

In [80]:
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")
df_check = df_movies.select("primaryTitle", "startYear").checkpoint()
print("✅ Checkpoint done. Safe lineage created.")
df_check.show(3)

✅ Checkpoint done. Safe lineage created.
+--------------------+---------+
|        primaryTitle|startYear|
+--------------------+---------+
|     Aakhri Nishchay|     1988|
|  Ein Erpressertrick|     1921|
|The Night Holds T...|     1955|
+--------------------+---------+
only showing top 3 rows


In [81]:
print("💡 Tip: Enable in production for straggler handling")
print("spark.conf.set('spark.speculation', 'true')")

💡 Tip: Enable in production for straggler handling
spark.conf.set('spark.speculation', 'true')


25/06/23 15:33:02 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 846446 ms exceeds timeout 120000 ms
25/06/23 15:33:02 WARN SparkContext: Killing executors is not supported by current scheduler.
25/06/23 15:33:05 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$