In [3]:

import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), "../../")))

from config import settings
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim
import os
from src.config.spark_config import get_spark_session


from src.config.spark_config import get_spark_session
custom_config = {
    "spark.executor.memory": "6g",
    "spark.driver.memory": "4g",
    "spark.executor.cores": "4",
    "spark.sql.shuffle.partitions": "8",
    "spark.default.parallelism": "8",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.autoBroadcastJoinThreshold": "104857600",  # 100MB
    "spark.speculation": "true"
}
spark = get_spark_session("IMDB_Tuning_Demo", custom_config)

from pyspark.sql import SparkSession
# Optional: custom tuning for this session



# Step 2: Define Parquet folder path
parquet_base = os.path.join(settings.BASE_DIR, "data", "parquet_data")

# ✅ Cell 2: Load IMDB Parquet Data
import os
from config import settings

parquet_base = os.path.join(settings.BASE_DIR, "data", "parquet_data")
title_basics = spark.read.parquet(os.path.join(parquet_base, "title_basics.parquet"))
title_ratings = spark.read.parquet(os.path.join(parquet_base, "title_ratings.parquet"))
title_akas = spark.read.parquet(os.path.join(parquet_base, "title_akas.parquet"))
title_crew = spark.read.parquet(os.path.join(parquet_base, "title_crew.parquet"))
name_basics = spark.read.parquet(os.path.join(parquet_base, "name_basics.parquet"))

 

In [4]:
# groupByKey() vs reduceByKey() Demo
rdd = title_ratings.select("tconst", "averageRating").rdd.map(lambda row: (row[0], float(row[1])))
grouped = rdd.groupByKey().mapValues(lambda x: sum(x)/len(x)).take(5)
reduced = rdd.mapValues(lambda x: (x, 1)) \
              .reduceByKey(lambda a, b: (a[0]+b[0], a[1]+b[1])) \
              .mapValues(lambda x: x[0]/x[1]).take(5)

                                                                                

In [5]:
# ✅ Cell 4: Change Shuffle Partitions Effect
spark.conf.set("spark.sql.shuffle.partitions", "2")
title_ratings.groupBy("averageRating").count().show()
# ✅ Lower partitions can improve performance for small datasets but may bottleneck large jobs.

+-------------+-----+
|averageRating|count|
+-------------+-----+
|          6.5|35207|
|          5.3|15551|
|          7.4|57046|
|          6.1|27237|
|          5.9|22282|
|          4.7| 9013|
|          5.1|12126|
|          4.2| 6968|
|          5.6|22139|
|          4.9| 9782|
|          4.8|11772|
|          4.4| 7907|
|          3.6| 4079|
|          3.4| 3313|
|          4.0| 5792|
|          3.8| 4802|
|          3.0| 2417|
|          6.4|37743|
|          4.3| 6290|
|          3.9| 4032|
+-------------+-----+
only showing top 20 rows


In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
def title_length(title): return len(title or "")
length_udf = udf(title_length, IntegerType())
title_basics.withColumn("title_len", length_udf(title_basics.primaryTitle)).show(5)
# ❌ Python UDFs are slow and break optimizations.
# ✅ Cell 6: Use Built-in Functions Instead



+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+---------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|    genre_1|  genre_2|        ingested_at|title_len|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+---------+
|tt32444893|tvEpisode|McCraw vs. Herrin...|McCraw vs. Herrin...|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|       21|
|tt32444895|tvEpisode|          Ed Genesis|          Ed Genesis|      0|     2019|     \N|            \N|Documentary|     NULL|2025-06-22 23:11:21|       10|
|tt32444899|tvEpisode|    Aikens vs. Brown|    Aikens vs. Brown|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|       16|
| tt3244490|tvEpisode|From Marshes to M...|From Mars

In [None]:
from pyspark.sql.functions import length
title_basics.withColumn("title_len", length(title_basics.primaryTitle)).show(5)
# ✅ Built-in functions are optimized and faster.


+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+---------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|    genre_1|  genre_2|        ingested_at|title_len|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+---------+
|tt32444893|tvEpisode|McCraw vs. Herrin...|McCraw vs. Herrin...|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|       21|
|tt32444895|tvEpisode|          Ed Genesis|          Ed Genesis|      0|     2019|     \N|            \N|Documentary|     NULL|2025-06-22 23:11:21|       10|
|tt32444899|tvEpisode|    Aikens vs. Brown|    Aikens vs. Brown|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|       16|
| tt3244490|tvEpisode|From Marshes to M...|From Mars

In [10]:
# ✅ Cell 7: Use Broadcast Joins for Small Tables
from pyspark.sql.functions import broadcast
joined_df = title_basics.join(broadcast(title_ratings), "tconst", "left") \
    .select(title_basics.tconst, title_basics.primaryTitle, title_ratings.averageRating) \
    .orderBy("averageRating", ascending=False)
joined_df.show(5)
# ✅ Broadcast joins are efficient for small tables, reducing shuffle overhead.     



+----------+--------------------+-------------+
|    tconst|        primaryTitle|averageRating|
+----------+--------------------+-------------+
| tt8100968|        Episode #1.1|         10.0|
|tt14049104|Filmmaker Jhon Ja...|         10.0|
| tt0281732| Closed for Business|         10.0|
|tt14044668|Interview with Fi...|         10.0|
|tt26425618|       Episode #1.28|         10.0|
+----------+--------------------+-------------+
only showing top 5 rows


                                                                                

In [11]:

# ✅ Cell 7: Pandas UDF with Arrow
from pyspark.sql.functions import pandas_udf
import pandas as pd
@pandas_udf("int")
def pandas_title_len(s: pd.Series) -> pd.Series:
    return s.str.len()
title_basics.withColumn("title_len", pandas_title_len(title_basics.primaryTitle)).show(5)
#

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+---------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|    genre_1|  genre_2|        ingested_at|title_len|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+---------+
|tt32444893|tvEpisode|McCraw vs. Herrin...|McCraw vs. Herrin...|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|       21|
|tt32444895|tvEpisode|          Ed Genesis|          Ed Genesis|      0|     2019|     \N|            \N|Documentary|     NULL|2025-06-22 23:11:21|       10|
|tt32444899|tvEpisode|    Aikens vs. Brown|    Aikens vs. Brown|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|       16|
| tt3244490|tvEpisode|From Marshes to M...|From Mars

In [12]:
 #✅ Vectorized UDFs leverage Arrow for better performance.

# ✅ Cell 8: Use Arrow for toPandas conversion
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
title_basics.limit(5).toPandas()
# ✅ Arrow accelerates PySpark to Pandas conversion.


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genre_1,genre_2,ingested_at
0,tt32444893,tvEpisode,McCraw vs. Herrington,McCraw vs. Herrington,0,2023,\N,\N,Reality-TV,,2025-06-22 23:11:21
1,tt32444895,tvEpisode,Ed Genesis,Ed Genesis,0,2019,\N,\N,Documentary,,2025-06-22 23:11:21
2,tt32444899,tvEpisode,Aikens vs. Brown,Aikens vs. Brown,0,2023,\N,\N,Reality-TV,,2025-06-22 23:11:21
3,tt3244490,tvEpisode,From Marshes to Mountain Heights,From Marshes to Mountain Heights,0,2002,\N,21,Action,Adventure,2025-06-22 23:11:21
4,tt32444901,tvEpisode,Orive vs. Stark,Orive vs. Stark,0,2023,\N,\N,Reality-TV,,2025-06-22 23:11:21


In [13]:

# ✅ Cell 9: Bucketing vs Partitioning
bucketed = title_ratings.write.bucketBy(8, "tconst").sortBy("averageRating").saveAsTable("bucketed_ratings")
partitioned = title_ratings.write.partitionBy("averageRating").parquet("/tmp/partitioned_ratings")
# ✅ Bucketing helps with join performance; partitioning with scan performance.


                                                                                

In [14]:

# ✅ Cell 10: Broadcast Join
from pyspark.sql.functions import broadcast
joined_df = title_ratings.join(broadcast(title_basics), on="tconst")
joined_df.select("tconst", "averageRating", "primaryTitle").show(5)
# ✅ Broadcast join avoids shuffle for small tables.


25/06/23 00:48:54 WARN MemoryStore: Not enough space to cache broadcast_37 in memory! (computed 1760.0 MiB so far)
25/06/23 00:48:54 WARN BlockManager: Persisting block broadcast_37 to disk instead.


+---------+-------------+--------------------+
|   tconst|averageRating|        primaryTitle|
+---------+-------------+--------------------+
|tt0000001|          5.7|          Carmencita|
|tt0000002|          5.5|Le clown et ses c...|
|tt0000003|          6.5|        Poor Pierrot|
|tt0000004|          5.3|         Un bon bock|
|tt0000005|          6.2|    Blacksmith Scene|
+---------+-------------+--------------------+
only showing top 5 rows


25/06/23 00:49:01 WARN MemoryStore: Not enough space to cache broadcast_37 in memory! (computed 1760.0 MiB so far)


In [15]:

# ✅ Cell 11: Skew Handling (Salting)
from pyspark.sql.functions import monotonically_increasing_id, concat_ws
skewed = title_ratings.withColumn("salt", (monotonically_increasing_id() % 5))
# Use salt key to distribute skewed join keys
# ✅ Useful for reducing skew during joins.


In [16]:

# ✅ Cell 12: Checkpointing
spark.sparkContext.setCheckpointDir("/tmp/spark_checkpoints")
checkpointed = title_basics.filter("titleType = 'movie'").checkpoint()
checkpointed.show(2)
# ✅ Avoid recomputation by checkpointing intermediate RDDs.
 



+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+-------+-------------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|    genre_1|genre_2|        ingested_at|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+-------+-------------------+
|tt32444915|    movie|Lightning in a Bo...|Lightning in a Bo...|      0|       \N|     \N|            90|Documentary|   NULL|2025-06-22 23:11:21|
|tt32444916|    movie|     A Night of Fate|     A Night of Fate|      0|     2024|     \N|            54|      Drama|   NULL|2025-06-22 23:11:21|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+-------+-------------------+
only showing top 2 rows


                                                                                

In [None]:

# ✅ Cell 13: Speculative Execution
# Already enabled in config, useful if a task is slow due to node slowness
# Cannot demo directly but improves reliability in real clusters
# ✅ Helps avoid straggler tasks in large distributed jobs.
# ✅ Cell 14: Caching
title_basics.cache()
title_basics.count()  # Trigger cache
title_basics.show(5)
# ✅ Caching avoids recomputation, speeding up repeated access to the same data.




+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|    genre_1|  genre_2|        ingested_at|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+
|tt32444893|tvEpisode|McCraw vs. Herrin...|McCraw vs. Herrin...|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|
|tt32444895|tvEpisode|          Ed Genesis|          Ed Genesis|      0|     2019|     \N|            \N|Documentary|     NULL|2025-06-22 23:11:21|
|tt32444899|tvEpisode|    Aikens vs. Brown|    Aikens vs. Brown|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|
| tt3244490|tvEpisode|From Marshes to M...|From Marshes to M...|      0|     2002|     \N|            21|     Ac

                                                                                

In [None]:
# ✅ Cell 15: Data Skew Handling
from pyspark.sql.functions import col, count
skewed_counts = title_ratings.groupBy("averageRating").agg(count("*").alias("count")) \
    .orderBy(col("count").desc())
skewed_counts.show(5)
    

+-------------+-----+
|averageRating|count|
+-------------+-----+
|          7.2|59762|
|          7.4|57046|
|          7.6|56657|
|          7.8|55104|
|          7.0|54131|
+-------------+-----+
only showing top 5 rows


In [20]:
# ✅ Identify skewed data distributions to optimize joins and aggregations.
# ✅ Cell 16: Adaptive Query Execution
spark.conf.set("spark.sql.adaptive.enabled", "true")
# This is already set in the custom config
# Adaptive Query Execution optimizes query plans based on runtime statistics.
# ✅ Cell 17: Dynamic Partition Pruning
# Already enabled in config, useful for optimizing joins with partitioned tables
# Cannot demo directly but improves join performance with partitioned data
# ✅ Dynamic partition pruning optimizes joins by pruning unnecessary partitions at runtime.
# ✅ Cell 18: DataFrame API vs RDD API
# DataFrame API is optimized and provides better performance than RDD API
# Example: Using DataFrame API for filtering
filtered_df = title_basics.filter(col("titleType") == "movie").select("tconst", "primaryTitle")
filtered_df.show(5)
# ✅ DataFrame API leverages Catalyst optimizer for better performance.        

+----------+--------------------+
|    tconst|        primaryTitle|
+----------+--------------------+
|tt32444915|Lightning in a Bo...|
|tt32444916|     A Night of Fate|
|tt32444973|Mysterious Origin...|
|tt32445107|             The Ten|
| tt3244512|   Charlie's Country|
+----------+--------------------+
only showing top 5 rows
