In [19]:
# STEP 0: Setup & Spark
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), "../../")))

from config import settings
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim

spark = SparkSession.builder \
    .appName("Synthetic_ETL_Pipeline") \
    .config("spark.jars", settings.JDBC_PATH) \
    .getOrCreate()

print("✅ Spark session started.")

ConnectionRefusedError: [Errno 61] Connection refused

In [5]:
from config.spark_config import get_spark_session
from config import settings

# Step 3: Start Spark session
spark = get_spark_session(app_name="Read_Parquet_Tuning")

# Step 4: Define Parquet folder path
parquet_base = os.path.join(settings.BASE_DIR, "data", "parquet_data")

# Step 5: Read each file
parquet_files = {
    "title_basics": os.path.join(parquet_base, "title_basics.parquet"),
    "title_crew": os.path.join(parquet_base, "title_crew.parquet"),
    "title_episode": os.path.join(parquet_base, "title_episode.parquet"),
    "title_akas": os.path.join(parquet_base, "title_akas.parquet")
}

# Step 6: Show sample records
for name, path in parquet_files.items():
    print(f"📄 Reading {name}")
    df = spark.read.parquet(path)
    df.show(5)

📄 Reading title_basics
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|    genre_1|  genre_2|        ingested_at|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+---------+-------------------+
|tt32444893|tvEpisode|McCraw vs. Herrin...|McCraw vs. Herrin...|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|
|tt32444895|tvEpisode|          Ed Genesis|          Ed Genesis|      0|     2019|     \N|            \N|Documentary|     NULL|2025-06-22 23:11:21|
|tt32444899|tvEpisode|    Aikens vs. Brown|    Aikens vs. Brown|      0|     2023|     \N|            \N| Reality-TV|     NULL|2025-06-22 23:11:21|
| tt3244490|tvEpisode|From Marshes to M...|From Marshes to M...|      0|     2002|     \N

25/06/22 23:23:58 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+----------+---------+---------+-------------------+
|    tconst|directors|  writers|        ingested_at|
+----------+---------+---------+-------------------+
| tt2239602|nm0808310|       \N|2025-06-22 23:12:07|
| tt2239604|       \N|nm0317102|2025-06-22 23:12:07|
| tt2239606|       \N|       \N|2025-06-22 23:12:07|
|tt22396070|nm0011612|nm0243626|2025-06-22 23:12:07|
|tt22396074|nm0011612|nm0243626|2025-06-22 23:12:07|
+----------+---------+---------+-------------------+
only showing top 5 rows
📄 Reading title_episode
+----------+------------+------------+-------------+-------------------+
|    tconst|parentTconst|seasonNumber|episodeNumber|        ingested_at|
+----------+------------+------------+-------------+-------------------+
|tt36958755|   tt2190581|          \N|           \N|2025-06-22 23:12:27|
|tt36958759|   tt0312135|           1|            1|2025-06-22 23:12:27|
| tt3695876|   tt1287391|           1|           76|2025-06-22 23:12:27|
|tt36958761|   tt0185121|          \N

In [14]:

import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), "../../")))

from config import settings
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim
import os
from src.config.spark_config import get_spark_session
custom_tuning = {
    "spark.executor.memory": "6g",
    "spark.driver.memory": "4g",
    "spark.executor.cores": "4"
}
spark = get_spark_session(app_name="Read_Parquet_Tuning", custom_config=custom_tuning)




25/06/22 23:34:04 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [15]:
parquet_base = os.path.join(settings.BASE_DIR, "data", "parquet_data")

parquet_files = {
    "title_basics": os.path.join(parquet_base, "title_basics.parquet"),
    "title_crew": os.path.join(parquet_base, "title_crew.parquet"),
    "title_episode": os.path.join(parquet_base, "title_episode.parquet"),
    "title_akas": os.path.join(parquet_base, "title_akas.parquet")
}


In [18]:
from pyspark.sql.functions import col

for name, path in parquet_files.items():
    print(f"📄 Reading {name}")

    # Pushdown predicate
    df = spark.read \
        .option("spark.sql.parquet.filterPushdown", "true") \
        .parquet(path)

    # Filter as early as possible
    if "titleType" in df.columns:
        df = df.filter(col("titleType") == "movie")

    # Print schema and show top rows
    df.printSchema()
    df.show(5)

    # Example: count to test lazy eval and plan
    print("🔢 Record count:", df.count())

    # Performance plan
    df.explain(mode="formatted")

📄 Reading title_basics


ConnectionRefusedError: [Errno 61] Connection refused

In [None]:

df = spark.read.parquet(parquet_path)
print("✅ Parquet loaded")

# STEP 3: Basic Exploration
df.printSchema()
df.show(5)

# STEP 4: Cache and Explain
df.cache()
df.explain()

# STEP 5: Example Tuning Ops
print("→ Original Partitions:", df.rdd.getNumPartitions())
df_repart = df.repartition(8)
print("→ After Repartition:", df_repart.rdd.getNumPartitions())

# Persist if reused often
df_repart.persist()

# Sample query
df_repart.select("primaryTitle", "genre_1").filter("genre_1 = 'Comedy'").show(5)