In [1]:
from pyspark.sql import SparkSession

In [13]:
from pyspark.sql.functions import col, concat, lit, broadcast, when

In [2]:
# Entry point for all DataFrame work in this notebook. getOrCreate() hands back
# an already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("PySpark Optimization Techniques")
    .getOrCreate()
)

In [3]:
# Tiny in-memory sample: one (name, age, gender) tuple per person.
data = [
    ("A", 25, "F"),
    ("B", 30, "M"),
    ("C", 28, "F"),
    ("D", 35, "M"),
]
columns = ["name", "age", "gender"]

# Only column names are supplied; Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

In [4]:
# Mark df for caching at Spark's default storage level. The call returns the
# DataFrame itself (hence the cell output); caching is lazy, so nothing is
# actually stored until an action is run on df.
df.cache()

DataFrame[name: string, age: bigint, gender: string]

In [5]:
from pyspark.storagelevel import StorageLevel

# df was already marked for caching by the earlier df.cache() call. Spark
# ignores a second persist() on data that is already cached (it only logs
# "Asked to cache already cached data"), so the storage level requested here
# would silently never take effect. Drop the existing cache registration first;
# unpersist() is harmless if df happens not to be cached.
df.unpersist()
df.persist(StorageLevel.MEMORY_AND_DISK)

DataFrame[name: string, age: bigint, gender: string]

In [6]:
# Run actions on df so the lazily-registered cache is actually populated.
# show() prints the rows; count() is the cell's last expression, so its value
# is displayed as the cell result.
df.show()
df.count()

+----+---+------+
|name|age|gender|
+----+---+------+
|   A| 25|     F|
|   B| 30|     M|
|   C| 28|     F|
|   D| 35|     M|
+----+---+------+



4

In [7]:
# Release df's cached data now that the caching demo is done. df itself stays
# usable (later cells keep reading it); it is just no longer served from cache.
df.unpersist()

DataFrame[name: string, age: bigint, gender: string]

In [8]:
# Baseline for the repartition/coalesce comparison: how many partitions back df
# as created. The number depends on the local Spark configuration.
print(f"Initial partitions: {df.rdd.getNumPartitions()}")

Initial partitions: 2


In [9]:
# repartition() redistributes the rows with a full shuffle and can raise or
# lower the partition count; here it goes up to 4.
df_repartitioned = df.repartition(numPartitions=4)
print(f"After repartition: {df_repartitioned.rdd.getNumPartitions()}")

After repartition: 4


In [10]:
# coalesce() merges existing partitions without a full shuffle, so it is the
# cheaper choice when only reducing the partition count (4 -> 2 here).
df_coalesced = df_repartitioned.coalesce(numPartitions=2)
print(f"After coalesce: {df_coalesced.rdd.getNumPartitions()}")

After coalesce: 2


In [11]:
# Display the repartitioned rows. repartition() shuffles the data, so the rows
# come back in a different order than in the original df.show() output.
df_repartitioned.show()

+----+---+------+
|name|age|gender|
+----+---+------+
|   B| 30|     M|
|   D| 35|     M|
|   A| 25|     F|
|   C| 28|     F|
+----+---+------+



In [16]:
# Keep only people older than 25, then project down to the two columns needed.
filtered_df = (
    df
    .filter(col("age") > 25)
    .select("name", "age")
)
filtered_df.show()

+----+---+
|name|age|
+----+---+
|   B| 30|
|   C| 28|
|   D| 35|
+----+---+



In [17]:
# Only a query plan is built here: filter, groupBy and the grouped count() are
# all transformations, so no Spark job runs until an action is called on lazy_df.
lazy_df = (
    df
    .filter(col("age") > 20)
    .groupBy("gender")
    .count()
)