In [0]:
# Sample records with deliberately missing values (None becomes NULL in Spark)
# so the NULL-handling cells below have something to operate on.
columns = ["id", "name", "age", "salary"]
data = [
    (1, "Alice", 25, None),
    (2, "Bob", None, 3000),
    (3, None, 30, 4000),
    (4, "David", 40, None),
    (5, "Eve", None, None),
]

# Only column names are supplied; Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

df.show()

In [0]:
# Checking NULLs in a specific column: print one table per column, in the
# order name, age, salary, containing only the rows where that column is NULL.
for column_name in ("name", "age", "salary"):
    df.where(df[column_name].isNull()).show()

In [0]:
from pyspark.sql.functions import col, count, when
# Aliased so Python's builtin sum() stays available; the builtin cannot
# aggregate a Column and fails with "Column is not iterable".
from pyspark.sql.functions import sum as spark_sum

# Counting NULL values in all columns, shown two equivalent ways.

# 1) count() ignores NULLs, so produce a non-NULL marker only for NULL cells
#    (when() without otherwise() yields NULL for every non-matching row).
df.select([count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]).show()

# 2) Turn each cell into a 1/0 flag and add the flags up with Spark's sum.
df.select([spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df.columns]).show()

In [0]:
# Dropping rows based on NULLs.
# Default behaviour: remove every row that has a NULL in any column.
df.dropna().show()
# Same thing with the default spelled out explicitly.
df.dropna(how="any").show()
# how="all" is different: a row is removed only when every column is NULL.
df.na.drop(how="all").show()

In [0]:
# Keep only rows that have at least 2 non-NULL values; rows with fewer
# non-NULL values are removed.
df.na.drop(thresh=2).show()

In [0]:
from pyspark.sql.functions import coalesce, lit, mean

# Fill NULLs with a fixed default value chosen per column.
defaults = {"age": 98, "salary": 3500, "name": "MyName"}
df.na.fill(defaults).show()

# Fill NULL salaries with the column average. The aggregate produces a
# single row with a single value, which is pulled back to the driver.
mean_salary = df.agg(mean("salary")).first()[0]
df.na.fill(mean_salary, subset=["salary"]).show()

In [0]:
# Replacing NULLs with a literal or with a value derived from another column.
# coalesce() returns its first non-NULL argument.
name_or_unknown = coalesce(col("name"), lit("Unknown"))
# Falls back to age * 100; the result is still NULL when age is NULL as well.
salary_or_estimate = coalesce(col("salary"), col("age") * 100)

(
    df.withColumn("name", name_or_unknown)
    .withColumn("salary", salary_or_estimate)
    .show()
)

In [0]:
from pyspark.sql.functions import mean,coalesce,lit,last, first
from pyspark.sql.window import Window
import sys  # not needed by this cell; retained in case other cells still reference sys
# Forward Fill (Previous Non-NULL Value)

# Frame = every row from the start of the ordering up to and including the
# current row. Spark's named boundaries are used instead of a huge integer
# sentinel. NOTE: a window without partitionBy moves all rows into a single
# partition, which is fine for this small demo but costly on large data.
w = Window.orderBy(col("id")).rowsBetween(Window.unboundedPreceding, Window.currentRow)

# last(..., ignorenulls=True) returns the most recent non-NULL age in the
# frame. Because the frame includes the current row, a row that already has
# an age keeps it, so no extra coalesce() is required. Rows with no earlier
# non-NULL age stay NULL.
df.withColumn("age", last("age", ignorenulls=True).over(w)).show()

In [0]:
# Backward Fill (Next Non-NULL Value)

# Frame = the current row and every row after it in id order, expressed with
# Spark's named boundaries rather than an integer sentinel.
window_spec = Window.orderBy("id").rowsBetween(Window.currentRow, Window.unboundedFollowing)

# Apply Backward Fill: first(..., ignorenulls=True) picks the nearest non-NULL
# age at or after the current row; rows with no later non-NULL age stay NULL.
df_bfill = df.withColumn("age_bfill", first("age", ignorenulls=True).over(window_spec))

# Show the frame built above (the original referenced an undefined name here).
df_bfill.show()