<a href="https://colab.research.google.com/github/gvikas79/Spark-Tutorials/blob/main/Working_with_Null_values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Spark session: the entry point used by every cell below
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session when there is one and
# only builds a new one (app name "Spark Introduction", master "local[*]")
# otherwise.
spark = SparkSession.builder.appName("Spark Introduction").master("local[*]").getOrCreate()

In [None]:
# Load the Netflix titles CSV: the first row is the header, column types are
# inferred from the data, and the parser runs in PERMISSIVE mode.
data = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("mode", "PERMISSIVE")
    .load("netflix_titles.csv")
)

In [None]:
# Print a preview of the loaded DataFrame.
data.show()

In [None]:
#Count null / NaN values per column

# Import Spark's sum under an alias so the Python builtin sum() is not
# shadowed for the rest of the notebook.
from pyspark.sql.functions import col, isnan, when
from pyspark.sql.functions import sum as spark_sum

# For each column, emit 1 when the value is null or NaN and 0 otherwise, then
# add the flags up; the output is a single row holding one count per column.
# NOTE(review): isnan() is applied to every column, which relies on Spark
# being able to implicitly cast each column to a floating-point type --
# confirm this holds for the inferred schema.
data.select([
    spark_sum(when(col(c).isNull() | isnan(col(c)), 1).otherwise(0)).alias(c)
    for c in data.columns
]).show()


In [None]:
#Filter rows having any null or NaN
from functools import reduce

from pyspark.sql.functions import col, isnan

# Build one boolean condition per column, then OR them together so a row
# matches as soon as any of its columns is null or NaN. The conditions have
# to be combined as Column expressions with functools.reduce: a DataFrame
# has no reduce() method.
any_missing = reduce(
    lambda x, y: x | y,
    [col(c).isNull() | isnan(col(c)) for c in data.columns],
)

data.filter(any_missing).show()
#Shows only rows where at least one column has null/NaN.

In [None]:
# Keep only the rows whose "director" value is null or NaN
data.filter(
    col("director").isNull() | isnan(col("director"))
).show()

In [None]:
#Get total number of missing values in the whole DataFrame

# Import Spark's sum under an alias so the Python builtin sum() is not
# shadowed for the rest of the notebook.
from pyspark.sql.functions import col, isnan, when
from pyspark.sql.functions import sum as spark_sum

# The select() yields a single row with one null/NaN count per column;
# flattening that row and summing it gives the grand total.
missing_count = data.select([
    spark_sum(when(col(c).isNull() | isnan(col(c)), 1).otherwise(0))
    for c in data.columns
]).rdd.flatMap(lambda x: x).sum()

print(f"Total missing values: {missing_count}")


In [None]:
# Replace all nulls with a specific value (e.g., 0)

# The remaining examples operate on a generic DataFrame named `df` with
# "name", "age", "salary" and "city" columns. Build a small sample here so
# they can run; without it `df` is undefined and this cell raises NameError.
df = spark.createDataFrame(
    [
        ("Alice", 34.0, 5200.0, "Paris"),
        ("Bob", None, 4100.0, None),
        (None, float("nan"), None, "Berlin"),
    ],
    schema="name string, age double, salary double, city string",
)

# A numeric fill value is applied to the numeric columns.
df_filled = df.fillna(0)


In [None]:
# Per-column fill values: a string default for "name", a numeric one for "age"
df_filled = df.fillna(
    {
        "name": "Unknown",
        "age": 0,
    }
)


In [None]:
# Fill nulls in one column only by passing a single-entry dict
df_filled = df.fillna(
    {"age": 0}
)


In [None]:
# na.fill() is an alias for fillna(). Each call below starts again from `df`,
# so only the result of the last call remains in df_filled.

# String value: replace nulls in all string columns
df_filled = df.na.fill("missing")

# Numeric value: replace nulls in all numeric columns
df_filled = df.na.fill(0)

# Dict: one replacement value per named column
df_filled = df.na.fill({"city": "Unknown", "salary": 0})


In [None]:
#Replace null values with a computed value
#Sometimes you want to replace null with a mean, median, or mode:

from pyspark.sql.functions import col, mean

# Example: replace nulls in "salary" with mean salary
# The aggregate returns a single row with a single value; pull it out.
mean_val = df.select(mean(col("salary"))).collect()[0][0]

# mean() yields None when the column has no non-null values, and na.fill()
# does not accept None as a replacement, so leave df unchanged in that case.
if mean_val is None:
    df_filled = df
else:
    df_filled = df.na.fill({"salary": mean_val})


In [None]:
# Replace nulls with an explicit when(...).otherwise(...) expression

from pyspark.sql.functions import col, when

# 0 where "age" is null, the existing value everywhere else
age_or_zero = when(col("age").isNull(), 0).otherwise(col("age"))

df_filled = df.withColumn("age", age_or_zero)


In [None]:
# Replace NaN (and null) values with a constant

from pyspark.sql.functions import when, col, isnan

# A value counts as missing when it is either null or NaN.
age_missing = col("age").isNull() | isnan(col("age"))

# Missing ages become 0; every other value is kept as-is.
df_filled = df.withColumn("age", when(age_missing, 0).otherwise(col("age")))
#This replaces both null and NaN in the age column with 0.

In [None]:
# Apply the same null/NaN -> 0 replacement to several columns

from pyspark.sql.functions import when, col, isnan

# Each pass rewrites one column and rebinds `df` to the result.
for column_name in ("age", "salary"):
    missing = col(column_name).isNull() | isnan(col(column_name))
    df = df.withColumn(column_name, when(missing, 0).otherwise(col(column_name)))


In [None]:
# Combine na.fill() (nulls) with na.replace() (NaN) in one chained expression:
#   1. nulls in "age" and "salary" become 0
#   2. NaN values become 0
df = df.na.fill({"age": 0, "salary": 0}).na.replace(float("nan"), 0)


In [None]:
#Reusable helper function for any DataFrame

from pyspark.sql.functions import when, col, isnan

def replace_null_nan(df, replacements: dict):
    """Replace null and NaN values in the given columns.

    Args:
        df: DataFrame to clean.
        replacements: Mapping of column name to the value that should
            replace null/NaN entries in that column.

    Returns:
        A new DataFrame in which each listed column has been rewritten via
        ``withColumn``; columns that are not listed are left as they were.
    """
    # Local import keeps the helper self-contained within this cell.
    from pyspark.sql.types import NumericType, StringType

    column_types = {field.name: field.dataType for field in df.schema.fields}

    for c, val in replacements.items():
        is_missing = col(c).isNull()

        # isnan() needs a column Spark can treat as floating point. Skip it
        # for column types known not to qualify (e.g. date or timestamp),
        # where it would fail analysis; such columns cannot hold NaN anyway.
        # Numeric and string columns, and names that could not be looked up
        # in the schema, keep the original null-or-NaN check.
        dtype = column_types.get(c)
        if dtype is None or isinstance(dtype, (NumericType, StringType)):
            is_missing = is_missing | isnan(col(c))

        df = df.withColumn(c, when(is_missing, val).otherwise(col(c)))
    return df

# Example usage: one replacement value per column to clean
df_filled = replace_null_nan(
    df,
    {
        "age": 0,
        "salary": 0,
        "city": "Unknown",
    },
)
