# Handle Null Values
- Drop Columns
- Drop rows
- Various parameters in Dropping functionalities
- Handling missing values by mean


In [88]:
from pyspark.sql import SparkSession
# Create the SparkSession for this notebook (getOrCreate() reuses an existing session if there is one).
spark = (
    SparkSession.builder
    .appName("Handle Null Values")
    .getOrCreate()
)

# Bare expression at the end of the cell so the session object is displayed.
spark

In [90]:
# Load the tab-separated file: the header row supplies column names and Spark infers the column types.
df = (
    spark.read
    .option('inferSchema', 'true')
    .option('sep', '\t')
    .option('header', 'true')
    .csv('data/testv.csv')
)
# df.show(10, truncate=False)  # Preview the first 10 rows without truncating long values

# df.write.csv('./data1.csv', header=True, mode='overwrite')  # Save the DataFrame as CSV with a header row, overwriting existing output

In [91]:
from pyspark.sql.functions import col, lit, concat

# Keep four source columns, then derive three more:
#   PHONE_NUMBER - the literal '230' concatenated with MSISDN
#   GEN_NID      - NID cast to string
#   new          - NID cast to integer
new_df = (
    df.select('MSISDN', 'NID', 'EMAIL', 'POSTPAID_TARIFF')
    .withColumn('PHONE_NUMBER', concat(lit('230'), col('MSISDN')))
    .withColumn('GEN_NID', col('NID').cast('string'))
    .withColumn('new', col('NID').cast('integer'))
)

new_df.printSchema()  # Display each column's name, data type and nullability
# new_df.show(5, truncate=False)

root
 |-- MSISDN: integer (nullable = true)
 |-- NID: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- POSTPAID_TARIFF: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- GEN_NID: string (nullable = true)
 |-- new: integer (nullable = true)



In [92]:
# Remove the derived 'GEN_NID' column; the result is bound to dropped_df, which the cells below use.
dropped_df = new_df.drop('GEN_NID')
# dropped_df.show(5, truncate=False)  # Show the first 5 rows of the DataFrame after dropping the 'GEN_NID' column

-------------------------------------------------------------
# 📌 Handling NULL (EMPTY) Values in PySpark
-------------------------------------------------------------

Null (or None) values represent missing or undefined data in a DataFrame.
They can lead to incorrect results during analysis or computation.
Best practice is to clean them by removing, replacing, or filling them with values.

In PySpark, you can handle nulls via:
1. DataFrame methods: df.dropna(), df.fillna(), df.replace(); each one is an alias of the matching df.na method (note: df.drop() removes columns, not rows with nulls)
2. DataFrame.na submodule: df.na.drop(), df.na.fill(), df.na.replace()

**Note**: Both styles take the same arguments and return the same result, because each DataFrame method and its df.na counterpart are aliases of each other; use whichever you prefer.
For consistency, try to stick to the df.na.method() style.


### 🔸 Drop Rows with Nulls: using df.dropna() and df.na.drop()
---

In [95]:
# ✅ Using df.na.drop()
# dropped_df.na.drop().show()                   # Drop rows containing any null; same as df.na.drop(how='any').show()
# dropped_df.na.drop(how="all").show()          # Drop rows only when every column is null
# dropped_df.na.drop(subset=["email"]).show()   # Drop rows where the 'email' column is null
dropped_df.na.drop(thresh=2).show()             # Keep rows with at least 2 non-null values; rows with fewer than 2 non-null values are removed



# ✅ Using df.dropna() (direct): called exactly the same way as df.na.drop()
# dropped_df.dropna().show()
# dropped_df.dropna(how="all").show()
# dropped_df.dropna(subset=["email"]).show()
# dropped_df.dropna(thresh=2).show()


+--------+--------------+--------------------+---------------+------------+----+
|  MSISDN|           NID|               EMAIL|POSTPAID_TARIFF|PHONE_NUMBER| new|
+--------+--------------+--------------------+---------------+------------+----+
|54924133|J0711874907942|JEANPOMPIER0517@G...|           NULL| 23054924133|NULL|
|54846497|B160985220041C|                NULL|           NULL| 23054846497|NULL|
|57369115|      S0987933|                NULL|           NULL| 23057369115|NULL|
|57113437|D090290300772B|                NULL|           NULL| 23057113437|NULL|
|58468805|N110561330075F|                NULL|           NULL| 23058468805|NULL|
|57228141|     A03228083|                NULL|           NULL| 23057228141|NULL|
|57119226|P0502893808515|                NULL|           NULL| 23057119226|NULL|
|57118074|M1503654401680|                NULL|           NULL| 23057118074|NULL|
|54881304|J080295290173A|                NULL|           NULL| 23054881304|NULL|
|57320678|L100783300094F|   

### 🔸 Fill Null Values
---

Replace NULL values with a static value; make sure the value matches the column's data type.
This method cannot fill with a value derived from a computation or expression; for that, use withColumn() with when().otherwise() or coalesce().


In [204]:

# ✅ Using df.na.fill()
# dropped_df.na.fill(0).show()             # Fill nulls in all numeric columns with the integer 0
# dropped_df.na.fill("Unknown").show()     # Fill nulls in all string columns with the string "Unknown"


# Fill nulls per column: each dict key is a column name and each dict value is that column's replacement.
# Keys are spelled exactly as in the schema and each value matches its column's type:
# 'new' is an integer column, so it gets the int 0 rather than the float 0.0.
dropped_df.na.fill({"EMAIL": "test@gmail.com", "POSTPAID_TARIFF": "EMPTY", "new": 0}).show()


# ✅ Using df.fillna(): same as df.na.fill() above
# dropped_df.fillna(0).show()
# dropped_df.fillna("Unknown").show()
# dropped_df.fillna({"EMAIL": "test@gmail.com", "POSTPAID_TARIFF": "EMPTY", "new": 0}).show()


+--------+--------------+--------------------+---------------+------------+---+
|  MSISDN|           NID|               EMAIL|POSTPAID_TARIFF|PHONE_NUMBER|new|
+--------+--------------+--------------------+---------------+------------+---+
|54924133|J0711874907942|JEANPOMPIER0517@G...|          EMPTY| 23054924133|  0|
|54846497|B160985220041C|      test@gmail.com|          EMPTY| 23054846497|  0|
|57369115|      S0987933|      test@gmail.com|          EMPTY| 23057369115|  0|
|57113437|D090290300772B|      test@gmail.com|          EMPTY| 23057113437|  0|
|58468805|N110561330075F|      test@gmail.com|          EMPTY| 23058468805|  0|
|57228141|     A03228083|      test@gmail.com|          EMPTY| 23057228141|  0|
|57119226|P0502893808515|      test@gmail.com|          EMPTY| 23057119226|  0|
|57118074|M1503654401680|      test@gmail.com|          EMPTY| 23057118074|  0|
|54881304|J080295290173A|      test@gmail.com|          EMPTY| 23054881304|  0|
|57320678|L100783300094F|      test@gmai

### 🔸 Replace Specific Values
---

Used to replace explicit (non-null) values across the DataFrame;
that is, you can replace 'Wisdom' with something else, or 2302 with 0 or -00 ...
- It is specifically used for changing a value to another value or to Null.
- It is not used for changing NULL to a different value (even one of the same type). Use df.na.fill() or df.fillna() instead.

In [289]:
# ✅ Using df.na.replace()
from pyspark.sql.functions import col

# Basic usage: replace a single value with None
# dropped_df.na.replace("B160985220041C", None)

# Paired usage: 'to_replace' and 'value' must have the same length.
#   to_replace[i] is swapped for value[i]; only the columns named in 'subset'
#   (NID and PHONE_NUMBER here) are searched.
dropped_df.na.replace(
    to_replace=["S0987933", "23054924133"],
    value=["Value1", "Value2"],
    subset=["NID", "PHONE_NUMBER"],
).show(10, vertical=False)
# .collect()

# ✅ Using df.replace()
# dropped_df.replace("B160985220041C", None)
# dropped_df.replace(to_replace=["S0987933", "23054924133"], value=["Value1", "Value2"], subset=["NID", "PHONE_NUMBER"]).show()

+--------+--------------+--------------------+---------------+------------+----+
|  MSISDN|           NID|               EMAIL|POSTPAID_TARIFF|PHONE_NUMBER| new|
+--------+--------------+--------------------+---------------+------------+----+
|54924133|J0711874907942|JEANPOMPIER0517@G...|           NULL|      Value2|NULL|
|54846497|B160985220041C|                NULL|           NULL| 23054846497|NULL|
|57369115|        Value1|                NULL|           NULL| 23057369115|NULL|
|57113437|D090290300772B|                NULL|           NULL| 23057113437|NULL|
|58468805|N110561330075F|                NULL|           NULL| 23058468805|NULL|
|57228141|     A03228083|                NULL|           NULL| 23057228141|NULL|
|57119226|P0502893808515|                NULL|           NULL| 23057119226|NULL|
|57118074|M1503654401680|                NULL|           NULL| 23057118074|NULL|
|54881304|J080295290173A|                NULL|           NULL| 23054881304|NULL|
|57320678|L100783300094F|   

### 🔸 Using SQL Functions for Conditional Fill
----


In [281]:
from pyspark.sql.functions import when, col, coalesce

# Conditional fill: substitute a default only where the column is null, otherwise keep the existing value.
(
    dropped_df
    .withColumn(
        "EMAIL",
        when(col("EMAIL").isNull(), "sample@test.com").otherwise(col("EMAIL")),
    )
    .withColumn(
        "POSTPAID_TARIFF",
        when(col("POSTPAID_TARIFF").isNull(), "NONE").otherwise(col("POSTPAID_TARIFF")),
    )
    .show()
)


# Fallback across columns: EMAIL takes PHONE_NUMBER's value wherever EMAIL is null.
#    coalesce() returns the first non-null value. coalesce(NULL, NULL, 1, 2) => 1

dropped_df.withColumn("EMAIL", coalesce(col("EMAIL"), col("PHONE_NUMBER"))).show(3)

# ----------------------------
# ✅ Quick Summary
# ----------------------------

# Method              | Description
#---------------------|--------------------------------------------
# df.na.drop()        | Drop rows with nulls
# df.na.fill()        | Fill nulls with value or dict
# df.na.replace()     | Replace specific non-null values (the replacement may be None)
# df.dropna()         | Shorthand for df.na.drop()
# df.fillna()         | Shorthand for df.na.fill()
# df.replace()        | Shorthand for df.na.replace()
# when().otherwise() / coalesce()     | Conditional null handling

# # ----------------------------
# # 🧪 Tips
# # ----------------------------

# from pyspark.sql.functions import count, when

# - Always check schema: df.printSchema()
# - Profile nulls per column: df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
# - Use coalesce() to handle priority fallback among columns

# # Check rows where EMAIL is null
# dropped_df.filter("email IS NULL").show()
# dropped_df.select(col("EMAIL")).filter("EMAIL is NULL").count()



+--------+--------------+--------------------+---------------+------------+----+
|  MSISDN|           NID|               EMAIL|POSTPAID_TARIFF|PHONE_NUMBER| new|
+--------+--------------+--------------------+---------------+------------+----+
|54924133|J0711874907942|JEANPOMPIER0517@G...|           NONE| 23054924133|NULL|
|54846497|B160985220041C|     sample@test.com|           NONE| 23054846497|NULL|
|57369115|      S0987933|     sample@test.com|           NONE| 23057369115|NULL|
|57113437|D090290300772B|     sample@test.com|           NONE| 23057113437|NULL|
|58468805|N110561330075F|     sample@test.com|           NONE| 23058468805|NULL|
|57228141|     A03228083|     sample@test.com|           NONE| 23057228141|NULL|
|57119226|P0502893808515|     sample@test.com|           NONE| 23057119226|NULL|
|57118074|M1503654401680|     sample@test.com|           NONE| 23057118074|NULL|
|54881304|J080295290173A|     sample@test.com|           NONE| 23054881304|NULL|
|57320678|L100783300094F|   

17