### Handling NULL Values using dropna() and fillna() in Databricks

- dropna()
- fillna()
- isNull()

### Create Sample Data with NULL Values

In [0]:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
# Schema: all four columns are declared nullable so the sample rows may hold None.
schema = StructType([
    StructField("product_id", IntegerType(), nullable=True),
    StructField("product_name", StringType(), nullable=True),
    StructField("quantity", IntegerType(), nullable=True),
    StructField("price", DoubleType(), nullable=True),
])

# Sample rows, in schema column order; each None is a missing value.
data = [
    (101, "Laptop", 2, 750.50),      # complete row
    (102, "Mouse", None, 25.00),     # quantity missing
    (103, None, 5, None),            # product_name and price missing
    (None, "Keyboard", 3, 45.99),    # product_id missing
    (None, None, None, None),        # every column missing
]

# Build the DataFrame from the rows and schema, then render it in the notebook.
df = spark.createDataFrame(data, schema=schema)
df.display()


product_id,product_name,quantity,price
101.0,Laptop,2.0,750.5
102.0,Mouse,,25.0
103.0,,5.0,
,Keyboard,3.0,45.99
,,,


### Understanding dropna()

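**DataFrame.dropna(how='any', thresh=None, subset=None)**

- `how`: `'any'` drops a row if at least one value is NULL; `'all'` drops a row only if every value is NULL.
- `thresh`: keep only rows with at least `thresh` non-NULL values; when set, it overrides `how`.
- `subset`: optional list of column names to check; NULLs in other columns are ignored.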

In [0]:
# No arguments: the default how='any' removes every row containing at least one null.
rows_without_nulls = df.dropna()
rows_without_nulls.display()

product_id,product_name,quantity,price
101,Laptop,2,750.5


In [0]:
# Explicit how="any": same result as the default, a row is dropped if any column is null.
df.dropna(how="any").display()

product_id,product_name,quantity,price
101,Laptop,2,750.5


In [0]:
# how="all": a row is dropped only when every one of its columns is null.
df.dropna(how="all").display()

product_id,product_name,quantity,price
101.0,Laptop,2.0,750.5
102.0,Mouse,,25.0
103.0,,5.0,
,Keyboard,3.0,45.99


In [0]:
# thresh=1: keep rows that have at least one non-null value (only the all-null row goes).
df.dropna(thresh=1).display()

product_id,product_name,quantity,price
101.0,Laptop,2.0,750.5
102.0,Mouse,,25.0
103.0,,5.0,
,Keyboard,3.0,45.99


In [0]:
# thresh=3: keep rows that have at least three non-null values out of the four columns.
df.dropna(thresh=3).display()

product_id,product_name,quantity,price
101.0,Laptop,2.0,750.5
102.0,Mouse,,25.0
,Keyboard,3.0,45.99


In [0]:
# Only the listed columns are checked: a row is dropped when product_name or
# quantity is null, regardless of nulls in the other columns.
# The column name is spelled exactly as in the schema ("product_name", not
# "Product_name") so the call also works when spark.sql.caseSensitive is enabled.
df.dropna(subset=["product_name", "quantity"]).display()

product_id,product_name,quantity,price
101.0,Laptop,2,750.5
,Keyboard,3,45.99


### Filling NULL Values using fillna()

In [0]:
# Per-column replacement values for nulls; product_id is not listed, so its
# nulls are left untouched.
null_replacements = {
    "product_name": "UNKNOWN",
    "quantity": 0,
    "price": 0.0,
}
df.fillna(null_replacements).display()

product_id,product_name,quantity,price
101.0,Laptop,2,750.5
102.0,Mouse,0,25.0
103.0,UNKNOWN,5,0.0
,Keyboard,3,45.99
,UNKNOWN,0,0.0


### Filtering NULL Values

In [0]:
from pyspark.sql.functions import col
# Keep only the rows whose product_id is null.
product_id_is_missing = col("product_id").isNull()
df.filter(product_id_is_missing).display()

product_id,product_name,quantity,price
,Keyboard,3.0,45.99
,,,
